In [1]:
import numpy as np
import sys
# Raise NumPy's summarisation threshold to the largest possible value so that
# arrays are always printed in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Seaborn is used in this cell only as a source of sample datasets.
import seaborn as sns

# Load seaborn's sample "tips" dataset.
# NOTE: load_dataset() fetches the file from seaborn's online data repository
# (and caches it locally), so the first run needs network access.
#
# Columns (see the preview below): total_bill, tip, sex, smoker, day, time, size.
#
# Later cells use this dataset to demonstrate Support Vector Regression (SVR):
# predicting a continuous value (total_bill) from the other, independent features.
tips = sns.load_dataset('tips')

# Show the first 5 rows to get a feel for the structure and columns.
display(tips.head())


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Inspect dataset structure, one categorical distribution, and missing values.

# 1) Overall dataset information
#    - number of rows and columns
#    - column names and dtypes
#    - non-null count per column (a first look at missing values)
print("üîç Dataset Info:\n" + "-"*60)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
tips.info()


# 2) Distribution of the categorical "sex" column
#    (shows whether Male / Female are balanced in the data).
print("\nüë• Gender Distribution (sex column):\n" + "-"*60)
print(tips['sex'].value_counts(), "\n")


# 3) Missing values per column
#    .isnull().sum() counts the null entries in each column.
print("‚ùó Missing Values per Column:\n" + "-"*60)
print(tips.isnull().sum())

üîç Dataset Info:
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
None

üë• Gender Distribution (sex column):
------------------------------------------------------------
sex
Male      157
Female     87
Name: count, dtype: int64 

‚ùó Missing Values per Column:
------------------------------------------------------------
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype

In [4]:
# Collect the columns stored with pandas' 'category' dtype.
# (Only 'category' is requested here; plain object/string columns would not
# be picked up by this filter.)
categorical_cols = tips.select_dtypes(include=['category']).columns

print("üîé Categorical Column Value Counts")
print("=" * 60)

# For every categorical column: compute the frequency table first, then emit
# the header, a separator rule and the table, one per line.
for col in categorical_cols:
    counts = tips[col].value_counts()
    print(f"\nüìå Column: {col}", "-" * 60, counts, sep="\n")


üîé Categorical Column Value Counts

üìå Column: sex
------------------------------------------------------------
sex
Male      157
Female     87
Name: count, dtype: int64

üìå Column: smoker
------------------------------------------------------------
smoker
No     151
Yes     93
Name: count, dtype: int64

üìå Column: day
------------------------------------------------------------
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

üìå Column: time
------------------------------------------------------------
time
Dinner    176
Lunch      68
Name: count, dtype: int64


In [5]:
# List the column labels; the feature / target names used below come from here.
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [6]:
## Independent and dependent features

# INPUT features (X) used to predict the total restaurant bill:
#   tip     -> numeric      (tip amount)
#   sex     -> categorical  (gender of the customer)
#   smoker  -> categorical  (smoker / non-smoker)
#   day     -> categorical  (day of the week)
#   time    -> categorical  (Lunch / Dinner)
#   size    -> numeric      (number of people at the table)
#
# The categorical columns are encoded to numbers in later cells, because
# Support Vector Regression (SVR) cannot work with string labels.
X = tips[[
    'tip',
    'sex',
    'smoker',
    'day',
    'time',
    'size',
]]


# TARGET variable (y): total_bill, a continuous value, so this is a
# regression problem.
y = tips['total_bill']


In [7]:
# Split the data into a training set and a held-out test set.
from sklearn.model_selection import train_test_split

# The training part is used to fit the model; the test part is kept aside to
# evaluate the model on observations it has not seen.
#
#   X, y            -> features (tip, sex, smoker, day, time, size) and target (total_bill)
#   test_size=0.25  -> 25% of the rows go to the test set, 75% to training
#   random_state=10 -> fixes the shuffle so every run yields the same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [8]:
# Feature encoding, part 1: integer-encode the binary categorical columns.
from sklearn.preprocessing import LabelEncoder

# "sex", "smoker" and "time" are encoded with LabelEncoder, which maps the
# sorted class labels to 0..n-1 (e.g. Female -> 0, Male -> 1).
# "day" is deliberately left untouched here: it has no natural order
# (Thur, Fri, Sat, Sun), so it is one-hot encoded in a later cell to avoid
# implying a ranking between days.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder is the feature-side equivalent. LabelEncoder is kept because
# later cells reuse these three encoder objects by name.

# One encoder per column; later cells call .transform() on these same objects.
sex_label = LabelEncoder()
smoker_label = LabelEncoder()
time_label = LabelEncoder()

# X_train / X_test are derived from a column slice of `tips`. Take explicit
# copies before assigning columns so the writes are unambiguous and do not
# trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# IMPORTANT: fit each encoder on the TRAINING data only, then reuse the fitted
# encoder to transform the TEST data. This avoids leaking test information and
# keeps the label -> integer mapping identical for both sets.
for column, encoder in (
    ('sex', sex_label),
    ('smoker', smoker_label),
    ('time', time_label),
):
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

# Show the partially encoded training data ("day" is still a label column).
display(X_train.head())


Unnamed: 0,tip,sex,smoker,day,time,size
58,1.76,1,1,Sat,0,2
1,1.66,1,0,Sun,0,3
2,3.5,1,0,Sun,0,3
68,2.01,1,0,Sat,0,2
184,3.0,1,1,Sun,0,2


In [9]:
## One-Hot Encoding using ColumnTransformer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# ColumnTransformer applies a transformer to selected columns only and, with
# remainder='passthrough', keeps every other column unchanged.
#
# Here only the 'day' column is one-hot encoded. Its categories have no natural
# order, so one-hot encoding avoids implying a ranking between days.
#
# OneHotEncoder(drop='first') drops the FIRST category to avoid the dummy
# variable trap. Categories are learned from the training data in sorted
# order, so (assuming all four days occur in the training split) the
# encoding is:
#       Fri  -> [0,0,0]      (dropped / baseline)
#       Sat  -> [1,0,0]
#       Sun  -> [0,1,0]
#       Thur -> [0,0,1]
#
# The column is selected by NAME rather than by position. With a positional
# index, re-running this cell on the already-transformed array would silently
# one-hot encode whatever now sits at that position; selecting by name makes
# such a re-run fail loudly instead.
#
# Output layout: the one-hot columns come first, followed by the passthrough
# columns in their original order (tip, sex, smoker, time, size).
column_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), ['day'])
    ],
    remainder='passthrough'
)


# IMPORTANT: fit on the TRAINING data only, so the category list is learned
# from the training set, then apply the same fitted mapping to the TEST data.
# NOTE: X_train / X_test are rebound from DataFrames to transformed arrays
# here, so this cell must be run exactly once per kernel session.
X_train = column_transformer.fit_transform(X_train)   # learn + transform
X_test  = column_transformer.transform(X_test)         # only transform


# Now X_train and X_test contain:
#   - the new one-hot encoded columns for "day"
#   - AND all original numeric + label-encoded columns (passed through unchanged)


In [10]:
## Applying Support Vector Regression (SVR)

from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Steps 1 + 2 -- build the model and train it in one go.
# SVR predicts continuous values rather than class labels. All hyperparameters
# are left at their defaults (kernel='rbf'); C, epsilon and gamma are tuned
# further down. fit() returns the fitted estimator, so it can be bound directly.
#   X_train -> encoded features (day one-hot, tip, sex, smoker, time, size)
#   y_train -> target (total_bill)
# NOTE(review): the features are not scaled; scikit-learn recommends scaling
# inputs for SVMs -- consider adding a StandardScaler step.
svr = SVR().fit(X_train, y_train)


# Step 3 -- predict total_bill for the held-out observations in X_test.
y_pred = svr.predict(X_test)


In [11]:
# Regression evaluation metrics on the held-out test set.

# MAE  -- mean of the absolute errors: "on average, how far off are we?"
mae = mean_absolute_error(y_test, y_pred)

# MSE  -- mean of the squared errors; large misses are penalised more heavily.
mse = mean_squared_error(y_test, y_pred)

# RMSE -- square root of MSE, back on the same scale as the target.
rmse = np.sqrt(mse)

# R^2  -- share of the target's variance explained by the model:
#   1.0 -> perfect fit
#   0.0 -> no better than always predicting the mean
#   < 0 -> worse than that baseline
r2 = r2_score(y_test, y_pred)

# Emit the whole report with a single print call, one entry per line.
print(
    "üìä SVR Regression Metrics",
    "=" * 40,
    f"üîπ Mean Absolute Error (MAE): {mae:.4f}",
    f"üîπ Mean Squared Error (MSE): {mse:.4f}",
    f"üîπ Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"üîπ R¬≤ Score: {r2:.4f}",
    sep="\n",
)


üìä SVR Regression Metrics
üîπ Mean Absolute Error (MAE): 4.1486
üîπ Mean Squared Error (MSE): 45.6921
üîπ Root Mean Squared Error (RMSE): 6.7596
üîπ R¬≤ Score: 0.4603


In [12]:
import pandas as pd 

# Helper to predict the total bill for NEW / UNSEEN customer data.
# This function takes raw input values (the same features used during training),
# applies the SAME preprocessing steps (label encoding + one-hot encoding),
# and then uses the trained SVR model to predict the total_bill.

def predict_bill(tip, sex, smoker, day, time, size):
    """Predict ``total_bill`` for a single new customer record.

    The raw inputs are pushed through the already-fitted preprocessing objects
    from earlier cells (``sex_label``, ``smoker_label``, ``time_label`` and
    ``column_transformer``) and then handed to the trained ``svr`` model.
    Nothing is re-fitted here: only ``transform`` / ``predict`` are called.

    Parameters
    ----------
    tip : number
        Tip amount.
    sex, smoker, day, time : str
        Raw category labels, e.g. ``'Male'``, ``'No'``, ``'Sun'``, ``'Dinner'``.
        Labels the fitted encoders have not seen are rejected by the encoders
        (scikit-learn raises ``ValueError`` for unknown labels by default).
    size : int
        Number of people at the table.

    Returns
    -------
    float
        The predicted total bill as a plain Python float.
    """
    # Build a one-row frame with the same column layout used for training.
    columns = ['tip', 'sex', 'smoker', 'day', 'time', 'size']
    row = [tip, sex, smoker, day, time, size]
    features = pd.DataFrame([row], columns=columns)

    # Integer-encode the binary categoricals with the fitted encoders
    # (transform only -- never fit again on new data).
    for column, encoder in (
        ('sex', sex_label),
        ('smoker', smoker_label),
        ('time', time_label),
    ):
        features[column] = encoder.transform(features[column])

    # One-hot encode 'day' with the fitted ColumnTransformer so the feature
    # layout matches what the model was trained on.
    encoded = column_transformer.transform(features)

    # predict() returns a length-1 array; unwrap it into a Python float.
    return float(svr.predict(encoded)[0])


In [13]:
# Example: a party of 3 (male, non-smoker) at Sunday dinner leaving a 5.00 tip.
total_bill = predict_bill(
    tip=5.00, sex='Male', smoker='No',
    day='Sun', time='Dinner', size=3,
)

print(f"‚Üí Predicted total_bill: ${total_bill:.2f} dollars")


‚Üí Predicted total_bill: $26.27 dollars


In [14]:
## Hyperparameter Tuning using GridSearchCV for SVR

from sklearn.model_selection import GridSearchCV

# Search space for Support Vector Regression (SVR):
#   C      -> penalty on training errors (regularization strength is
#             inversely proportional to C; higher C = fit training data harder)
#   gamma  -> kernel coefficient for 'rbf', 'poly' and 'sigmoid'
#             (higher gamma = more curvature / complexity)
#   kernel -> shape of the function being learned
#
# Kernels included:
#   'rbf'     -> general-purpose non-linear kernel (the SVR default)
#   'poly'    -> suited to polynomial relationships between features and target
#   'sigmoid' -> tanh-shaped kernel
#
# 'linear' is left out of this grid.
# NOTE(review): nothing earlier in this notebook compares against a linear
# kernel, so the assumption that non-linear kernels do better here is
# untested -- consider adding 'linear' as a baseline.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# GridSearchCV fits one SVR per parameter combination and per CV fold, then
# picks the combination with the best mean cross-validation score.
grid = GridSearchCV(
    SVR(),
    param_grid,
    scoring='r2',   # rank candidates by R^2
    cv=5,           # 5-fold cross-validation
    verbose=3,      # log every individual fit
    n_jobs=-1,      # use all CPU cores
    refit=True,     # refit the best candidate on the full training set
)


In [15]:
# Run the full hyperparameter search on the training data:
#   - fits one SVR for EVERY combination in param_grid
#     (5 C values x 5 gamma values x 3 kernels = 75 candidates)
#   - scores each candidate with 5-fold cross-validation (cv=5),
#     i.e. 375 fits in total
#   - selects the best combination according to scoring='r2'
#   - because refit=True, retrains a final SVR with the best hyperparameters
#     on the ENTIRE training set
#
# This cell can take a while, and with verbose=3 it logs every single fit.
grid.fit(X_train, y_train)


Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.058 total time=   0.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.067 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.113 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.145 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.089 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.328 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.083 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.074 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.174 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.013 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.111 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf

0,1,2
,estimator,SVR()
,param_grid,"{'C': [0.1, 1, ...], 'gamma': [1, 0.1, ...], 'kernel': ['rbf', 'poly', ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'sigmoid'
,degree,3
,gamma,0.001
,coef0,0.0
,tol,0.001
,C,1000
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [16]:
# Pretty-print the winning hyperparameter combination from GridSearchCV.

# Title followed by the opening separator rule.
print("\nüèÜ Best Hyperparameters Found:", "-" * 40, sep="\n")

# One line per hyperparameter, name left-aligned in a 10-character column.
for name, setting in grid.best_params_.items():
    print(f"üîπ {name:<10} ‚Üí {setting}")

# Closing separator rule.
print("-" * 40)



üèÜ Best Hyperparameters Found:
----------------------------------------
üîπ C          ‚Üí 1000
üîπ gamma      ‚Üí 0.001
üîπ kernel     ‚Üí sigmoid
----------------------------------------
