In [68]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Importing the dataset

In [69]:
# Load seaborn's "tips" example dataset; the result is the DataFrame used throughout the notebook
df = sns.load_dataset("tips")

In [70]:
# Preview the first rows to check the columns and value formats
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Data Cleaning

In [71]:
# Count the missing entries in every column (all zeros means nothing to impute)
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [72]:
# Checking datatypes: shows which columns are numeric and which are categorical (and so need encoding)
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [73]:
# Move the target variable ('tip') to the last position so that features and
# target can later be separated purely by column position
new_order = ['total_bill', 'sex', 'smoker', 'day', 'time', 'size', 'tip']

df = df.loc[:, new_order]
df.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,16.99,Female,No,Sun,Dinner,2,1.01
1,10.34,Male,No,Sun,Dinner,3,1.66
2,21.01,Male,No,Sun,Dinner,3,3.5
3,23.68,Male,No,Sun,Dinner,2,3.31
4,24.59,Female,No,Sun,Dinner,4,3.61


# Train, Test Split

In [74]:
from sklearn.model_selection import train_test_split

# Independent features are every column except the last; the dependent
# variable is the last column
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out a quarter of the rows for testing; the fixed random_state makes
# the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [75]:
# Resetting indices so every split has a clean 0..n-1 index instead of the
# shuffled row labels left by train_test_split.
# The targets are reset together with the features: the feature frames are later
# rebuilt from arrays with a fresh default index, so leaving y_train / y_test on
# the shuffled labels would make X and y misaligned for any index-based pandas
# operation.
# Reassignment is used instead of inplace=True to avoid mutating the objects
# returned by train_test_split.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Features Encoding

In [76]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [77]:
# Number of distinct values per column; helps tell two-level categorical features from multi-level ones
df.nunique()

total_bill    229
sex             2
smoker          2
day             4
time            2
size            6
tip           123
dtype: int64

In [78]:
# Categorical features grouped by cardinality: two-level columns first,
# then columns with more than two levels
binary_columns, multiclass_columns = ['sex', 'smoker', 'time'], ['day']

## Label Encoding

In [79]:
def label_encoder(df: pd.DataFrame, binary_columns: list) -> dict:
    """
    Label-encode the listed columns of ``df`` in place, one LabelEncoder per column.

    Every listed column is overwritten with the integer codes produced by a
    LabelEncoder fitted on that same column. The fitted encoders are returned
    so that the identical mapping can be applied to other data.

    Parameters:
    df (pd.DataFrame): DataFrame whose columns are encoded; it is modified in place.
    binary_columns (list): Names of the columns to encode.

    Returns:
    dict: Maps each encoded column name to its fitted LabelEncoder.
    """
    fitted_encoders = {}
    for name in binary_columns:
        # A separate encoder per column keeps each column's class mapping independent
        fitted_encoders[name] = LabelEncoder()
        df[name] = fitted_encoders[name].fit_transform(df[name])

    return fitted_encoders

In [80]:
# Fit the label encoders on the binary categorical features of X_train and keep
# the fitted encoders so the same mapping can be applied to X_test
encoder_dict = label_encoder(df=X_train, binary_columns=binary_columns)
X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,11.24,1,1,Sat,0,2
1,10.34,1,0,Sun,0,3
2,21.01,1,0,Sun,0,3
3,20.23,1,0,Sat,0,2
4,40.55,1,1,Sun,0,2


In [81]:
# Encode X_test with the encoders already fitted on X_train (transform only,
# no refitting) so both splits share the same category-to-integer mapping
for column in encoder_dict:
    encoder = encoder_dict[column]
    X_test[column] = encoder.transform(X_test[column])

X_test.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.21,0,0,Sun,0,3
1,20.29,1,1,Sat,0,2
2,13.81,1,1,Sat,0,2
3,18.29,1,1,Sat,0,4
4,15.01,1,1,Sat,0,2


In [82]:
# Sanity check: label encoding must not have introduced missing values
X_train.isna().sum()

total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

## One-hot Encoding and Standard Scaling using Column Transformer

In [84]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [86]:
# Standardise the numeric features, one-hot encode the multi-class categorical
# feature, and pass the remaining (already label-encoded) columns through untouched.
# Each step name becomes the prefix of its output columns, so the scaling step is
# named 'scaler' to match the 'scaler__*' column names shown in the outputs below.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['total_bill', 'size']),
        ('onehot', OneHotEncoder(sparse_output=False, dtype=int), multiclass_columns),
    ],
    remainder='passthrough',
)

In [91]:
# Fit the preprocessor on the training features and transform them, then wrap
# the resulting array in a DataFrame labelled with the generated feature names.
# NOTE: X_train is overwritten with the transformed frame, so this cell cannot be
# re-run on its own; the cells that build X_train must be re-run first.
transformed_array = preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(
    data=transformed_array,
    columns=preprocessor.get_feature_names_out(),
)
X_train.head()

Unnamed: 0,scaler__total_bill,scaler__size,onehot__day_Fri,onehot__day_Sat,onehot__day_Sun,onehot__day_Thur,remainder__sex,remainder__smoker,remainder__time
0,-0.979646,-0.593989,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,-1.082206,0.441249,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.133695,0.441249,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.04481,-0.593989,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2.360377,-0.593989,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [94]:
# Apply the already-fitted preprocessor to the test features (transform only,
# nothing is refitted) and rebuild a labelled DataFrame.
# NOTE: X_test is overwritten with the transformed frame, so this cell cannot be
# re-run on its own; the cells that build X_test must be re-run first.
transformed_array = preprocessor.transform(X_test)
X_test = pd.DataFrame(
    data=transformed_array,
    columns=preprocessor.get_feature_names_out(),
)
X_test.head()

Unnamed: 0,scaler__total_bill,scaler__size,onehot__day_Fri,onehot__day_Sat,onehot__day_Sun,onehot__day_Thur,remainder__sex,remainder__smoker,remainder__time
0,-0.413289,0.441249,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.051647,-0.593989,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,-0.686782,-0.593989,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,-0.176263,1.476486,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,-0.550036,-0.593989,0.0,1.0,0.0,0.0,1.0,1.0,0.0


# Model Training and Predictions

In [96]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [114]:
# Base support-vector regressor with default settings; it is passed to the grid
# search, which tries the kernel / degree / C / gamma values from the parameter grid
svr = SVR()

In [115]:
# Candidate values for each SVR hyperparameter explored by the grid search
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
degree = [1, 2, 3]
C = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

# NOTE(review): per the scikit-learn SVR documentation, `degree` is only used by
# the 'poly' kernel and `gamma` is not used by 'linear', so this full cross-product
# re-fits equivalent models for some kernels; a list of per-kernel grids would
# avoid that.
param_grid = {
    'kernel': kernel,
    'degree': degree,
    'C': C,
    'gamma': gamma,
}

In [116]:
# Exhaustive cross-validated search over every combination in param_grid;
# n_jobs=-1 lets the fits run in parallel on all available processors
grid = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    n_jobs=-1,
)

In [117]:
# Run the search on the training split only; the test split stays untouched for the final evaluation
grid.fit(X_train, y_train)

In [118]:
# Best params: the hyperparameter combination that scored best in cross-validation
grid.best_params_

{'C': 100, 'degree': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [119]:
# Best model: the estimator GridSearchCV fitted with the best-scoring parameters.
# NOTE(review): this rebinds `svr` from the unfitted base estimator to the fitted
# best model; re-running the GridSearchCV construction cell afterwards would pick
# up this object instead of a fresh SVR().
svr = grid.best_estimator_
# Best score: mean cross-validated score obtained by that parameter combination
grid.best_score_

0.35084576304955667

In [120]:
# Predict tips for the held-out test features with the best model from the grid search
y_pred = svr.predict(X_test)

# Model Scores

In [124]:
from sklearn.metrics import r2_score, mean_absolute_error

In [125]:
# Evaluate the predictions on the held-out test set.
# The R^2 value is stored under `r2`, not `r2_score`: assigning to `r2_score`
# would replace the imported metric function with a float, and re-running this
# cell would then fail with "TypeError: 'float' object is not callable".
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2, mae

(0.38344683822601755, 0.810883680325955)