In [1]:
## data set (tips)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')



In [2]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [4]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [5]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [6]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [7]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [8]:
# independent and dependent features

x = df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y = df['total_bill']

In [9]:
# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [10]:
x_train.shape, x_test.shape

((170, 6), (74, 6))

In [11]:
# feature encodin(one hot encoding, label encoding)


In [12]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [13]:
l1 = LabelEncoder()
l2 = LabelEncoder()
l3 = LabelEncoder()


In [14]:
x_train['sex']= l1.fit_transform(x_train['sex'])
x_train['smoker']=l2.fit_transform(x_train['smoker'])
x_train['time']=l3.fit_transform(x_train['time'])


In [15]:
x_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
234,3.0,1,1,Sat,0,2
227,3.0,1,0,Sat,0,4
180,3.68,1,1,Sun,0,4
5,4.71,1,0,Sun,0,4
56,3.0,1,1,Sat,0,4


In [16]:
x_test['sex'] = l1.transform(x_test['sex'])
x_test['smoker'] = l2.transform(x_test['smoker'])
x_test['time'] = l3.transform(x_test['time'])

In [17]:
x_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


In [18]:
## one hot encoding -> column transformer
from sklearn.compose import ColumnTransformer



In [19]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(drop='first'),[3])],
                       remainder='passthrough')

In [20]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
x_train = ct.fit_transform(x_train)

In [21]:
x_test = ct.transform(x_test)

In [22]:
# svr

from sklearn.svm import SVR

svr = SVR()

In [23]:
svr.fit(x_train, y_train)

In [24]:
y_pred = svr.predict(x_test)

In [25]:
y_pred

array([17.29817835, 14.44316543, 19.08482736, 27.81171877, 13.76497288,
       15.12775863, 15.29560601, 14.46994309, 17.82231704, 16.87090037,
       17.07062749, 13.08950849, 12.60529636, 15.12775863, 12.67729756,
       14.26693057, 21.88958837, 20.19771537, 15.01532731, 27.44429646,
       19.80590834, 20.14873394, 20.47281611, 13.06913355, 21.18325617,
       14.15635532, 13.9541114 , 23.61405744, 19.08482736, 27.50777455,
       22.68388103, 13.69108447, 19.5319204 , 17.88949595, 20.83583833,
       20.4404197 , 13.65208944, 26.30430434, 15.11341024, 14.53273551,
       12.90750074, 13.29668771, 15.73308163, 15.67579728, 14.06648371,
       13.08497553, 13.7885589 , 16.71465308, 12.75325599, 15.76744732,
       14.59276898, 20.51220173, 26.21768814, 13.56175737, 18.77523034,
       13.50570119, 24.07581075, 13.45451285, 18.40535462, 19.42105983,
       28.19260314, 18.61115242, 25.09666279, 14.78330685, 13.92339752,
       20.54334313, 14.71712968, 12.66170931, 20.81096476, 20.87

In [26]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [27]:
print('r2 score:',r2_score(y_test, y_pred))

r2 score: 0.496241694353699


In [28]:
print('mean_squared_error:',mean_squared_error(y_test, y_pred))

mean_squared_error: 35.089954899771485


In [29]:
print('mean_absolute_error:',mean_absolute_error(y_test, y_pred))

mean_absolute_error: 4.175959078886017


In [30]:
# hyperparameter tuning using grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

In [31]:
grid = GridSearchCV(SVR(), parameters, refit = True, verbose = 3)

In [32]:
# fitting the model for grid search
grid.fit(x_train, y_train)

Fitting 5 folds for each of 140 candidates, totalling 700 fits
[CV 1/5] END ..C=0.001, gamma=1, kernel=linear;, score=-0.032 total time=   0.0s
[CV 2/5] END ..C=0.001, gamma=1, kernel=linear;, score=-0.045 total time=   0.0s
[CV 3/5] END ..C=0.001, gamma=1, kernel=linear;, score=-0.009 total time=   0.0s
[CV 4/5] END ..C=0.001, gamma=1, kernel=linear;, score=-0.060 total time=   0.0s
[CV 5/5] END ..C=0.001, gamma=1, kernel=linear;, score=-0.108 total time=   0.0s
[CV 1/5] END .....C=0.001, gamma=1, kernel=poly;, score=0.524 total time=   0.0s
[CV 2/5] END .....C=0.001, gamma=1, kernel=poly;, score=0.449 total time=   0.0s
[CV 3/5] END .....C=0.001, gamma=1, kernel=poly;, score=0.350 total time=   0.0s
[CV 4/5] END .....C=0.001, gamma=1, kernel=poly;, score=0.576 total time=   0.0s
[CV 5/5] END .....C=0.001, gamma=1, kernel=poly;, score=0.258 total time=   0.0s
[CV 1/5] END .....C=0.001, gamma=1, kernel=rbf;, score=-0.059 total time=   0.0s
[CV 2/5] END .....C=0.001, gamma=1, kernel=rbf

In [None]:
grid.best_params_

In [None]:
grid_prediction = grid.predict(x_test)

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error

print('r2 score:',r2_score(y_test, grid_prediction))

print('mean_absolute_error:',mean_absolute_error(y_test, grid_prediction))