In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Fetch the "tips" example dataset through seaborn; the result is a pandas DataFrame.
df = sns.load_dataset(name='tips')

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
# Frequency of each category in the 'sex' column.
df.loc[:, 'sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [6]:
# Frequency of each category in the 'smoker' column.
df.loc[:, 'smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [7]:
# Frequency of each category in the 'day' column.
df.loc[:, 'day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [8]:
# Frequency of each category in the 'time' column.
df.loc[:, 'time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [9]:
## Independent and dependent features
# Target: total_bill (the first column). Features: every remaining column.
y = df['total_bill']
X = df.drop(columns=['total_bill'])

In [10]:
# Preview the first five target values.
y.head(n=5)

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [11]:
## Train/test split
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

## Feature Encoding (Label encoding and One-hot encoding)

In [12]:
## Label encoding
# X['sex']=np.where(X['sex']=='Male',1,0)

In [13]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per binary categorical column (sex, smoker, time),
# so each keeps its own fitted class mapping.
l1, l2, l3 = (LabelEncoder() for _ in range(3))

In [14]:
# Training features before any encoding.
X_train.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,Female,No,Sun,Dinner,2
181,5.65,Male,Yes,Sun,Dinner,2
225,2.5,Female,Yes,Fri,Lunch,2
68,2.01,Male,No,Sat,Dinner,2
104,4.08,Female,No,Sat,Dinner,2


In [15]:
# Fit each encoder on the training column and overwrite that column with its integer codes.
for encoder, column in ((l1, 'sex'), (l2, 'smoker'), (l3, 'time')):
    X_train[column] = encoder.fit_transform(X_train[column])

In [16]:
# Training features after label encoding ('day' is still categorical).
X_train.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,0,0,Sun,0,2
181,5.65,1,1,Sun,0,2
225,2.5,0,1,Fri,1,2
68,2.01,1,0,Sat,0,2
104,4.08,0,0,Sat,0,2


In [17]:
# Reuse the encoders fitted on the training split (transform only, no refit)
# so the test columns receive the same integer codes.
for encoder, column in ((l1, 'sex'), (l2, 'smoker'), (l3, 'time')):
    X_test[column] = encoder.transform(X_test[column])

In [18]:
# Test features after label encoding.
X_test.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [20]:
# remainder='passthrough' -> only the columns listed in the transformer tuple are
# encoded; every other column is passed through unchanged (remainder='drop'
# would discard them instead).
# drop='first' -> drop the first dummy column produced by one-hot encoding so the
# remaining dummies are not perfectly correlated (avoids multicollinearity).
# The column is selected by name: the previous selector, X[3], looked up a column
# labelled 3 in the DataFrame and raised KeyError, because the labels are strings.
ct = ColumnTransformer(
    transformers=[('oneHot', OneHotEncoder(drop='first'), ['day'])],
    remainder='passthrough',
)

In [21]:
import sys

# Print NumPy arrays in full instead of abbreviating them.
np.set_printoptions(threshold=sys.maxsize)

# Learn the one-hot categories from the training split only, then apply the
# fitted transformer to both splits. Both names are rebound to the transformed output.
ct.fit(X_train)
X_train, X_test = ct.transform(X_train), ct.transform(X_test)

In [22]:
## Support vector regressor
from sklearn.svm import SVR

# Baseline model using the library's default hyper-parameters.
svr = SVR()

In [23]:
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = svr.fit(X_train, y_train).predict(X_test)

In [24]:
from sklearn.metrics import r2_score, mean_absolute_error

# Report R^2 first, then mean absolute error, for the baseline predictions.
for metric in (r2_score, mean_absolute_error):
    print(metric(y_test, y_pred))

0.49798620106004743
4.463296539661224


In [33]:
# Hyper-parameter grid for the SVR search: every combination of these values is tried.
# NOTE(review): gamma only applies to the 'rbf' and 'poly' kernels, so the
# 'linear' candidates are refit once per gamma value with identical results.
param = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.1, 1, 10, 100, 1000],
    'epsilon': [1, 0.1, 0.01],
    'gamma': [0.1, 0.01, 0.001, 0.0001],
}
param

{'kernel': ['rbf', 'linear', 'poly'],
 'C': [0.1, 1, 10, 100, 1000],
 'epsilon': [1, 0.1, 0.01],
 'gamma': [0.1, 0.01, 0.001, 0.0001]}

In [34]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param` with the library's default cross-validation
# and scoring; n_jobs=-1 runs the fits on all available CPU cores.
grid = GridSearchCV(svr, param, n_jobs=-1)

In [35]:
# Run the grid search on the training split.
grid.fit(X=X_train, y=y_train)

In [36]:
# Predict with the refitted best model (grid.predict delegates to best_estimator_).
y_pred = grid.best_estimator_.predict(X_test)

In [37]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

{'C': 100, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'rbf'}

In [38]:
# The estimator the grid search selected, as exposed by the fitted GridSearchCV object.
grid.best_estimator_

In [39]:
from sklearn.metrics import r2_score, mean_absolute_error

# Evaluate the tuned model: mean absolute error is printed first, then R^2.
score = r2_score(y_test, y_pred)
print(mean_absolute_error(y_test, y_pred), score, sep='\n')

4.257962961284045
0.5625127165435169
