In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [4]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [6]:
# load default dataset "tips"

tips_df = sns.load_dataset("tips");
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
# divide dataset into "feature" and "label" sets or columns

X = tips_df.drop(['tip'], axis=1)
y = tips_df['tip']

In [19]:
# verify division of dataset for feature set

X.head()


Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [20]:
# verify division of dataset for label set

y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

In [24]:
# drop categorical columns from the dataset

X_num = X.drop(["sex", "smoker", "day", "time"], axis=1)

In [25]:
X_num.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [27]:
# isolate categorical columns from dataset

X_cat = X.filter(["sex", "smoker", "day", "time"])

In [29]:
X_cat.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [34]:
# convert categorical columns into one-hot encoded columns

X_cat_num = pd.get_dummies(X_cat, drop_first=True)

In [35]:
X_cat_num.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,True,True,False,False,True,True
1,False,True,False,False,True,True
2,False,True,False,False,True,True
3,False,True,False,False,True,True
4,True,True,False,False,True,True


In [37]:
# join one-hoted encoded with X_numabs

X_processed = pd.concat([X_num, X_cat_num], axis=1)

In [38]:
X_processed.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,True,True,False,False,True,True
1,10.34,3,False,True,False,False,True,True
2,21.01,3,False,True,False,False,True,True
3,23.68,2,False,True,False,False,True,True
4,24.59,4,True,True,False,False,True,True


In [53]:
# Split the dataset into training (80%) and testing (20%) data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.20, random_state=0)

In [54]:
print(X_train)

     total_bill  size  sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  \
7         26.88     4       False       True    False    False     True   
83        32.68     2       False      False    False    False    False   
176       17.89     2       False      False    False    False     True   
106       20.49     2       False      False    False     True    False   
156       48.17     6       False       True    False    False     True   
..          ...   ...         ...        ...      ...      ...      ...   
67         3.07     1        True      False    False     True    False   
192       28.44     2       False      False    False    False    False   
117       10.65     2        True       True    False    False    False   
47        32.40     4       False       True    False    False     True   
172        7.25     2       False      False    False    False     True   

     time_Dinner  
7           True  
83         False  
176         True  
106         True  
156 

In [55]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# scaling training set

X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [57]:
print(X_test)

[[-0.28607202  0.43284366 -0.69084928  0.76509206 -0.26726124  1.35132785
  -0.67470281  0.61885275]
 [-0.20932963  1.48790007 -0.69084928 -1.30703226 -0.26726124  1.35132785
  -0.67470281  0.61885275]
 [-0.07777125 -0.62221275 -0.69084928  0.76509206 -0.26726124 -0.74001287
   1.48213404  0.61885275]
 [-1.41966678 -1.67726916  1.44749373  0.76509206 -0.26726124  1.35132785
  -0.67470281  0.61885275]
 [-0.43078625 -0.62221275  1.44749373 -1.30703226  3.74165739 -0.74001287
  -0.67470281 -1.61589329]
 [-1.58411476 -0.62221275  1.44749373 -1.30703226  3.74165739 -0.74001287
  -0.67470281  0.61885275]
 [-0.24989347 -0.62221275 -0.69084928 -1.30703226 -0.26726124  1.35132785
  -0.67470281  0.61885275]
 [ 0.34321559 -0.62221275 -0.69084928 -1.30703226 -0.26726124 -0.74001287
   1.48213404  0.61885275]
 [-0.22467811  0.43284366  1.44749373 -1.30703226 -0.26726124 -0.74001287
   1.48213404  0.61885275]
 [ 1.58424969  1.48790007 -0.69084928 -1.30703226 -0.26726124 -0.74001287
   1.48213404  0.

In [59]:
# using LinearRegression in scikit learn

from sklearn.linear_model import LinearRegression

In [61]:
# import metrics for MAE, MSE, RMSE

from sklearn import metrics

In [62]:
lr = LinearRegression()
lr_regressor = lr.fit(X_train, y_train)
y_pred = lr_regressor.predict(X_test)

In [63]:
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.7080218832979825
MSE:  0.8939195221609609
MAE:  0.9454731736865731


In [64]:
# using LinearRegression in scikit learn

from sklearn.neighbors import KNeighborsRegressor

In [82]:
knn = KNeighborsRegressor(n_neighbors=5)
knn_regressor = knn.fit(X_train, y_train)

In [83]:
y_pred_knn = knn_regressor.predict(X_test)

In [84]:
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred_knn))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred_knn))
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred_knn)))

MAE:  0.7513877551020406
MSE:  0.9462902040816326
MAE:  0.9727744877830794
