In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [45]:
df = sns.load_dataset('tips')

In [46]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [47]:
df.shape

(244, 7)

In [48]:
df.total_bill.unique()

array([16.99, 10.34, 21.01, 23.68, 24.59, 25.29,  8.77, 26.88, 15.04,
       14.78, 10.27, 35.26, 15.42, 18.43, 14.83, 21.58, 10.33, 16.29,
       16.97, 20.65, 17.92, 20.29, 15.77, 39.42, 19.82, 17.81, 13.37,
       12.69, 21.7 , 19.65,  9.55, 18.35, 15.06, 20.69, 17.78, 24.06,
       16.31, 16.93, 18.69, 31.27, 16.04, 17.46, 13.94,  9.68, 30.4 ,
       18.29, 22.23, 32.4 , 28.55, 18.04, 12.54, 10.29, 34.81,  9.94,
       25.56, 19.49, 38.01, 26.41, 11.24, 48.27, 13.81, 11.02, 17.59,
       20.08, 16.45,  3.07, 20.23, 15.01, 12.02, 17.07, 26.86, 25.28,
       14.73, 10.51, 27.2 , 22.76, 17.29, 19.44, 16.66, 10.07, 32.68,
       15.98, 34.83, 13.03, 18.28, 24.71, 21.16, 28.97, 22.49,  5.75,
       16.32, 22.75, 40.17, 27.28, 12.03, 12.46, 11.35, 15.38, 44.3 ,
       22.42, 20.92, 15.36, 20.49, 25.21, 18.24, 14.31, 14.  ,  7.25,
       38.07, 23.95, 25.71, 17.31, 29.93, 10.65, 12.43, 24.08, 11.69,
       13.42, 14.26, 15.95, 12.48, 29.8 ,  8.52, 14.52, 11.38, 22.82,
       19.08, 20.27,

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [50]:
X = df.drop('total_bill', axis = 1)
y = df['total_bill']

In [51]:
X

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.50,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,5.92,Male,No,Sat,Dinner,3
240,2.00,Female,Yes,Sat,Dinner,2
241,2.00,Male,Yes,Sat,Dinner,2
242,1.75,Male,No,Sat,Dinner,2


In [52]:
y

Unnamed: 0,total_bill
0,16.99
1,10.34
2,21.01
3,23.68
4,24.59
...,...
239,29.03
240,27.18
241,22.67
242,17.82


In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

In [54]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((195, 6), (49, 6), (195,), (49,))

In [55]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
154,2.0,Male,No,Sun,Dinner,4
167,4.5,Male,No,Sun,Dinner,4
110,3.0,Male,No,Sat,Dinner,2
225,2.5,Female,Yes,Fri,Lunch,2


In [56]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [57]:
df.sample(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
9,14.78,3.23,Male,No,Sun,Dinner,2


In [58]:
X.sample(1)

Unnamed: 0,tip,sex,smoker,day,time,size
210,2.0,Male,Yes,Sat,Dinner,3


In [59]:
cat_cols = ["sex", "smoker", "day", "time"]
num_cols = ["tip", "size"]

In [60]:
num_pipeline = Pipeline(steps = [('imputation', SimpleImputer(strategy='median')),
                  ('scaling', StandardScaler())])

In [61]:
cat_pipeline = Pipeline(steps = [('imputation', SimpleImputer(strategy='most_frequent')),
                                 ('encoding', OneHotEncoder())])

In [62]:
preprocessor = ColumnTransformer([('num_pipeline', num_pipeline, num_cols),
                                  ('cat_pipeline', cat_pipeline, cat_cols)])

In [63]:
preprocessor

In [64]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [65]:
X_train

array([[-1.47443803, -0.57766863,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.71612531,  1.47042924,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.19880579,  1.47042924,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.43283335, -0.57766863,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.29060464, -0.57766863,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.1034652 ,  0.44638031,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [66]:
X_test

array([[-1.48209775, -1.60171757,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.04984713, -0.57766863,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        ,  0.        ],
       [ 0.36389583,  0.44638031,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [-0.33313909, -0.57766863,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ],
       [ 0.04984713, -0.57766863,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ],
       [-1.14506988, -0.57766863,  0.        ,  1.        ,  0.        ,
         1.        ,  

In [67]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [68]:
models = {'lin_reg' : LinearRegression(),
          'dt_reg' : DecisionTreeRegressor(),
          'svm_reg' : SVR(),
          'rf_reg' : RandomForestRegressor()}

In [69]:
models

{'lin_reg': LinearRegression(),
 'dt_reg': DecisionTreeRegressor(),
 'svm_reg': SVR(),
 'rf_reg': RandomForestRegressor()}

In [70]:
for i in range(len(models)):
  print(list(models.values())[i])

LinearRegression()
DecisionTreeRegressor()
SVR()
RandomForestRegressor()


In [73]:
from sklearn.metrics import r2_score
def model_train_evaluate(X_train, X_test, y_train, y_test, models):
  evaluation = {}
  for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_score = r2_score(y_test, y_pred)
    evaluation[list(models.keys())[i]] = model_score
  return evaluation

In [74]:
model_train_evaluate(X_train, X_test, y_train, y_test, models)

{'lin_reg': 0.6719006761625261,
 'dt_reg': 0.1864373494374716,
 'svm_reg': 0.4069433055293954,
 'rf_reg': 0.6389322045098356}

In [75]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf

In [76]:
X_train, X_test

(array([[-1.47443803, -0.57766863,  1.        , ...,  0.        ,
          1.        ,  0.        ],
        [-0.71612531,  1.47042924,  0.        , ...,  0.        ,
          1.        ,  0.        ],
        [ 1.19880579,  1.47042924,  0.        , ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [ 0.43283335, -0.57766863,  1.        , ...,  1.        ,
          0.        ,  1.        ],
        [-1.29060464, -0.57766863,  0.        , ...,  0.        ,
          1.        ,  0.        ],
        [ 0.1034652 ,  0.44638031,  1.        , ...,  0.        ,
          1.        ,  0.        ]]),
 array([[-1.48209775, -1.60171757,  1.        ,  0.        ,  0.        ,
          1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
          1.        ,  0.        ],
        [ 0.04984713, -0.57766863,  1.        ,  0.        ,  1.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
          1.        ,  0.        ],
     

In [90]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred

array([ 9.6135    , 17.40401667, 24.406     , 13.780465  , 20.49435   ,
       11.1665    , 15.3382    , 23.4365    , 16.30617917, 22.8322    ,
       18.6792    , 12.80544214, 30.2116    , 20.5306    , 34.5928    ,
       14.26884048, 16.9284    , 15.6045    , 13.66052857, 12.7809    ,
       33.1573    , 18.7229    , 20.4877    , 34.5813    , 12.381575  ,
       11.414     , 18.3663    , 24.1391    , 19.1752    , 18.70555714,
       12.88906714, 32.6109    , 13.83735   , 25.6629    , 34.0623    ,
       12.75852833, 26.5487    , 19.38905333, 20.5685    , 11.414     ,
       21.433     , 19.2351    , 19.6583    , 22.0037    , 20.64815333,
       20.4662    , 20.0531    , 31.0439    , 15.8508    ])

In [91]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6314859531184106