# Regression

- Linear regression
- ElasticNet
- Decision tree regression
- Random forest regression


In [7]:
# Fetch seaborn's bundled "tips" example dataset as a DataFrame

import seaborn as sns
tips_df = sns.load_dataset(name="tips")
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


## Data preparation

In [8]:
import pandas as pd

## Replace the two-level text labels by 0/1 integer codes
# NOTE(review): "smoker" maps "No" to 1, so smoker == 1 means NON-smoker - confirm this is intended.
binary_codes = {
    "sex": {"Male": 1, "Female": 0},
    "smoker": {"No": 1, "Yes": 0},
    "time": {"Lunch": 1, "Dinner": 0},
}

# Work on a copy instead of mutating in place, so the originally loaded
# object is not modified behind the reader's back.
tips_df = tips_df.copy()
for column, codes in binary_codes.items():
    # Labels that are not in `codes` pass through unchanged, which keeps this
    # cell safe to re-run on already-encoded data. The explicit int cast makes
    # the column plain numeric whatever dtype it had before (object or
    # category), so a later pd.get_dummies call does not expand it again.
    tips_df[column] = tips_df[column].map(lambda label: codes.get(label, label)).astype(int)

tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,1,Sun,0,2
1,10.34,1.66,1,1,Sun,0,3
2,21.01,3.50,1,1,Sun,0,3
3,23.68,3.31,1,1,Sun,0,2
4,24.59,3.61,0,1,Sun,0,4
5,25.29,4.71,1,1,Sun,0,4
6,8.77,2.00,1,1,Sun,0,2
7,26.88,3.12,1,1,Sun,0,4
8,15.04,1.96,1,1,Sun,0,2
9,14.78,3.23,1,1,Sun,0,2


In [10]:
# We one-hot encode the weekday

# Name the column explicitly: "day" is the only multi-level categorical, and
# this keeps the 0/1-coded sex/smoker/time columns from being expanded as well
# should they still carry an object or category dtype.
tips_df_oh = pd.get_dummies(tips_df, columns=["day"])
tips_df_oh

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Thur,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,1,0,2,0,0,0,1
1,10.34,1.66,1,1,0,3,0,0,0,1
2,21.01,3.50,1,1,0,3,0,0,0,1
3,23.68,3.31,1,1,0,2,0,0,0,1
4,24.59,3.61,0,1,0,4,0,0,0,1
5,25.29,4.71,1,1,0,4,0,0,0,1
6,8.77,2.00,1,1,0,2,0,0,0,1
7,26.88,3.12,1,1,0,4,0,0,0,1
8,15.04,1.96,1,1,0,2,0,0,0,1
9,14.78,3.23,1,1,0,2,0,0,0,1


In [19]:
## We split the data into train and test

from sklearn.model_selection import train_test_split

# Target is the tip amount; every remaining column is used as a feature.
y = tips_df_oh["tip"]
# Pass the axis by keyword: the positional form `drop("tip", 1)` was deprecated
# in pandas 1.3 and is rejected by pandas 2.x.
X = tips_df_oh.drop("tip", axis=1)
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

## Predictive modeling

In [23]:
## Baseline model: ordinary least-squares linear regression

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
rg = LinearRegression().fit(X_train, y_train)
rg.predict(X_test)

array([ 2.98577733,  1.76319847,  4.0963818 ,  4.15224911,  2.04883145,
        2.41613679,  3.69964059,  2.22695999,  2.34397891,  2.17616715,
        2.68317015,  2.11122442,  2.05011669,  2.08660912,  1.58993186,
        3.22637193,  3.16351155,  3.00110155,  2.50136087,  6.37253304,
        3.71936844,  3.47012109,  2.06618142,  1.99332888,  2.95239211,
        2.30937202,  1.90815199,  3.12627484,  3.31749821,  7.18718887,
        4.99876007,  1.59003921,  3.59981189,  2.80281701,  2.80099298,
        4.15620617,  2.01119184,  5.7668041 ,  2.29267225,  3.03280993,
        2.08609914,  2.42409678,  3.23012811,  2.15933205,  1.95053476,
        0.57586022,  1.70669668,  3.28071515,  1.91026666,  2.35377998,
        3.18742702,  3.48488598,  4.78829328,  2.72393909,  2.79913297,
        2.43886383,  1.38793314,  2.79250597,  2.98777154,  2.76242902,
        4.85393283,  2.42971297,  2.9418349 ,  2.55566475,  3.02539796,
        2.93332236,  2.29258061,  1.20817646,  3.81575794,  3.68

In [39]:
# Put the held-out tips next to the linear-regression predictions
# (rows keep the index of y_test).
pred_df = pd.DataFrame(
    dict(
        target=y_test,
        prediction=rg.predict(X_test),
    )
)
pred_df

Unnamed: 0,prediction,target
24,2.985777,3.18
6,1.763198,2.00
153,4.096382,2.00
211,4.152249,5.16
198,2.048831,2.00
176,2.416137,2.00
192,3.699641,2.56
124,2.226960,2.52
9,2.343979,3.23
101,2.176167,3.00


In [40]:
# Root-mean-squared error computed by hand: sqrt(mean((target - prediction)^2))
(
    pred_df["target"]
    .sub(pred_df["prediction"])
    .pow(2)
    .mean()
) ** 0.5

1.0509540952296665

In [42]:
from sklearn import metrics

# Same RMSE via scikit-learn: square root of the mean squared error
metrics.mean_squared_error(
    y_true=pred_df["target"],
    y_pred=pred_df["prediction"],
) ** 0.5

1.0509540952296665

In [43]:
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# One estimator per model family, all with default hyper-parameters.
lr_rg = LinearRegression()
en_rg = ElasticNet()
# The tree-based estimators are randomized; seed them (same seed as the
# train/test split) so that refitting gives the same scores on every run.
tree_rg = DecisionTreeRegressor(random_state=42)
RF_rg = RandomForestRegressor(random_state=42)

In [44]:
# Train every estimator on the same training split
for estimator in (lr_rg, en_rg, tree_rg, RF_rg):
    estimator.fit(X_train, y_train)

# One column of test-set predictions per model, next to the true tips
pred_df = pd.DataFrame(
    dict(
        target=y_test,
        pred_lr=lr_rg.predict(X_test),
        pred_EN=en_rg.predict(X_test),
        pred_tree=tree_rg.predict(X_test),
        pred_RF=RF_rg.predict(X_test),
    )
)

pred_df

Unnamed: 0,pred_EN,pred_RF,pred_lr,pred_tree,target
24,3.035009,2.996,2.985777,2.24,3.18
6,1.837878,2.440,1.763198,4.00,2.00
153,3.547446,3.538,4.096382,3.50,2.00
211,3.692618,3.261,4.152249,2.00,5.16
198,2.296146,1.860,2.048831,2.00,2.00
176,2.825917,3.376,2.416137,3.00,2.00
192,3.968879,4.500,3.699641,5.00,2.56
124,2.239811,1.820,2.226960,1.80,2.52
9,2.488987,2.401,2.343979,1.96,3.23
101,2.553990,2.385,2.176167,3.02,3.00


In [48]:
# RMSE of every column against the held-out target, lowest (best) first.
# The "target" column is scored against itself, so its entry is always 0.
(
    pred_df
    .apply(lambda column: metrics.mean_squared_error(pred_df["target"], column) ** 0.5)
    .sort_values()
)

target       0.000000
pred_EN      0.888949
pred_lr      1.050954
pred_RF      1.069940
pred_tree    1.546420
dtype: float64