In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# (A)

### Load *tips.csv* data set and create a column for percent tipped and use it as the target. One-hot-encode categorical features

In [2]:
# Read the tips data from the working directory and preview the first rows.
df = pd.read_csv("./tips.csv")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Target: the tip expressed as a percentage of the total bill.
df['percent_tip'] = df['tip'].div(df['total_bill']).mul(100)

# Everything except the target column is kept as a candidate feature.
# NOTE(review): `tip` and `total_bill` stay in the feature set even though the
# target is computed from them -- confirm this is intended.
targets = df['percent_tip']
features = df.drop(columns='percent_tip')

In [4]:
# One-hot encode the categorical columns (sex, smoker, day, time); the numeric
# columns pass through unchanged.
features = pd.get_dummies(features)
features.head()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.5,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0


# (B)

### Engineer new features that represent the pairwise interactions of all the original features. The engineered features are just pairwise products of the original features.

In [5]:
# Build degree-2, interaction-only terms: a constant "1" column, the original
# features, and every pairwise product (no squared terms).
polys = PolynomialFeatures(2, interaction_only=True)
features_engineered = polys.fit_transform(features)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older releases that lack it.
if hasattr(polys, "get_feature_names_out"):
    cols = polys.get_feature_names_out(features.columns)
else:
    cols = polys.get_feature_names(features.columns)

# Carry the original row index over so the engineered frame stays aligned
# with `targets`.
features_engineered = pd.DataFrame(features_engineered, columns=cols, index=features.index)
features_engineered.head()

Unnamed: 0,1,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,...,day_Sat day_Sun,day_Sat day_Thur,day_Sat time_Dinner,day_Sat time_Lunch,day_Sun day_Thur,day_Sun time_Dinner,day_Sun time_Lunch,day_Thur time_Dinner,day_Thur time_Lunch,time_Dinner time_Lunch
0,1.0,16.99,1.01,2.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,10.34,1.66,3.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,21.01,3.5,3.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,23.68,3.31,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,24.59,3.61,4.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
# Shape of the raw frame; by this point it also holds the percent_tip target column.
df.shape

(244, 8)

#### 8 columns: the 7 original columns plus the `percent_tip` target

In [7]:
# Shape of the feature frame after one-hot encoding.
features.shape

(244, 13)

#### 13 features

In [8]:
# Shape after adding the constant column and the pairwise interaction terms.
features_engineered.shape

(244, 92)

#### 92 features

In [9]:
# TODO: compute the number of possible interactions in code.
# Expected from the shapes above: 13 encoded features give 13 * 12 / 2 = 78
# pairwise products; adding the 13 originals and the constant "1" column
# gives the 92 engineered columns.

# (C)

### Drop any features with zero standard deviations. Standardize the remaining features and fit a multiple linear regressor.

In [10]:
# Flag the columns whose standard deviation is exactly zero (constant columns)
# and remove them before fitting.
lx = features_engineered.std().eq(0)
drop_cols = features_engineered.columns[lx]
features_engineered = features_engineered.drop(columns=drop_cols)
print(features_engineered.shape)

(244, 80)


#### 12 features were dropped for having std = 0

# (D)

### Use cross-validation to determine train and test R-squared. Is there any evidence of over-fitting? Explain.

In [11]:
# Z-score every remaining column: subtract its mean, then divide by its
# standard deviation.
# NOTE(review): the statistics come from the full data set, computed before
# cross-validation, so held-out folds contribute to the scaling -- confirm
# this is acceptable.
col_mean = features_engineered.mean()
col_std = features_engineered.std()
features_engineered = features_engineered.sub(col_mean).div(col_std)

In [12]:
# Ordinary least squares, scored with cross_validate's default splitter and
# the estimator's default score (R-squared for LinearRegression); training
# scores are kept so they can be compared with the test scores.
lr = LinearRegression()
results = cross_validate(
    estimator=lr,
    X=features_engineered,
    y=targets,
    return_train_score=True,
)



In [13]:
# Average training-fold score across the cross-validation splits.
R2_train = np.mean(results['train_score'])
R2_train

0.8721172326540821

In [14]:
# Average held-out-fold score across the cross-validation splits.
R2_test = np.mean(results['test_score'])
R2_test

-6.117258220092566e+25

In [15]:
# Refit on the full data set, then label each fitted coefficient with the
# name of the feature it belongs to.
lr.fit(X=features_engineered, y=targets)
coef = pd.Series(data=lr.coef_, index=features_engineered.columns)

In [16]:
# Coefficients ordered from most positive to most negative.
coef.sort_values(ascending=False)

smoker_No day_Sun         1.987580e+13
smoker_No day_Sat         1.586793e+13
tip smoker_No             1.108622e+13
tip smoker_Yes            1.038487e+13
smoker_Yes time_Lunch     9.353277e+12
smoker_No time_Lunch      9.205282e+12
smoker_Yes                9.007892e+12
smoker_No day_Thur        8.052912e+12
size smoker_No            7.891474e+12
day_Fri time_Dinner       7.206639e+12
size smoker_Yes           6.590694e+12
sex_Female time_Dinner    5.003784e+12
smoker_Yes day_Sun        4.294763e+12
sex_Male time_Dinner      3.915069e+12
smoker_Yes day_Sat        3.762851e+12
total_bill time_Dinner    3.733916e+12
total_bill sex_Male       3.269479e+12
day_Thur                  3.215540e+12
day_Sun                   2.852406e+12
total_bill time_Lunch     2.678622e+12
total_bill sex_Female     2.605514e+12
sex_Male                  2.543277e+12
total_bill day_Sat        2.143174e+12
total_bill day_Sun        2.098509e+12
total_bill day_Thur       1.641386e+12
day_Fri time_Lunch       

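#### This is terribly overfitted. The mean train R-squared is about 0.87 while the mean test R-squared is about -6e25, so the model fails completely on the held-out folds. The astronomical coefficients (on the order of 1e13) are a further symptom: many of the one-hot and interaction columns are perfectly collinear (for example, sex_Female + sex_Male is always 1), so the least-squares solution is numerically unstable.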

# (E)

### Provide an interpretation of the *sex_Male x smoker_Yes* interaction feature

In [17]:
# Fitted coefficient of the male-and-smoker interaction term.
coef['sex_Male smoker_Yes']

-15560714522717.121

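#### The *sex_Male x smoker_Yes* feature is 1 only for a customer who is both male and a smoker, and 0 otherwise. Its coefficient is the additional change in predicted percent tip for a male smoker, beyond the separate male and smoker effects, with all other features held fixed (because the features were standardized, the change is per one standard deviation of the feature). But this value ^ is an absurd number: there are far too many features (80) for only 244 rows and many of them are redundant, so overfitting is occurring and the individual coefficients cannot be trusted.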

In [18]:
# This pair is mutually exclusive, so its product column was constant and was
# removed with the zero-standard-deviation features. Catch the resulting
# KeyError and print it, so the failed lookup stays visible without aborting
# "Restart & Run All" before part (F).
try:
    print(coef['smoker_No smoker_Yes'])
except KeyError as err:
    print("KeyError:", err)

KeyError: 'smoker_No smoker_Yes'

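#### This error occurs because there is no interaction between the two features. You cannot be both at the same time: you cannot be sitting in the smoker section and not sitting in the smoker section at the same time. The product *smoker_No x smoker_Yes* is therefore 0 in every row, so it has zero standard deviation, was dropped in part (C), and never received a coefficient.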

# (F)

### Display all the feature coefficients sorted from most positive to most negative coefficient

In [19]:
# All coefficients, ordered from most positive to most negative.
coef.sort_values(ascending=False)

smoker_No day_Sun         1.987580e+13
smoker_No day_Sat         1.586793e+13
tip smoker_No             1.108622e+13
tip smoker_Yes            1.038487e+13
smoker_Yes time_Lunch     9.353277e+12
smoker_No time_Lunch      9.205282e+12
smoker_Yes                9.007892e+12
smoker_No day_Thur        8.052912e+12
size smoker_No            7.891474e+12
day_Fri time_Dinner       7.206639e+12
size smoker_Yes           6.590694e+12
sex_Female time_Dinner    5.003784e+12
smoker_Yes day_Sun        4.294763e+12
sex_Male time_Dinner      3.915069e+12
smoker_Yes day_Sat        3.762851e+12
total_bill time_Dinner    3.733916e+12
total_bill sex_Male       3.269479e+12
day_Thur                  3.215540e+12
day_Sun                   2.852406e+12
total_bill time_Lunch     2.678622e+12
total_bill sex_Female     2.605514e+12
sex_Male                  2.543277e+12
total_bill day_Sat        2.143174e+12
total_bill day_Sun        2.098509e+12
total_bill day_Thur       1.641386e+12
day_Fri time_Lunch       