In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
import matplotlib.pyplot as plt

In [2]:
# Load the restaurant tips dataset from the notebook's working directory
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="./tips.csv")
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# (A)

In [3]:
# Tip expressed as a percentage of the total bill (tip / total_bill * 100).
df['percent_tip'] = df['tip'].div(df['total_bill']).mul(100)

In [4]:
# Preview the frame to confirm the new percent_tip column was added.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,percent_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


In [5]:
# Show the full row with the highest percent tip.
# idxmax() returns an index *label*, so the row must be fetched with the
# label-based .loc accessor; positional .iloc only happens to agree while the
# frame still carries the default 0..n-1 index.
df.loc[df['percent_tip'].idxmax()]

total_bill        7.25
tip               5.15
sex               Male
smoker             Yes
day                Sun
time            Dinner
size                 2
percent_tip    71.0345
Name: 172, dtype: object

The maximum percent tipped was 71%. The bill was small, only 7.25 USD with a 5.15 USD tip.

# (B)

In [6]:
# One-hot encode the categorical columns (sex, smoker, day, time); numeric
# columns pass through unchanged.
df = pd.get_dummies(data=df)
df.head(n=5)

Unnamed: 0,total_bill,tip,size,percent_tip,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,5.944673,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,16.054159,0,1,1,0,0,0,1,0,1,0
2,21.01,3.5,3,16.658734,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,13.978041,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,14.680765,1,0,1,0,0,0,1,0,1,0


In [7]:
# Part (B): regress percent_tip on every other column.
targets = df['percent_tip']
features = df.drop(columns='percent_tip')

# Z-score each feature column (subtract its mean, divide by its sample std)
# so the fitted coefficients are comparable across columns.
features = (
    features
    .sub(features.mean(), axis='columns')
    .div(features.std(), axis='columns')
)
features.describe()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
count,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0,244.0
mean,-4.887939e-16,-3.95245e-17,-1.9110400000000003e-17,8.099168e-17,-8.099168e-17,5.014204e-16,-4.8231e-16,2.020242e-16,5.396412e-16,-3.3670700000000004e-17,-1.155724e-16,-5.460113e-17,5.460113e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.877687,-1.444221,-1.650376,-0.7428789,-1.340598,-1.271614,-0.7831793,-0.2899972,-0.7428789,-0.671213,-0.582463,-1.605499,-0.6203065
25%,-0.7232245,-0.7214882,-0.5989615,-0.7428789,-1.340598,-1.271614,-0.7831793,-0.2899972,-0.7428789,-0.671213,-0.582463,-1.605499,-0.6203065
50%,-0.2236408,-0.07102918,-0.5989615,-0.7428789,0.7428789,0.7831793,-0.7831793,-0.2899972,-0.7428789,-0.671213,-0.582463,0.6203065,-0.6203065
75%,0.4876833,0.407781,0.4524529,1.340598,0.7428789,0.7831793,1.271614,-0.2899972,1.340598,1.483734,1.709811,0.6203065,1.605499
max,3.484905,5.06037,3.606696,1.340598,0.7428789,0.7831793,1.271614,3.434177,1.340598,1.483734,1.709811,0.6203065,1.605499


In [11]:
# Fit ordinary least squares on the standardized features; fit() returns the
# estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(features, targets)

# Raw coefficient vector first, then the same values labelled by feature name.
print(lr.coef_)
coef = pd.Series(lr.coef_, index=features.columns)
coef.sort_values(ascending=False)

[-6.44728811  6.47446452 -0.18231749  0.07329151 -0.07329151 -0.4015981
  0.4015981  -0.15454143 -0.14984432  0.4687448  -0.23863296 -0.08192756
  0.08192756]


tip            6.474465
day_Sun        0.468745
smoker_Yes     0.401598
time_Lunch     0.081928
sex_Female     0.073292
sex_Male      -0.073292
time_Dinner   -0.081928
day_Sat       -0.149844
day_Fri       -0.154541
size          -0.182317
day_Thur      -0.238633
smoker_No     -0.401598
total_bill    -6.447288
dtype: float64

#### The features with the largest positive coefficients are tip (by far the largest), followed by day_Sun and smoker_Yes. The most negative coefficients belong to total_bill (by far the largest in magnitude) and smoker_No.

# (C)

In [None]:
# Remove the raw tip amount from the frame for part (C).
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
# errors='ignore' keeps the cell idempotent: re-running it after `tip` has
# already been removed no longer raises KeyError.
df = df.drop(columns='tip', errors='ignore')

In [None]:
# Preview the frame to confirm the tip column is gone.
df.head(n=5)

In [None]:
# Part (C): the regression target is now the total bill.
targets = df['total_bill']

In [None]:
# Rebuild the design matrix for part (C). The `features` frame left over from
# part (B) was derived before `tip` was dropped and still contains the
# standardized `total_bill` column, which is now the target, so reusing it
# would leak the answer into the model.
# errors='ignore' lets this cell work whether or not `tip` is still in `df`.
# NOTE(review): percent_tip is kept as a predictor even though it is derived
# from total_bill; confirm that part (C) intends to include it.
features = df.drop(columns=['total_bill', 'tip'], errors='ignore')

# Z-score the predictors, as in part (B), so coefficients are comparable.
features = (features - features.mean()) / features.std()

lr = LinearRegression()
lr.fit(features, targets)

# Label each coefficient with its feature name, largest first.
coef = pd.Series(lr.coef_, index=features.columns)
coef.sort_values(ascending=False)