<a href="https://colab.research.google.com/github/provincit/colab_public/blob/main/tips_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns



In [2]:
br = '\n'
sns.set(color_codes=True)
tips = sns.load_dataset('tips')
print (tips.head(), br)


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4 



In [3]:
tips.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [4]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
br = '\n'


data = tips.drop(['tip'], axis=1)
target = tips['tip']
data = pd.get_dummies(data, columns=['sex', 'smoker','day', 'time'])

X = data.values
y = target.values
print ('X and y shapes (post conversion):')
print (X.shape, y.shape, br)


X and y shapes (post conversion):
(244, 12) (244,) 



In [6]:

rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X, y)
print ('feature importance (first 6 features):')
feature_importances = rfr.feature_importances_
features = list(data.columns.values)
importance = sorted(zip(feature_importances, features), reverse=True)
[print (row) for i, row in enumerate(importance) if i < 6]
print ()
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=0)

feature importance (first 6 features):
(0.763320323509417, 'total_bill')
(0.06472915414124665, 'size')
(0.0358517935578476, 'smoker_No')
(0.03572877523497224, 'smoker_Yes')
(0.02393573971034533, 'day_Sat')
(0.022073551311225303, 'day_Sun')



In [7]:



model = LinearRegression()
model_name = model.__class__.__name__
print ('<<' + model_name + '>>', br)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print (rmse, '(rmse)', br)
print ('predict from new data:')



<<LinearRegression>> 

0.9636287548943023 (rmse) 

predict from new data:


In [10]:
from sklearn.linear_model import Ridge

In [18]:
ridge = Ridge(alpha=1).fit(X_train, y_train)


In [19]:
ridge_y_pred = ridge.predict(X_test)

In [20]:
rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
print (rmse, '(rmse)', br)
print ('predict from new data:')


0.953361588565088 (rmse) 

predict from new data:


Grid CV

In [21]:
from sklearn.model_selection import GridSearchCV

In [25]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Best Score:  0.3560718797551016
Best Params:  {'alpha': 1000}


[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    0.9s finished


In [24]:
ridge = Ridge(alpha=1000).fit(X_train, y_train)
ridge_y_pred = ridge.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
print (rmse, '(rmse)', br)
print ('predict from new data:')

0.9207586806428449 (rmse) 

predict from new data:


Predict New data

In [8]:


df1 = pd.get_dummies(pd.DataFrame({'total_bill':[17.23],'sex':['Female'], 'smoker': ['Yes'], 'day': ['Sun'], 'time': ['dinner'], 'size': [3]}))
# dummies_frame = pd.get_dummies(df)
new_df = df1.reindex(columns = data.columns, fill_value=0)

In [9]:
# define input
new_input = new_df.values
# get prediction for new input
new_output = model.predict(new_input)
# summarize input and output
print(new_input, new_output)

[[17.23  3.    0.    1.    1.    0.    0.    0.    0.    1.    0.    0.  ]] [2.96520264]


Predicting from tips with DictVectorizer encoding

In [26]:
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge,Lasso, ElasticNet, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [27]:
def get_scores(model, Xtest, ytest):
 y_pred = model.predict(Xtest)
 return np.sqrt(mean_squared_error(ytest, y_pred)), model.__class__.__name__



In [30]:

print ('rmse:')
lr = LinearRegression().fit(X_train, y_train)
rmse, lr_name = get_scores(lr, X_test, y_test)
print (rmse, '(' + lr_name + ')')
rr = Ridge(random_state=0).fit(X_train, y_train)
rmse, rr_name = get_scores(rr, X_test, y_test)
print (rmse, '(' + rr_name + ')')
lasso = Lasso(random_state=0).fit(X_train, y_train)
rmse, lasso_name = get_scores(lasso, X_test, y_test)
print (rmse, '(' + lasso_name + ')')
en = ElasticNet(random_state=0).fit(X_train, y_train)
rmse, en_name = get_scores(en, X_test, y_test)
print (rmse, '(' + en_name + ')')
sgdr = SGDRegressor(random_state=0, max_iter=1000, tol=0.001)
sgdr.fit(X_train, y_train)
rmse, sgdr_name = get_scores(sgdr, X_test, y_test)
print (rmse, '(' + sgdr_name + ')', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)
print ('rmse std:')
lr_std = LinearRegression().fit(X_train_std, y_train)
rmse, lr_name = get_scores(lr_std, X_test_std, y_test)
print (rmse, '(' + lr_name + ')')
rr_std = Ridge(random_state=0).fit(X_train_std, y_train)
rmse, rr_name = get_scores(rr_std, X_test_std, y_test)
print (rmse, '(' + rr_name + ')')
lasso_std = Lasso(random_state=0).fit(X_train_std, y_train)

rmse, lasso_name = get_scores(lasso_std, X_test_std, y_test)
print (rmse, '(' + lasso_name + ')')
en_std = ElasticNet(random_state=0)
en_std.fit(X_train_std, y_train)
rmse, en_name = get_scores(en_std, X_test_std, y_test)
print (rmse, '(' + en_name + ')')
sgdr_std = SGDRegressor(random_state=0, max_iter=1000, tol=0.001)
sgdr_std.fit(X_train_std, y_train)
rmse, sgdr_name = get_scores(sgdr_std, X_test_std, y_test)
print (rmse, '(' + sgdr_name + ')')


rmse:
0.9636287548943023 (LinearRegression)
0.953361588565088 (Ridge)
0.939990006405585 (Lasso)
0.9226279884146948 (ElasticNet)
1.3268943860807163 (SGDRegressor) 

rmse std:
0.9282348917156419 (LinearRegression)
0.9266547268827202 (Ridge)
1.3951571751071663 (Lasso)
1.1985456909410515 (ElasticNet)
0.9236479854972407 (SGDRegressor)
