In [None]:
import pandas as pd
%matplotlib inline

df_raw = pd.read_csv("data/untappd_Beer_ratings.csv")

In [None]:
df_raw.head()

In [None]:
df_raw.shape

In [None]:
df_raw['brewery'].nunique()

In [None]:
for i in df_raw['brewery'].sort_values().unique():
    print(i)

In [None]:
df_raw['style'].nunique()

In [None]:
for i in df_raw['style'].sort_values().unique():
    print(i)

> Let's clean the numerical data first.

In [None]:
df_raw.info()

In [None]:
df_raw.head()

In [None]:
# Remove the parentheses around each rating string: ')' is stripped from both
# ends first, then '(' is stripped from both ends of the result.
df_raw['rating'] = df_raw['rating'].apply(lambda raw: raw.strip(')').strip('('))

In [None]:
#convert to float
df_raw['rating'] = df_raw['rating'].astype(float)

In [None]:
# df_raw['rating'].unique()
# we see in the unique values that some are rounded to the thousandths place
# we want to limit the scope of the data by rounding to the tenth place
df_raw['rating']=df_raw['rating'].apply(lambda x: round(x,1))

In [None]:
df_raw['rating'].unique()

In [None]:
df_raw.head()

In [None]:
# now lets clean ibu
df_raw['ibu'] = df_raw['ibu'].apply(lambda x: x.strip(' IBU'))

In [None]:
# df_raw['ibu'].unique()
# only odd value is 'N/A'
df_raw['ibu'] = df_raw['ibu'].replace('N/A', 0)
df_raw['ibu'] = df_raw['ibu'].astype(float)

In [None]:
df_raw.info()

In [None]:
# now lets clean abv
# NOTE: str.strip('% ABV') removes any of the characters '%', ' ', 'A', 'B', 'V'
# from both ends of the string -- it is a character set, not a suffix. A value
# such as 'N/A ABV' therefore also loses the trailing 'A' of 'N/A' and is left
# as 'N/'.
df_raw['abv'] = df_raw['abv'].apply(lambda x: x.strip('% ABV'))

In [None]:
# df_raw['abv'].unique()
# there's a weird 'N/' value
df_raw['abv'] = df_raw['abv'].replace('N/', 0)
df_raw['abv'] = df_raw['abv'].astype(float)

In [None]:
df_raw['abv'] = df_raw['abv'].apply(lambda x: round(x,1))

In [None]:
# df_raw['abv'].unique()
df_raw.info()

In [None]:
df = df_raw[['abv', 'ibu', 'rating']]

In [None]:
df.plot(kind='scatter', x='abv', y='rating', alpha=0.2)

In [None]:
df.plot(kind='scatter', x='ibu', y='rating', alpha=0.5)

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import matplotlib
matplotlib.rcParams.update({'font.size': 12})
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [None]:
# Features are every column but the last; the last column is the target.
df_train_X = df.iloc[:, :-1]
df_train_y = df.iloc[:, -1]

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df_train_X, df_train_y,test_size=0.3,random_state=3)
print(len(X_test), len(y_test))

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
# restricted and in this case linear and ridge regression resembles
rr.fit(X_train, y_train)

In [None]:
# Heavily regularised ridge model, to contrast with the alpha=0.01 fit (rr)
rr100 = Ridge(alpha=100)
rr100.fit(X_train, y_train)

# Score every fitted model on both the train and the test split
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)

score_report = [
    ("linear regression train score:", train_score),
    ("linear regression test score:", test_score),
    ("ridge regression train score low alpha:", Ridge_train_score),
    ("ridge regression test score low alpha:", Ridge_test_score),
    ("ridge regression train score high alpha:", Ridge_train_score100),
    ("ridge regression test score high alpha:", Ridge_test_score100),
]
for caption, score in score_report:
    print(caption, score)

# Coefficients of both ridge fits, one marker per coefficient
plt.plot(
    rr.coef_,
    linestyle='none', marker='*', markersize=5, color='red',
    alpha=0.7,   # marker transparency
    zorder=7,    # draw these markers on top of the others
    label=r'Ridge; $\alpha = 0.01$',
)
plt.plot(
    rr100.coef_,
    linestyle='none', marker='d', markersize=6, color='blue',
    alpha=0.5,   # marker transparency
    label=r'Ridge; $\alpha = 100$',
)
# plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
plt.xlabel('Coefficient Index', fontsize=16)
plt.ylabel('Coefficient Magnitude', fontsize=16)
plt.legend(fontsize=13, loc=4)
plt.show()

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees (random_state fixed for repeatability)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data. The fit has to run here: the cells that
# follow call rf.predict / rf.score, which fail on a forest that was never fitted.
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae). The target is the beer rating, so the
# error is reported in rating points rather than 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print("random forest train score:", train_score)
print("random forest test score:", test_score)

# DATA SUCKS 
### Let's clean it up some more and do some feature engineering

In [None]:
df_raw.head()

In [None]:
def simple_style(x):
    """Collapse a free-form beer style label into a coarse style bucket.

    The label is lower-cased and checked against a fixed, ordered list of
    keywords using substring matching. The first keyword found wins, so
    earlier entries take precedence (any label containing 'ale' is bucketed
    as 'ale' before 'stout', 'ipa', etc. are considered).

    Parameters
    ----------
    x : str
        Raw style label. Non-string values (for example the NaN pandas uses
        for a missing cell) are bucketed as 'other' instead of raising.

    Returns
    -------
    str
        The first matching keyword, or 'other' when none matches.
    """
    # Missing / non-text cells have no style information to match on.
    if not isinstance(x, str):
        return 'other'

    x = x.lower()
    types = ['ale', 'stout', 'porter', 'sour', 'lager', 'pilsner', 'ipa', 'cider', 'wine', 'beer']
    for a in types:
        if a in x:
            return a
    return 'other'

In [None]:
simple_style('something asf awrg aeh pilsner asdf')
# 'pilsner' in 'asdeh aer aerg a pilsner'

In [None]:
df_raw['style'] = df_raw['style'].apply(lambda x: simple_style(x))

In [None]:
for i in df_raw['style'].unique():
    print(i)

In [None]:
df_raw.head()
df_raw['style'].value_counts()

In [None]:
df_all = pd.merge(df_raw, pd.get_dummies(df_raw['style']), left_index=True, right_index=True)

In [None]:
df_all.columns

In [None]:
df_all.head()
df_all.drop(['brewery', 'name', 'id', 'Unnamed: 0', 'style'], axis=1, inplace=True)

In [None]:
df_all.head()

In [None]:
# Features are every column except the target.
df_train_X = df_all.drop('rating', axis=1)
# Select the target by name, not by position, so it is always the very column
# that was excluded from the features regardless of column order.
df_train_y = df_all['rating']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df_train_X, df_train_y,test_size=0.3,random_state=3)
print(len(X_test), len(y_test))

In [None]:
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae). The target is the beer rating, so the
# error is reported in rating points rather than 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print("random forest train score:", train_score)
print("random forest test score:", test_score)

In [None]:
from sklearn.metrics import roc_auc_score
# Sweep the minimum leaf size and report train/test scores for each setting.
sample_leaf_options = [1,5,10,50,100,200,500]
for leaf_size in sample_leaf_options :
    # max_features=1.0 considers every feature at each split, which is what the
    # former "auto" option meant for regressors; current scikit-learn releases
    # no longer accept "auto".
    model = RandomForestRegressor(n_estimators = 200, oob_score = True, n_jobs = -1,random_state =50,max_features = 1.0, min_samples_leaf = leaf_size)
    model.fit(X_train, y_train)
    train_score=model.score(X_train, y_train)
    test_score=model.score(X_test, y_test)
    print(leaf_size)
    print("random forest train score:", train_score)
    print("random forest test score:", test_score)

In [None]:
from sklearn.metrics import roc_auc_score
# Sweep the number of trees (leaf size fixed at 10) and report train/test scores.
n_estimators = [100,200, 500, 1000, 2000]
for n_estimator in n_estimators :
    # max_features=1.0 considers every feature at each split, which is what the
    # former "auto" option meant for regressors; current scikit-learn releases
    # no longer accept "auto".
    model = RandomForestRegressor(n_estimators = n_estimator, oob_score = True, n_jobs = -1,random_state =50,max_features = 1.0, min_samples_leaf = 10)
    model.fit(X_train, y_train)
    train_score=model.score(X_train, y_train)
    test_score=model.score(X_test, y_test)
    print(n_estimator)
    print("random forest train score:", train_score)
    print("random forest test score:", test_score)

In [None]:
df_beers = pd.read_csv("data/beers.csv")
df_beers.head()

In [None]:
df_beers.organic.value_counts()

In [None]:
df_breweries = pd.read_csv("data/untappd_breweries_ratings.csv")
df_breweries.head()

In [None]:
df_breweries['brewery'] = df_breweries['brewery'].apply(lambda x: x.split('/', 3)[-1])
df_breweries.head()

In [None]:
df_breweries['raters'] = df_breweries['raters'].apply(lambda x: x.split(' ', 1)[0])
df_breweries.head()

In [None]:
df_breweries['rating'] = df_breweries['rating'].apply(lambda x: x.strip(')'))
df_breweries['rating'] = df_breweries['rating'].apply(lambda x: x.strip('('))
df_breweries.head()

In [None]:
df_breweries['ibu'] = df_breweries['ibu'].apply(lambda x: x.strip(' IBU'))
df_breweries.head()

In [None]:
df_breweries['abv'] = df_breweries['abv'].apply(lambda x: x.strip('% ABV'))
df_breweries.head()

In [None]:
df_breweries['date'] = df_breweries['date'].apply(lambda x: x.strip('Added '))
df_breweries.head()

In [None]:
df_breweries['date'] = pd.to_datetime(df_breweries.date)

In [None]:
df_breweries.head()

In [None]:
df_breweries['ibu'] = df_breweries['ibu'].replace('N/A', 0)

In [None]:
df_breweries['abv'].unique()
df_breweries['abv'] = df_breweries['abv'].replace('N/', 0)

In [None]:
df_breweries['rating'].unique()
df_breweries['rating'] = df_breweries['rating'].replace('N/A', 0)

In [None]:
df_breweries['raters'].sort_values().unique()
df_breweries['raters'] = df_breweries['raters'].apply(lambda x: x.replace(",", ""))
df_breweries['raters'].sort_values().unique()

In [None]:
df_breweries.info()
for col in ['abv', 'ibu', 'rating', 'raters']:
    df_breweries[col] = df_breweries[col].astype(float)
df_breweries.info()

In [None]:
df_breweries['rating']=df_breweries['rating'].apply(lambda x: round(x,1))
df_breweries['abv']=df_breweries['abv'].apply(lambda x: round(x,1))

In [None]:
import datetime
(datetime.datetime.today()-df_breweries.loc[1,'date']).days

In [None]:
# Age of each entry in whole days. "Now" is captured once so every row is
# measured against the same instant, and the subtraction is vectorised instead
# of calling datetime.today() once per row.
today = datetime.datetime.today()
df_breweries['days_since'] = (today - df_breweries['date']).dt.days

In [None]:
df_breweries.head()

In [None]:
df_breweries['raters_per_day'] = df_breweries['raters']/df_breweries['days_since']

In [None]:
df_breweries.head()
df_breweries['raters_per_day'] = df_breweries['raters_per_day'].apply(lambda x: round(x,2))

In [None]:
df_breweries.head()

In [None]:
df_breweries_test = df_breweries.copy()

In [None]:
df_breweries_test['style'] = df_breweries_test['style'].apply(lambda x: simple_style(x))

In [None]:
df_breweries_test = df_breweries_test[['name','style', 'abv', 'rating', 'raters', 'raters_per_day']]

In [None]:
df_all_test = pd.merge(df_breweries_test, pd.get_dummies(df_breweries_test['style']), left_index=True, right_index=True)
df_all_test.head()

In [None]:
# Not the last expression in the cell, so this result is not displayed.
df_all_test['raters_per_day'].unique()
# NOTE(review): this targets the literal string 'nan'. A missing float value is
# not equal to that string, so real NaNs are handled by the np.nan replacement
# below -- confirm whether this line does anything for this column.
df_all_test['raters_per_day'] = df_all_test['raters_per_day'].replace('nan', 0)
# raters_per_day is raters / days_since, so a days_since of 0 yields inf (or NaN
# for 0/0); both are mapped to 0 here.
df_all_test['raters_per_day'] = df_all_test['raters_per_day'].replace(np.inf, 0)
df_all_test['raters_per_day'] = df_all_test['raters_per_day'].replace(np.nan, 0)


In [None]:
# Keep a full copy (including name, style and rater columns) before trimming
# df_all_test down to the modelling columns.
df_nodrop = df_all_test.copy()
df_all_test.drop(['style', 'raters', 'raters_per_day', 'name'], axis=1, inplace=True)

In [None]:
df_all_test.head()

In [None]:
df_train_X = df_all_test.drop('rating', axis=1)
df_train_y = df_all_test['rating']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df_train_X, df_train_y,test_size=0.3,random_state=3)
print(len(X_test), len(y_test))

In [None]:
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae). The target is the beer rating, so the
# error is reported in rating points rather than 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print("random forest train score:", train_score)
print("random forest test score:", test_score)

> Let's remove the rows that are missing a value in abv or rating

In [None]:
df_clean = df_all_test.copy()

In [None]:
df_clean.head()

In [None]:
# Keep only rows where both abv and rating are non-zero (0 is the placeholder
# that was substituted for unparseable abv / rating values earlier on).
has_abv = df_clean['abv'] != 0
has_rating = df_clean['rating'] != 0
df_clean = df_clean[has_abv & has_rating]
print(df_clean.shape)
df_clean.head()

In [None]:
df_train_X = df_clean.drop('rating', axis=1)
df_train_y = df_clean['rating']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df_train_X, df_train_y,test_size=0.3,random_state=3)
print(len(X_test), len(y_test))

In [None]:
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae). The target is the beer rating, so the
# error is reported in rating points rather than 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print("random forest train score:", train_score)
print("random forest test score:", test_score)

# NLP TIME

So these results aren't super. We should have enough data, but we don't have enough features. Looking at the website, the only beefy-ish piece of data we can grab without doing a more intensive scrape is the description. We can use NLP to do some feature creation!

In [None]:
import QK

# functions to clean
# QK.process_data(texts)
# QK.stop_stem(texts)
# QK.generate_words(texts)
# QK.find_features(texts) # returns dictionary

In [None]:
df_texts_raw = pd.read_csv("data/untappd_beer_texts.csv")
df_texts_raw.head()

In [None]:
df_texts = df_texts_raw['text']
df_texts.head()

In [None]:
df_texts = QK.process_data(df_texts)
df_texts.head()

In [None]:
df_texts = QK.stop_stem((df_texts.apply(str)))
df_texts.head()

In [None]:
all_words = QK.generate_words(df_texts)
all_words

In [None]:
# Grab the 500 most common words to use as features.
# NOTE(review): this takes the first 500 keys of all_words; they are the most
# common words only if QK.generate_words returns its keys ordered by
# descending frequency -- confirm against QK.

word_features = list(all_words.keys())[:500]
word_features

In [None]:
# featuresets = [(find_features(text), label) for (text, label) in df_texts] #msgs or test_messages
test_features = [QK.find_features(text, word_features) for text in df_texts]

In [None]:
df_nlp = pd.merge(pd.DataFrame(df_texts_raw['name']),pd.DataFrame(test_features), left_index=True, right_index=True)

In [None]:
df_nlp.head()

In [None]:
df_nlp = pd.merge(df_nlp, df_nodrop, left_on = 'name_x', right_on = 'name')

In [None]:
# Drop every object-dtype column in a single pass. The list of columns is built
# first and dropped once, rather than mutating the frame from inside a
# comprehension that is wrapped in a retry loop.
object_cols = [col for col in df_nlp.columns if df_nlp[col].dtype=='object']
df_nlp.drop(columns=object_cols, inplace=True)

In [None]:
# NOTE(review): 'name_x' / 'name_y' are the suffixed names pandas gives to
# clashing 'name' columns in a merge. If either of them is object dtype it has
# already been removed by the object-column drop in the previous cell and this
# raises KeyError -- confirm these two drops are still needed.
df_nlp.drop('name_x', axis=1, inplace=True)
df_nlp.drop('name_y', axis=1, inplace=True)

In [None]:
# Build the column list here so this lookup does not depend on a later cell
# having been executed first.
cols = list(df_nlp.columns)
cols[296]

In [None]:
cols = list(df_nlp.columns)
# for i, col in enumerate(cols):
#     print(i, type(df_nlp[col]))

In [None]:
[col for col in cols if df_nlp[col].dtype=='object']

In [None]:
# NOTE(review): the second assignment starts again from df_nlp rather than from
# df_nlp_run, so the result of dropping 'name' on the first line is discarded
# and only 'style_y' is actually removed from df_nlp_run here.
df_nlp_run = df_nlp.drop('name', axis=1)
df_nlp_run = df_nlp.drop('style_y', axis=1)

In [None]:
cols = list(df_nlp_run.columns)
[col for col in cols if df_nlp[col].dtype=='object']

In [None]:
df_nlp_run = df_nlp_run.drop('name', axis=1)

In [None]:
df_nlp_run.info()

In [None]:
df_train_X = df_nlp_run.drop('rating', axis=1)
df_train_y = df_nlp_run['rating']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df_train_X, df_train_y,test_size=0.3,random_state=3)
print(len(X_test), len(y_test))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#prepare a range of parameters to test
# 1.0 means "consider every feature", which is what the former 'auto' option
# meant for regressors; current scikit-learn releases no longer accept 'auto'.
max_features = [1.0, 'log2', 'sqrt', 0.2, 0.5]
n_estimators = [100,500,1000,5000]
min_samples_leaf = [1,5,10,50,100,200,500]
n_jobs = [1,-1]
# Create a random forest model and grid-search it.
# NOTE(review): only n_jobs is passed to param_grid below; the max_features,
# n_estimators and min_samples_leaf candidate lists above are defined but not
# searched. n_jobs sets how many jobs run in parallel, so this search does not
# compare different model hyperparameters -- confirm that is intended.
model = RandomForestRegressor(oob_score=True, max_features=1.0, n_estimators=500, min_samples_leaf = 1, n_jobs=1)
grid = GridSearchCV(estimator=model, verbose=10, param_grid=dict(n_jobs=n_jobs))
grid.fit(X_train, y_train)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
# print(grid.best_estimator_.max_features)
# print(grid.best_estimator_.n_estimators)
print(grid.best_estimator_.min_samples_leaf)
print(grid.best_estimator_.n_jobs)
# (RandomForestRegressor has no `solver` attribute, so it is not printed here.)


In [None]:
# Forest used for the final fit. max_features=1.0 considers every feature at
# each split -- the regressor meaning of the former 'auto' option, which current
# scikit-learn releases no longer accept.
rf = RandomForestRegressor(oob_score=True, max_features=1.0, n_estimators=500, min_samples_leaf = 1, n_jobs=1)

In [None]:
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae). The target is the beer rating, so the
# error is reported in rating points rather than 'degrees'.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print("random forest train score:", train_score)
print("random forest test score:", test_score)

In [None]:
!pip install joblib

In [None]:
from joblib import dump, load
dump(rf, 'rf_optimal.joblib') 

# TEST DIFFERENT MODELS

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import matplotlib
matplotlib.rcParams.update({'font.size': 12})
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
# restricted and in this case linear and ridge regression resembles
rr.fit(X_train, y_train)

In [None]:
rr100 = Ridge(alpha=100) #  comparison with alpha value
rr100.fit(X_train, y_train)
train_score=lr.score(X_train, y_train)
test_score=lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train,y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train,y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)
print("linear regression train score:", train_score)
print("linear regression test score:", test_score)
print("ridge regression train score low alpha:", Ridge_train_score)
print("ridge regression test score low alpha:", Ridge_test_score)
print("ridge regression train score high alpha:", Ridge_train_score100)
print("ridge regression test score high alpha:", Ridge_test_score100)
plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
# plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.show()

In [None]:
# --- Lasso with its default alpha ---
lasso = Lasso()
lasso.fit(X_train,y_train)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
# Lasso drives some coefficients to exactly zero; count the ones that survive
coeff_used = np.sum(lasso.coef_!=0)
print("training score:", train_score )
print("test score: ", test_score)
print("number of features used: ", coeff_used)

# --- Lasso, alpha = 0.01 ---
# max_iter is given as an integer (10**6, the same value as the former float
# literal 10e5) because scikit-learn documents max_iter as an int and recent
# releases validate parameter types.
lasso001 = Lasso(alpha=0.01, max_iter=10**6)
lasso001.fit(X_train,y_train)
train_score001=lasso001.score(X_train,y_train)
test_score001=lasso001.score(X_test,y_test)
coeff_used001 = np.sum(lasso001.coef_!=0)
print("training score for alpha=0.01:", train_score001) 
print("test score for alpha =0.01: ", test_score001)
print("number of features used: for alpha =0.01:", coeff_used001)

# --- Lasso, alpha = 0.0001 ---
lasso00001 = Lasso(alpha=0.0001, max_iter=10**6)
lasso00001.fit(X_train,y_train)
train_score00001=lasso00001.score(X_train,y_train)
test_score00001=lasso00001.score(X_test,y_test)
coeff_used00001 = np.sum(lasso00001.coef_!=0)
print("training score for alpha=0.0001:", train_score00001) 
print("test score for alpha =0.0001: ", test_score00001)
print("number of features used: for alpha =0.0001:", coeff_used00001)

# --- Unregularised baseline ---
lr = LinearRegression()
lr.fit(X_train,y_train)
lr_train_score=lr.score(X_train,y_train)
lr_test_score=lr.score(X_test,y_test)
print("LR training score:", lr_train_score) 
print("LR test score: ", lr_test_score)

# Left panel: the two more strongly regularised lasso fits
plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency

plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)

# Right panel: all three lasso fits plus the linear-regression baseline
plt.subplot(1,2,2)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
# The legend label now states the alpha this model was actually fitted with (0.0001)
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.0001$') # alpha here is for transparency
plt.plot(lr.coef_,alpha=0.7,linestyle='none',marker='o',markersize=5,color='green',label='Linear Regression',zorder=2)
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import ElasticNet
EN = ElasticNet()
EN.fit(X_train,y_train)
train_score = EN.score(X_train,y_train)
test_score = EN.score(X_test,y_test)
coeff_used = np.sum(EN.coef_!=0)
print("training score:", train_score )
print("test score: ", test_score)
print("number of features used: ", coeff_used)

In [None]:
from sklearn.svm import SVR
# Support vector regression; no kernel is passed, so SVR's default kernel is used.
svr = SVR(gamma='scale', C=1.0, epsilon=0.2)

# (A stray `SVR(..., kernel='linear', ...)` expression used to sit here. It built
# a second estimator and discarded it, so it never affected `svr`.)

svr.fit(X_train,y_train)
train_score = svr.score(X_train,y_train)
test_score = svr.score(X_test,y_test)
print("training score:", train_score )
print("test score: ", test_score)
# The former "number of features used" line was computed from EN.coef_, i.e. the
# ElasticNet model, not from this SVR, so it is no longer reported here.

In [None]:
from sklearn.model_selection import GridSearchCV # Grid Search for tuning the Ridge Regression

# Prepare the candidate values to test: regularisation strength, whether to fit
# an intercept, and the solver.
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
fit_interceptOptions = ([True, False])
solverOptions = (['svd', 'cholesky', 'sparse_cg', 'sag'])
# Create a ridge regression model and grid-search every combination of the
# candidates above.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on those releases this constructor call fails and the scaling
# would need to move into a preprocessing step -- confirm the target version.
model = Ridge(normalize=True) #We have chosen to just normalize the data by default, you could GridsearchCV this is you wanted
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas, fit_intercept=fit_interceptOptions, solver=solverOptions))
grid.fit(X_train, y_train)
print(grid)
# summarize the results of the grid search: best CV score and the winning settings
print(grid.best_score_)
print(grid.best_estimator_.alpha)
print(grid.best_estimator_.fit_intercept)
print(grid.best_estimator_.solver)

In [None]:
test = pd.read_csv("data/test_data.csv")
test.head()