## Note: Frequency encoding was performed in a previous notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('../data/in_process/Games_FreqEncodedCorrectly.csv', index_col = 'game_id')

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [4]:
df.tail(2)

Unnamed: 0_level_0,min_players,max_players,avg_time,year,avg_rating,geek_rating,age,weight,Category_average,Mechanic_average
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
286096,1,5,120.0,2019,7.498,7.14207,12,2.8564,0.013835,0.031744
287954,2,4,45.0,2019,7.84646,7.01513,8,2.0132,0.018537,0.028841


In [5]:
df.shape

(3836, 10)

In [6]:
#Dependant variables are average rating and "geek rating"
#These are two different rating scales. They should be rather similar, so we'll use Average Rating for now
#The rest are possible inputs

X = df.drop(['avg_rating', 'geek_rating'], axis = 1)
y = df['avg_rating']
y2 = df['geek_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 42)


In [7]:
X_train.head()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Category_average,Mechanic_average
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
404,3,5,20.0,1994,7,1.1,0.015402,0.017737
53093,2,2,60.0,2014,14,2.7537,0.011936,0.036353
3510,2,8,45.0,1997,12,1.0833,0.013564,0.014735
1705,2,10,30.0,1982,8,1.3211,0.056741,0.014539
24480,2,4,120.0,2006,12,2.8244,0.019916,0.004644


In [8]:
#Scale just the training set.
#XScaled = X_train.iloc[:,:6]
col_names = X_train.columns
#XCats = X_train.iloc[:,6:] #columns with categorical variables
scaler = StandardScaler()
XScaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(XScaled, columns = col_names, index = X_train.index)
#X_train = X_train.join(XCats, how = 'left') #merge the scaled columns with the encoded columns
X_train.tail()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Category_average,Mechanic_average
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19995,0.00206,-0.058063,0.044059,-0.191676,0.530695,0.871417,-1.005327,-0.627138
27389,0.00206,0.365198,-0.237885,-0.191676,0.530695,-1.123985,-0.735653,0.87945
8229,0.00206,-0.19915,-0.237885,-0.474665,-1.115016,-0.32508,-0.327719,-0.776208
232043,0.00206,-0.19915,-0.26138,0.845952,-1.115016,-0.149143,-0.096716,-0.827833
199383,1.484016,-0.058063,-0.026427,0.845952,-0.292161,0.828653,-0.241478,0.001218


In [9]:
#Now we need to scale the Test Set using the same scaler we used on the training set
#Scale just the first 6 columns of test set. The rest are categorical
#XScaled = X_test.iloc[:,:6]
col_names = X_test.columns
#XCats = X_test.iloc[:,6:] #columns with categorical variables
XScaled = scaler.transform(X_test)
X_test = pd.DataFrame(XScaled, columns = col_names, index = X_test.index)
#X_test = X_test.join(XCats, how = 'left') #merge the scaled columns with the hot encoded columns
X_test.tail()


Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Category_average,Mechanic_average
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
21985,0.00206,-0.058063,-0.237885,-0.191676,-1.937872,-1.318928,-0.724701,-1.498649
1855,1.484016,0.647372,-0.096913,-1.606623,-1.115016,-1.446321,-0.387266,-1.498649
3655,0.00206,-0.19915,-0.096913,-0.568995,-1.526444,0.136465,-0.909504,1.102444
126239,0.00206,0.083024,-0.143904,0.468633,-0.292161,0.24254,-0.576586,-1.26649
210232,-1.479895,-0.19915,1.312806,0.845952,-0.292161,1.682908,0.219843,-0.026656


In [10]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

### Trying models on scaled and frequency encoded data

In [11]:
#Trying a strait linear regression on all columns as a reference
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred))
print('R^2 = ', r2_score(y_test, y_pred))

MSE =  0.23904895765080297
R^2 =  0.49684836812487687


In [12]:
#Lasso Regression performed with .01 as reference
alpha = .01
lassom = Lasso(alpha = alpha)

lassom.fit(X_train, y_train)
y_pred_lasso = lassom.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred_lasso))
print('R^2 = ', r2_score(y_test, y_pred_lasso))

MSE =  0.23815921455635802
R^2 =  0.4987211045480725


In [13]:
print(lassom.sparse_coef_)
bestCoefs = lassom.coef_
features = X_train.columns
pd.Series(bestCoefs, index=features).sort_values(ascending=False)

  (0, 0)	-0.021334941491623842
  (0, 2)	0.006972545064107222
  (0, 3)	0.2615293157798346
  (0, 4)	0.008509052234487545
  (0, 5)	0.35645833640320373
  (0, 7)	-0.049638859469419955


weight              0.356458
year                0.261529
age                 0.008509
avg_time            0.006973
max_players        -0.000000
Category_average    0.000000
min_players        -0.021335
Mechanic_average   -0.049639
dtype: float64

In [14]:
#Finding the best coefficient for Alpha
lassoc = LassoCV()
lassoc.fit(X_train,y_train)
print(lassoc.alpha_)
pd.Series(lassoc.coef_, index=features).sort_values(ascending=False)


0.006152578677625733


weight              0.357083
year                0.264933
age                 0.010490
avg_time            0.010460
Category_average    0.001189
max_players        -0.001334
min_players        -0.024470
Mechanic_average   -0.054061
dtype: float64

In [15]:
y_pred_lassoc = lassoc.predict(X_test)

In [16]:
print('MSE lassoCV = ', mean_squared_error(y_test, y_pred_lassoc))
print('R^2 lassoCV = ', r2_score(y_test, y_pred_lassoc))

MSE lassoCV =  0.2382827860368411
R^2 lassoCV =  0.4984610105795846


In [17]:
#Using grid search as an alternate to find best Alpha for Lasso Regression
param_grid = {'alpha':np.arange(.004, .051, .001)}
lassom2 = Lasso()
lasso_cv = GridSearchCV(lassom2,param_grid,cv=5, n_jobs =-1)
lasso_cv.fit(X_train,y_train)

print("Best Score:" + str(lasso_cv.best_score_))
print("Best Parameters: " + str(lasso_cv.best_params_))

Best Score:0.48478841863905486
Best Parameters: {'alpha': 0.006}


In [25]:
#Trying a random forest regression
rf = RandomForestRegressor(random_state = 42)
rf_param_grid = {'n_estimators':list(np.arange(1, 8))}
rfcv = GridSearchCV(rf, rf_param_grid, n_jobs = -1)
search = rfcv.fit(X_train, y_train)
search.best_params_

{'n_estimators': 7}

In [26]:
print(search.best_estimator_)
print(search.best_score_)
print(search.best_index_)

RandomForestRegressor(n_estimators=7, random_state=42)
0.4741338997382617
6


In [27]:
y_pred_rf = search.predict(X_test)
print('MSE = ', mean_squared_error(y_test, y_pred_rf))
print('R^2 = ', r2_score(y_test, y_pred_rf))

MSE =  0.24622187104665785
R^2 =  0.4817507784265551


In [21]:
y_pred_lassocv = lasso_cv.predict(X_test)

In [22]:
ridgemodel = RidgeCV(alphas = [.0001, .001, .01, 1])
ridgemodel.fit(X_train, y_train)
print(ridgemodel.alpha_)
print(ridgemodel.best_score_)
pd.Series(ridgemodel.coef_, index=features).sort_values(ascending=False)

1.0
-0.2525520595509226


weight              0.359340
year                0.269700
avg_time            0.016254
age                 0.013141
Category_average    0.009994
max_players        -0.006575
min_players        -0.029338
Mechanic_average   -0.062645
dtype: float64

In [23]:
y_pred_ridge = ridgemodel.predict(X_test)


In [28]:
print('MSE simple linear = ', mean_squared_error(y_test, y_pred))
print('R^2 simple linear = ', r2_score(y_test, y_pred))
print('MSE RF = ', mean_squared_error(y_test, y_pred_rf))
print('R^2 RF = ', r2_score(y_test, y_pred_rf))
print('MSE LassoCV = ', mean_squared_error(y_test, y_pred_lassocv))
print('R^2 LassoCV = ', r2_score(y_test, y_pred_lassocv))
print('MSE lasso grid search = ', mean_squared_error(y_test, y_pred_lassoc))
print('R^2 lasso grid search = ', r2_score(y_test, y_pred_lassoc))
print('MSE RidgeCV = ', mean_squared_error(y_test, y_pred_ridge))
print('R^2 RidgeCV = ', r2_score(y_test, y_pred_ridge))

MSE simple linear =  0.23904895765080297
R^2 simple linear =  0.49684836812487687
MSE RF =  0.24622187104665785
R^2 RF =  0.4817507784265551
MSE LassoCV =  0.2382930217628136
R^2 LassoCV =  0.4984394663642192
MSE lasso grid search =  0.2382827860368411
R^2 lasso grid search =  0.4984610105795846
MSE RidgeCV =  0.2390464745676192
R^2 RidgeCV =  0.496853594532757


Lasso regression eliminates two of the factors. It also has the best MSE and R-squared.



It may have been incorrect to scale the categorical frequencies. Running Lasso and Ridge again on unscaled values.