## Note: Frequency encoding was performed as part of the EDA in a previous notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('../data/in_process/Games_FreqEncoded.csv', index_col = 'game_id')

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [4]:
df.tail(2)

Unnamed: 0_level_0,min_players,max_players,avg_time,year,avg_rating,geek_rating,age,weight,Area Majority / Influence,Auction/Bidding,...,Video Game Theme,unknowncat_,Environmental,Pike and Shot,Medical,Vietnam War,Math,Mature / Adult,Korean War,Expansion for Base-game
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
286096,1,5,120.0,2019,7.498,7.14207,12,2.8564,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287954,2,4,45.0,2019,7.84646,7.01513,8,2.0132,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.shape

(3836, 276)

In [6]:
df.iloc[:, :8].describe()

#Columns 8 through 276 are Frequency encoded categorical variables

Unnamed: 0,min_players,max_players,avg_time,year,avg_rating,geek_rating,age,weight
count,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0
mean,2.010688,5.436131,82.510167,2007.887904,6.834371,6.222357,10.722888,2.217682
std,0.681773,7.278325,273.188526,10.636066,0.698512,0.539138,2.427055,0.782007
min,1.0,1.0,1.0,1960.0,3.33177,4.27258,2.0,1.0
25%,2.0,4.0,30.0,2004.0,6.42188,5.844862,8.0,1.59
50%,2.0,5.0,60.0,2011.0,6.848635,6.10975,10.0,2.1509
75%,2.0,6.0,90.0,2015.0,7.300432,6.523722,12.0,2.73065
max,8.0,100.0,12000.0,2020.0,9.1869,8.57686,18.0,4.7233


In [7]:
#Dependant variables are average rating and "geek rating"
#These are two different rating scales. They should be rather similar, so we'll use Average Rating for now
#The rest are possible inputs

X = df.drop(['avg_rating', 'geek_rating'], axis = 1)
y = df['avg_rating']
y2 = df['geek_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 42)


In [8]:
X_train.head()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,...,Video Game Theme,unknowncat_,Environmental,Pike and Shot,Medical,Vietnam War,Math,Mature / Adult,Korean War,Expansion for Base-game
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404,3,5,20.0,1994,7,1.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53093,2,2,60.0,2014,14,2.7537,0.0,0.0,0.073673,0.080517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3510,2,8,45.0,1997,12,1.0833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1705,2,10,30.0,1982,8,1.3211,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24480,2,4,120.0,2006,12,2.8244,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X_train.iloc[:,6:] #columns with categorical variables


Unnamed: 0_level_0,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,Simultaneous Action Selection,Trick-taking,Set Collection,Tile Placement,Investment,Market,...,Video Game Theme,unknowncat_,Environmental,Pike and Shot,Medical,Vietnam War,Math,Mature / Adult,Korean War,Expansion for Base-game
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404,0.000000,0.0,0.000000,0.000000,0.017737,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53093,0.000000,0.0,0.073673,0.080517,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3510,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1705,0.000000,0.0,0.000000,0.000000,0.017737,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24480,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.027863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27389,0.000000,0.0,0.073673,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8229,0.039385,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
232043,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.027863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Scale just the numerical part of the training set. Leaving the categoricals alone
XScaled = X_train.iloc[:,:6]
col_names = XScaled.columns
XCats = X_train.iloc[:,6:] #columns with categorical variables
scaler = StandardScaler()
XScaled = scaler.fit_transform(XScaled)
X_train = pd.DataFrame(XScaled, columns = col_names, index = X_train.index)
X_train = X_train.join(XCats, how = 'left') #merge the scaled columns with the encoded columns
X_train.tail()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,...,Video Game Theme,unknowncat_,Environmental,Pike and Shot,Medical,Vietnam War,Math,Mature / Adult,Korean War,Expansion for Base-game
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19995,0.00206,-0.058063,0.044059,-0.191676,0.530695,0.871417,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27389,0.00206,0.365198,-0.237885,-0.191676,0.530695,-1.123985,0.0,0.0,0.073673,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8229,0.00206,-0.19915,-0.237885,-0.474665,-1.115016,-0.32508,0.039385,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
232043,0.00206,-0.19915,-0.26138,0.845952,-1.115016,-0.149143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199383,1.484016,-0.058063,-0.026427,0.845952,-0.292161,0.828653,0.039385,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
X_test.iloc[:,:6]

Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19370,3,8,60.0,2005,12,1.2793
28037,2,6,30.0,2007,10,1.4925
6778,2,4,90.0,2003,12,2.5400
21892,2,5,75.0,2006,12,2.6708
169786,1,5,115.0,2016,14,3.3881
...,...,...,...,...,...,...
21985,2,5,30.0,2006,6,1.1944
1855,3,10,60.0,1991,8,1.0952
3655,2,4,60.0,2002,7,2.3277
126239,2,6,50.0,2013,10,2.4103


In [12]:
#Now we need to scale the Test Set using the same scaler we used on the training set
#Scale just the first 6 columns of test set. The rest are categorical
XScaled = X_test.iloc[:,:6]
col_names = XScaled.columns
XCats = X_test.iloc[:,6:] #columns with categorical variables
XScaled = scaler.transform(XScaled)
X_test = pd.DataFrame(XScaled, columns = col_names, index = X_test.index)
X_test = X_test.join(XCats, how = 'left') #merge the scaled columns with the hot encoded columns
X_test.tail()


Unnamed: 0_level_0,min_players,max_players,avg_time,year,age,weight,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,...,Video Game Theme,unknowncat_,Environmental,Pike and Shot,Medical,Vietnam War,Math,Mature / Adult,Korean War,Expansion for Base-game
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21985,0.00206,-0.058063,-0.237885,-0.191676,-1.937872,-1.318928,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1855,1.484016,0.647372,-0.096913,-1.606623,-1.115016,-1.446321,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3655,0.00206,-0.19915,-0.096913,-0.568995,-1.526444,0.136465,0.0,0.0,0.0,0.080517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126239,0.00206,0.083024,-0.143904,0.468633,-0.292161,0.24254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210232,-1.479895,-0.19915,1.312806,0.845952,-0.292161,1.682908,0.0,0.0,0.073673,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

### Trying models on scaled and frequency encoded data

In [14]:
#Trying a strait linear regression on all columns as a reference
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred))
print('R^2 = ', r2_score(y_test, y_pred))

MSE =  0.22861230798774104
R^2 =  0.5188154888389083


In [15]:
#Lasso Regression performed with .01 as reference
alpha = .01
lassom = Lasso(alpha = alpha)

lassom.fit(X_train, y_train)
y_pred_lasso = lassom.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred_lasso))
print('R^2 = ', r2_score(y_test, y_pred_lasso))

MSE =  0.2414009893396125
R^2 =  0.49189779819110113


In [16]:
print(lassom.sparse_coef_)
bestCoefs = lassom.coef_
features = X_train.columns
pd.Series(bestCoefs, index=features).sort_values(ascending=False)

  (0, 0)	-0.017826213908531296
  (0, 2)	0.007657361623842103
  (0, 3)	0.25906356811076753
  (0, 4)	0.00810477033363821
  (0, 5)	0.35949456935047


weight                          0.359495
year                            0.259064
age                             0.008105
avg_time                        0.007657
Singing                        -0.000000
                                  ...   
Resource to Move                0.000000
Mancala                        -0.000000
Real-Time                       0.000000
Victory Points as a Resource    0.000000
min_players                    -0.017826
Length: 274, dtype: float64

In [17]:
#Finding the best coefficient for Alpha
lassoc = LassoCV()
lassoc.fit(X_train,y_train)
print(lassoc.alpha_)
pd.Series(lassoc.coef_, index=features).sort_values(ascending=False)


0.0053511991683924225


weight                          0.360389
year                            0.263049
avg_time                        0.011888
age                             0.010574
Singing                        -0.000000
                                  ...   
Resource to Move                0.000000
Mancala                        -0.000000
Real-Time                       0.000000
Victory Points as a Resource    0.000000
min_players                    -0.021389
Length: 274, dtype: float64

In [18]:
y_pred_lassoc = lassoc.predict(X_test)

In [19]:
print('MSE lassoCV = ', mean_squared_error(y_test, y_pred_lassoc))
print('R^2 lassoCV = ', r2_score(y_test, y_pred_lassoc))

MSE lassoCV =  0.24163291817221233
R^2 lassoCV =  0.49140963303970997


In [20]:
#Using grid search as an alternate to find best Alpha for Lasso Regression
param_grid = {'alpha':np.arange(.004, .051, .001)}
lassom2 = Lasso()
lasso_cv = GridSearchCV(lassom2,param_grid,cv=5, n_jobs =-1)
lasso_cv.fit(X_train,y_train)

print("Best Score:" + str(lasso_cv.best_score_))
print("Best Parameters: " + str(lasso_cv.best_params_))

Best Score:0.47820811348839537
Best Parameters: {'alpha': 0.006}


In [21]:
#Trying a random forest regression with random regressors search
rf = RandomForestRegressor(random_state = 42)
rnd_param_grid = {'n_estimators':list(np.arange(4, 270))}
rfcv = RandomizedSearchCV(rf, rnd_param_grid, n_jobs = -1)
search = rfcv.fit(X_train, y_train)
search.best_params_

{'n_estimators': 269}

In [22]:
print(search.best_estimator_)
print(search.best_score_)
print(search.best_index_)

RandomForestRegressor(n_estimators=269, random_state=42)
0.5793818838538545
6


In [23]:
#Narrowing down with grid search
srch_param_grid = {'n_estimators':(np.arange(260, 270))}
gscv = GridSearchCV(rf, srch_param_grid, n_jobs = -1)
search2 = gscv.fit(X_train, y_train)
search2.best_params_

{'n_estimators': 265}

In [24]:
print(search2.best_estimator_, search2.best_score_)

RandomForestRegressor(n_estimators=265, random_state=42) 0.5793897175527123


In [25]:
y_pred_rf = search2.predict(X_test)
print('MSE = ', mean_squared_error(y_test, y_pred_rf))
print('R^2 = ', r2_score(y_test, y_pred_rf))

MSE =  0.19444961436306663
R^2 =  0.5907213244276754


In [26]:
y_pred_lassocv = lasso_cv.predict(X_test)

In [27]:
ridgemodel = RidgeCV(alphas = [.0001, .001, .01, 1])
ridgemodel.fit(X_train, y_train)
print(ridgemodel.alpha_)
print(ridgemodel.best_score_)
pd.Series(ridgemodel.coef_, index=features).sort_values(ascending=False)

0.0001
-0.2302423129765688


Race                    88.385153
Communication Limits    77.710713
Line Drawing            69.808202
Traitor Game            64.607265
Sports                  59.849325
                          ...    
Video Game Theme       -42.040301
Memorycat_             -46.256849
Zombies                -49.085767
Rock-Paper-Scissors    -65.043748
Trivia                 -76.510057
Length: 274, dtype: float64

In [28]:
y_pred_ridge = ridgemodel.predict(X_test)


In [29]:
print('MSE simple linear = ', mean_squared_error(y_test, y_pred))
print('R^2 simple linear = ', r2_score(y_test, y_pred))
print('MSE RF = ', mean_squared_error(y_test, y_pred_rf))
print('R^2 RF = ', r2_score(y_test, y_pred_rf))
print('MSE LassoCV = ', mean_squared_error(y_test, y_pred_lassocv))
print('R^2 LassoCV = ', r2_score(y_test, y_pred_lassocv))
print('MSE lasso grid search = ', mean_squared_error(y_test, y_pred_lassoc))
print('R^2 lasso grid search = ', r2_score(y_test, y_pred_lassoc))
print('MSE RidgeCV = ', mean_squared_error(y_test, y_pred_ridge))
print('R^2 RidgeCV = ', r2_score(y_test, y_pred_ridge))

MSE simple linear =  0.22861230798774104
R^2 simple linear =  0.5188154888389083
MSE RF =  0.19444961436306663
R^2 RF =  0.5907213244276754
MSE LassoCV =  0.2415865644630485
R^2 LassoCV =  0.4915071985954791
MSE lasso grid search =  0.24163291817221233
R^2 lasso grid search =  0.49140963303970997
MSE RidgeCV =  0.22172086184745826
R^2 RidgeCV =  0.5333206446259833


The RandomForest regression gives the best scores, but it uses 265 of the features.
Ridge regression also gives a better score, but it's also using all the features. I question its predictive value.
The conclusion of the Lasso regression is that the Categorical features are meaningless.
I tend to like this. It shows that there's no magic bullet in game design. A game has to be good, independent of its mechanics.

QUESTION: How to interpret ridge regression?