## Note: One-hot encoding was performed as part of the EDA in a previous notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('../data/in_process/Games_coded.csv', index_col = 'game_id')

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [4]:
df.tail()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,avg_rating,geek_rating,num_votes,age,owned,weight,...,Automatic Resource Growth,Prisoner's Dilemma,Narrative Choice / Paragraph,Contracts,Moving Multiple Units,King of the Hill,Force Commitment,Legacy Game,Bingo,Pattern Movement
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
285265,2,4,60.0,2019,7.35792,6.02424,665.0,10,1171,2.375,...,0,0,0,0,0,0,0,0,0,0
285774,1,4,90.0,2019,8.29442,7.52374,4935.0,14,9351,2.7381,...,0,0,0,0,0,0,0,0,0,0
285984,1,4,45.0,2019,7.77426,6.27306,794.0,14,1704,2.5217,...,0,0,0,0,0,0,0,0,0,0
286096,1,5,120.0,2019,7.498,7.14207,7753.0,12,10970,2.8564,...,0,0,0,0,0,0,0,0,0,0
287954,2,4,45.0,2019,7.84646,7.01513,2731.0,8,5188,2.0132,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.shape

(3836, 274)

In [6]:
df.iloc[:, :10].describe()

#Columns 11 through 273 are hot encoded categorical variables

Unnamed: 0,min_players,max_players,avg_time,year,avg_rating,geek_rating,num_votes,age,owned,weight
count,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0
mean,2.010688,5.436131,82.510167,2007.887904,6.834371,6.222357,3387.934307,10.722888,5219.259124,2.217682
std,0.681773,7.278325,273.188526,10.636066,0.698512,0.539138,6615.109703,2.427055,9229.124746,0.782007
min,1.0,1.0,1.0,1960.0,3.33177,4.27258,500.0,2.0,263.0,1.0
25%,2.0,4.0,30.0,2004.0,6.42188,5.844862,770.0,8.0,1503.0,1.59
50%,2.0,5.0,60.0,2011.0,6.848635,6.10975,1345.5,10.0,2427.0,2.1509
75%,2.0,6.0,90.0,2015.0,7.300432,6.523722,2936.25,12.0,4895.5,2.73065
max,8.0,100.0,12000.0,2020.0,9.1869,8.57686,93524.0,18.0,139881.0,4.7233


In [7]:
#Dependant variables are average rating and "geek rating"
#These are two different rating scales. They should be rather similar, so we'll use Average Rating for now
#The rest are possible inputs

X = df.drop(['avg_rating', 'geek_rating'], axis = 1)
y = df['avg_rating']
y2 = df['geek_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 42)

#Scale just the first 8 columns of training set. The rest are categorical
XScaled = X_train.iloc[:,:8]
col_names = XScaled.columns
XCats = X_train.iloc[:,8:] #columns with categorical variables
scaler = StandardScaler()
scaler.fit(XScaled)
XScaled = scaler.transform(XScaled)
X_train = pd.DataFrame(XScaled, columns = col_names, index = X_train.index)
X_train = X_train.join(XCats, how = 'left') #merge the scaled columns with the hot encoded columns
X_train.tail()

Unnamed: 0_level_0,min_players,max_players,avg_time,year,num_votes,age,owned,weight,Economic,Negotiation,...,Automatic Resource Growth,Prisoner's Dilemma,Narrative Choice / Paragraph,Contracts,Moving Multiple Units,King of the Hill,Force Commitment,Legacy Game,Bingo,Pattern Movement
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19995,0.00385,-0.062536,0.056448,-0.195614,-0.328198,0.521296,-0.431225,0.869353,0,0,...,0,0,0,0,0,0,0,0,0,0
27389,0.00385,0.347329,-0.263082,-0.195614,-0.347891,0.521296,-0.282147,-1.122347,0,0,...,0,0,0,0,0,0,0,0,0,0
8229,0.00385,-0.199157,-0.263082,-0.478652,-0.388877,-1.112889,-0.403912,-0.324924,0,0,...,0,0,0,0,0,0,0,0,0,0
232043,0.00385,-0.199157,-0.289709,0.842191,0.542281,-1.112889,0.994027,-0.149314,0,0,...,0,0,0,0,0,0,0,0,0,0
199383,1.480734,-0.062536,-0.023434,0.842191,-0.283209,-0.295797,-0.373614,0.826668,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#Now we need to scale the Test Set using the same scaler we used on the training set
#Scale just the first 8 columns of test set. The rest are categorical
XScaled = X_test.iloc[:,:8]
col_names = XScaled.columns
XCats = X_test.iloc[:,8:] #columns with categorical variables
XScaled = scaler.transform(XScaled)
X_test = pd.DataFrame(XScaled, columns = col_names, index = X_test.index)
X_test = X_test.join(XCats, how = 'left') #merge the scaled columns with the hot encoded columns
X_test.tail()


Unnamed: 0_level_0,min_players,max_players,avg_time,year,num_votes,age,owned,weight,Economic,Negotiation,...,Automatic Resource Growth,Prisoner's Dilemma,Narrative Choice / Paragraph,Contracts,Moving Multiple Units,King of the Hill,Force Commitment,Legacy Game,Bingo,Pattern Movement
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4098,-1.473033,0.074086,0.216213,-0.572998,0.786598,0.929842,0.421355,2.171559,1,0,...,0,0,0,0,0,0,0,0,0,0
1417,1.480734,0.620573,-0.183199,-1.42211,-0.417375,-0.295797,-0.398518,-0.515019,0,0,...,0,0,0,0,0,0,0,0,0,0
160851,0.00385,-0.199157,-0.263082,0.653499,0.884261,-1.112889,1.112808,-0.839705,0,0,...,0,0,0,0,0,0,0,0,0,0
1590,0.00385,0.347329,-0.103317,-4.158141,-0.430024,0.521296,-0.328627,-0.250963,1,0,...,0,0,0,0,0,0,0,0,0,0
17223,0.00385,0.074086,0.535743,-0.28996,0.30661,0.521296,0.349398,1.641652,0,0,...,0,0,0,0,0,0,0,0,0,0


Starting with a simple linear regression to use as a baseline. R^2 was .6 for the whole data set. When using this to predict, it the bottom fell out. Yuck.

In [9]:
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred))
print('R^2 = ', r2_score(y_test, y_pred))


MSE =  52795.13525294445
R^2 =  -109562.0103667838


Tried Lasso regression. alpha = 1 gave R^2 of zero, alpha = .1 was better, though not great. Only three features gave non-zero coefficients.

In [10]:
alpha = .1
lassom = Lasso(alpha = alpha)

lassom.fit(X_train, y_train)
y_pred_lasso = lassom.predict(X_test)

print('MSE = ', mean_squared_error(y_test, y_pred_lasso))
print('R^2 = ', r2_score(y_test, y_pred_lasso))

MSE =  0.2505288841281763
R^2 =  0.4800903795888407


In [11]:
print(lassom.sparse_coef_)
bestCoefs = lassom.coef_
features = X_train.columns
pd.Series(bestCoefs, index=features).sort_values(ascending=False)


  (0, 3)	0.1765339546782592
  (0, 4)	0.04894236444528477
  (0, 7)	0.28514568969174514


weight                  0.285146
year                    0.176534
num_votes               0.048942
min_players            -0.000000
Measurement Movement   -0.000000
                          ...   
Tile Placement          0.000000
Investment              0.000000
Market                  0.000000
Ownership               0.000000
Pattern Movement        0.000000
Length: 272, dtype: float64

Trying a LassoCV to locate important features. Only one gave a non-zero coefficient. And that was barely non-zero.

In [12]:
lassoc = LassoCV()
lassoc.fit(X, y)
print(lassoc.coef_)

[-0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.46293004e-05  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000

I think that the "number owned" and "number votes" is swamping the categorical features. They are highly correlated with the rating, but don't add any information. In reality, the number owned could be a dependant variable. This would answer the question, "What games do people buy?" or, at least, "What games do people admit buying?"

In [13]:
X_train2 = X_train.drop(['num_votes', 'owned'], axis = 1)
X_test2 = X_test.drop(['num_votes', 'owned'], axis = 1)

In [14]:
alpha = .1
lassom = Lasso(alpha = alpha)

lassom.fit(X_train2, y_train)
y_pred_lasso = lassom.predict(X_test2)

print('MSE = ', mean_squared_error(y_test, y_pred_lasso))
print('R^2 = ', r2_score(y_test, y_pred_lasso))
print(lassom.sparse_coef_)
bestCoefs = lassom.coef_
features = X_train2.columns
pd.Series(bestCoefs, index=features).sort_values(ascending=False)

MSE =  0.26479589197774667
R^2 =  0.45048279696903326
  (0, 3)	0.17695972516856823
  (0, 5)	0.28986931800514115


weight                  0.289869
year                    0.176960
min_players            -0.000000
Measurement Movement   -0.000000
Hidden Movement         0.000000
                          ...   
Tile Placement          0.000000
Investment              0.000000
Market                  0.000000
Ownership               0.000000
Pattern Movement        0.000000
Length: 270, dtype: float64

In [17]:
param_grid = {'alpha':np.arange(.01, 2, .05)}
lassom2 = Lasso()
lasso_cv = GridSearchCV(lassom2,param_grid,cv=5)
lasso_cv.fit(X_train,y_train)

print("Best Score:" + str(lasso_cv.best_score_))
print("Best Parameters: " + str(lasso_cv.best_params_))

Best Score:0.5288266358594442
Best Parameters: {'alpha': 0.01}


In [30]:
rf = RandomForestRegressor(random_state = 42)
rnd_param_grid = {'n_estimators':list(np.arange(1, 275))}
rfcv = RandomizedSearchCV(rf, rnd_param_grid)
search = rfcv.fit(X_train2, y_train)
search.best_params_

{'n_estimators': 263}

In [38]:
search.best_estimator_
search.best_score_
search.best_index_

3

In [42]:
srch_param_grid = {'n_estimators':(np.arange(15, 25))}
gscv = GridSearchCV(rf, srch_param_grid)
search2 = gscv.fit(X_train2, y_train)
search2.best_params_

{'n_estimators': 24}

In [45]:
print(search2.best_estimator_, search2.best_score_)

RandomForestRegressor(n_estimators=24, random_state=42) 0.5640194692619062
