# Feature Engineering, Preprocessing, and Modeling

In [50]:
# import packages and data
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import tree, metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso

pd.set_option('max_colwidth', 120)

# Load the cleaned dataset; missing values were imputed upstream with KNN, which
# can produce fractional values. Age is recorded as a whole number and player
# counts must be whole numbers, so round those three columns to 0 decimals.
bgg = pd.read_csv('bgg_clean_impute.csv', index_col=0)
bgg = bgg.round({'age': 0, 'min_players': 0, 'max_players': 0})

In [2]:
# Sanity check: display the dtype pandas holds for every column so that any
# column with an unexpected type is caught before feature engineering.
bgg.dtypes

avg_rating       float64
geek_rating      float64
num_voters         int64
title             object
full_game_url     object
rank               int64
game_id            int64
category          object
mechanic          object
family            object
age              float64
max_play_time    float64
max_players      float64
min_play_time    float64
min_players      float64
weight           float64
dtype: object

In [3]:
# Count the null values in each column to see which ones still need handling.
bgg.isnull().sum()

avg_rating          0
geek_rating         0
num_voters          0
title               0
full_game_url       0
rank                0
game_id             0
category          211
mechanic         1567
family           4614
age                 0
max_play_time       0
max_players         0
min_play_time       0
min_players         0
weight              0
dtype: int64

In [4]:
# Fill nulls with the string "None" in the categorical label columns only.
# A frame-wide fillna('None') would also rewrite any numeric column that
# happened to contain a null, silently turning it into an object column.
label_cols = ['category', 'mechanic', 'family']
bgg[label_cols] = bgg[label_cols].fillna('None')

# confirming categorical nulls are replaced
bgg[label_cols].isnull().sum()

category    0
mechanic    0
family      0
dtype: int64

The family label is sparse, with 2,748 different families. Using it as a feature would increase dimensionality significantly, and many of the families are unique or specific to a single game or set of games, so it would not generalize well to new data. However, as we saw in EDA, Kickstarter games have a statistically higher average Geek Rating, so I do want to create a column indicating whether or not a game was on Kickstarter.

In [5]:
# Create a 0/1 `kickstarter` flag from the family label, then drop `family`.
# - regex=False: 'Kickstarter' is a literal substring, not a pattern.
# - na=False: a missing family counts as "not Kickstarter" instead of
#   propagating NaN into the flag.
# - astype('int64'): explicit cast rather than relying on replace({True: 1,
#   False: 0}) to downcast the result to integers.
bgg['kickstarter'] = (
    bgg['family']
    .str.contains('Kickstarter', regex=False, na=False)
    .astype('int64')
)
bgg = bgg.drop(columns=['family'])

In [6]:
# One indicator column per category; a game's categories are comma-separated.
cat_dummy = bgg['category'].str.get_dummies(sep=',').add_prefix('cat_')

# Three mechanic names contain commas themselves. Strip those commas first so
# that splitting on ',' does not break a single mechanic into several pieces.
bgg['mechanic'] = (
    bgg['mechanic']
    .str.replace('Deck, Bag, and Pool Building', 'Deck Bag and Pool Building')
    .str.replace('I Cut, You Choose', 'I Cut You Choose')
    .str.replace('Worker Placement, Different Worker Types', 'Worker Placement Different Worker Types')
)
mech_dummy = bgg['mechanic'].str.get_dummies(sep=',').add_prefix('mech_')

# Append the indicator columns and remove the raw text columns they replace.
bgg = (
    pd.concat([bgg, cat_dummy, mech_dummy], axis=1)
    .drop(columns=['category', 'mechanic'])
)

### Train-Test-Split, Scaling and Dimensionality Reduction

The target variable will be based on Geek Rating, specifically whether a game is in the top 1,000. Since Geek Rating is derived from average rating, we will drop average rating. Number of voters correlates with Geek Rating, but in this context I want the model to predict the rating of a new game, and any value for number of voters for a new game would be arbitrary. Title, game URL, rank, and game id are not relevant to the model and can be dropped.

In [7]:
# Binary response: 1 if the game is ranked in the top 1000, else 0.
# The boolean mask is cast directly instead of calling
# bgg['top_1000'].replace(..., inplace=True): an in-place method on a column
# selection is chained assignment, which pandas warns about and which no longer
# updates the parent frame once Copy-on-Write is enabled.
bgg['top_1000'] = (bgg['rank'] <= 1000).astype('int64')

In [23]:
# Response: the binary top-1000 indicator.
y = bgg['top_1000']

# Features: every remaining column except the ratings, the vote count, the
# identifier/text columns, and the response itself.
X = bgg.drop(
    columns=[
        'avg_rating',
        'geek_rating',
        'num_voters',
        'title',
        'full_game_url',
        'rank',
        'game_id',
        'top_1000',
    ]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=23,
)

In [24]:
# Standardize the features. The scaler learns its statistics from the training
# set only and then applies them to both sets, so no information from the test
# set leaks into preprocessing.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

In [27]:
# Fit a PCA that keeps every component on the training data, and record the
# share of variance each component explains. This is used to decide how many
# components to keep.
pca = PCA().fit(X_train)
explained_variance = pca.explained_variance_ratio_

# function to determine the number of components needed to reach desired explained variance
def select_n_components(var_ratio, goal_var: float) -> int:
    """Return how many leading components are needed to reach `goal_var`.

    Walks `var_ratio` in order, accumulating the explained-variance ratios, and
    stops at the first component whose running total is at least `goal_var`.

    Parameters
    ----------
    var_ratio : iterable of float
        Explained-variance ratio of each component, in the order the components
        should be kept.
    goal_var : float
        Cumulative explained variance to reach, as a fraction (0.95 for 95%).

    Returns
    -------
    int
        Number of components consumed. If the running total never reaches
        `goal_var`, this is the total number of entries in `var_ratio`; if
        `var_ratio` is empty, it is 0.
    """
    running_total = 0.0
    kept = 0

    for kept, ratio in enumerate(var_ratio, start=1):
        running_total += ratio
        if running_total >= goal_var:
            # goal reached: `kept` already counts this component
            break

    return kept

# Target share of variance to retain, expressed as a fraction (0.95 == 95%).
goal_var = 0.95
components = select_n_components(explained_variance, goal_var)

# goal_var is a fraction, so format it as a percentage; interpolating it
# directly followed by '%' reported "0.95%" instead of "95%".
print(f'{components} components are needed to explain {goal_var:.0%} of the variance')

413 components are needed to explain 0.95% of the variance


In [28]:
# Refit PCA with only the selected number of components. It is fitted on the
# training set alone, and the same projection is then applied to both sets.
pca = PCA(n_components=components).fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [29]:
# Linear discriminant analysis keeps at most (number of classes - 1)
# components, so with the two classes in this data set it projects everything
# onto a single component. It is fitted on the training set and its labels only.
lda = LDA().fit(X_train, y_train)

X_train, X_test = lda.transform(X_train), lda.transform(X_test)

### Modeling
Trying some out of the box models to get a baseline

In [77]:
# Baseline 1: logistic regression with default hyperparameters.
logreg = LogisticRegression().fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# NOTE(review): precision is computed for class 1 (top 1000) but recall for
# class 0 (not top 1000), so the two scores describe different classes.
print('Accuray:', metrics.accuracy_score(y_true=y_test, y_pred=logreg_pred))
print("Balanced Accuracy:", metrics.balanced_accuracy_score(y_true=y_test, y_pred=logreg_pred))
print('Precision Score:', metrics.precision_score(y_true=y_test, y_pred=logreg_pred, pos_label=1))
print('Recall Score:', metrics.recall_score(y_true=y_test, y_pred=logreg_pred, pos_label=0))

Accuray: 0.9471608832807571
Balanced Accuracy: 0.5990882205324425
Precision Score: 0.5308641975308642
Recall Score: 0.9894385769872152


In [78]:
# Baseline 2: decision tree with default hyperparameters (seeded).
# `DecisionTreeClassifier` is never imported by name in this notebook, so it is
# reached through the already-imported `sklearn.tree` module. The estimator is
# bound to `tree_clf` rather than `tree` so it does not shadow that module,
# which would make this cell fail when run a second time.
tree_clf = tree.DecisionTreeClassifier(random_state=23)
tree_clf.fit(X_train, y_train)
tree_pred = tree_clf.predict(X_test)
leaves = tree_clf.get_n_leaves()  # number of leaves in the fitted tree

# NOTE(review): precision is computed for class 1 (top 1000) but recall for
# class 0 (not top 1000), so the two scores describe different classes.
print('Accuray:', metrics.accuracy_score(y_test, tree_pred))
print('Balanced Accuracy:', metrics.balanced_accuracy_score(y_test, tree_pred))
print('Precision Score:', metrics.precision_score(y_test, tree_pred, pos_label=1))
print('Recall Score:', metrics.recall_score(y_test, tree_pred, pos_label=0))

Accuray: 0.9216614090431126
Balanced Accuracy: 0.6245082219355953
Precision Score: 0.2830188679245283
Recall Score: 0.9577543079488605


In [82]:
# Baseline 3: random forest with trees limited to depth 5 (seeded).
rf = RandomForestClassifier(max_depth=5, random_state=23).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# NOTE(review): precision is computed for class 1 (top 1000) but recall for
# class 0 (not top 1000), so the two scores describe different classes.
print('Accuray:', metrics.accuracy_score(y_true=y_test, y_pred=rf_pred))
print('Balanced Accuracy:', metrics.balanced_accuracy_score(y_true=y_test, y_pred=rf_pred))
print('Precision Score:', metrics.precision_score(y_true=y_test, y_pred=rf_pred, pos_label=1))
print('Recall Score:', metrics.recall_score(y_true=y_test, y_pred=rf_pred, pos_label=0))

Accuray: 0.9458464773922187
Balanced Accuracy: 0.5709347695861239
Precision Score: 0.5
Recall Score: 0.9913841022790439


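Accuracy and Recall are high, but Precision is poor across the board. This makes sense: only about 5% of the games in the dataset are in the top 1000, and that imbalance is reflected in the much lower Balanced Accuracy. Note that the Recall Score above is computed with `pos_label = 0`, so it measures how well games *outside* the top 1000 are identified, while Precision is computed for the top-1000 class (`pos_label = 1`). Because Balanced Accuracy is the mean of the recall of the two classes, values of roughly 0.57–0.62 imply that recall for the top-1000 class itself is only about 0.15–0.29. Basically, too many of the games the models predict to be in the top 1000 do not belong there (precision of roughly 0.3–0.5), and most of the true top-1000 games are missed.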

#### Notes on Next Steps

Use:
- pipelines
- gridsearchcv

Try:
- to see if I can keep the interpretability
- binary encoding? Less interpretability, but maybe more accuracy
- limiting label volume with value counts
- remove outliers outside 3 stds

How to increase precision?
- Try more models
- Oversample
- Synthetic samples with the Synthetic Minority Oversampling Technique (SMOTE)