# Importing Libraries & Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelBinarizer
from sklearn.metrics import confusion_matrix
import pickle


import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/final_dataset.csv') # importing dataset
df.shape

(19036, 199)

In [3]:
df.keys() # getting a list of feature names

Index(['name', 'yearpublished', 'minplayers', 'maxplayers', 'minplaytime',
       'maxplaytime', 'minage', 'languagedependence', 'boardgamecategory_cnt',
       'boardgamemechanic_cnt',
       ...
       'Category Transportation', 'Category Travel', 'Category Trivia',
       'Category Video Game Theme', 'Category Vietnam War', 'Category Wargame',
       'Category Word Game', 'Category World War I', 'Category World War II',
       'Category Zombies'],
      dtype='object', length=199)

In [4]:
# creating lists of the least impactful mechanics and categories, based on the models in the previous notebook
mech_list = [
    'Mechanic Action Drafting', 'Mechanic Action Queue',
    'Mechanic Action Retrieval', 'Mechanic Action Timer',
    'Mechanic Action/Event', 'Mechanic Advantage Token',
    'Mechanic Area Movement', 'Mechanic Area-Impulse',
    'Mechanic Auction/Bidding', 'Mechanic Auction: Sealed Bid',
    'Mechanic Auction: Turn Order Until Pass',
    'Mechanic Automatic Resource Growth', 'Mechanic Bias', 'Mechanic Bribery',
    'Mechanic Card Drafting', 'Mechanic Catch the Leader', 'Mechanic Chaining',
    'Mechanic Chit-Pull System', 'Mechanic Command Cards',
    'Mechanic Commodity Speculation', 'Mechanic Communication Limits',
    'Mechanic Connections', 'Mechanic Critical Hits and Failures',
    'Mechanic Dice Rolling', 'Mechanic Different Dice Movement',
    'Mechanic Drafting', 'Mechanic Elapsed Real Time Ending',
    'Mechanic Enclosure', 'Mechanic Events', 'Mechanic Flicking',
    'Mechanic Follow', 'Mechanic Grid Coverage', 'Mechanic Grid Movement',
    'Mechanic Hand Management', 'Mechanic Hidden Movement',
    'Mechanic Hidden Roles', 'Mechanic Highest-Lowest Scoring',
    'Mechanic Increase Value of Unchosen Resources', 'Mechanic Investment',
    'Mechanic Legacy Game', 'Mechanic Line Drawing', 'Mechanic Mancala',
    'Mechanic Market', 'Mechanic Measurement Movement',
    'Mechanic Melding and Splaying', 'Mechanic Minimap Resolution',
    'Mechanic Modular Board', 'Mechanic Move Through Deck',
    'Mechanic Multiple Maps', 'Mechanic Negotiation', 'Mechanic Ownership',
    'Mechanic Pattern Building', 'Mechanic Physical Removal',
    'Mechanic Pick-up and Deliver', 'Mechanic Player Judge',
    'Mechanic Point to Point Movement', 'Mechanic Race', 'Mechanic Real-Time',
    'Mechanic Resource to Move', 'Mechanic Role Playing',
    'Mechanic Roles with Asymmetric Information', 'Mechanic Rondel',
    'Mechanic Score-and-Reset Game', 'Mechanic Secret Unit Deployment',
    'Mechanic Set Collection', 'Mechanic Simulation',
    'Mechanic Simultaneous Action Selection', 'Mechanic Singing',
    'Mechanic Solo / Solitaire Game', 'Mechanic Square Grid',
    'Mechanic Stacking and Balancing', 'Mechanic Stock Holding',
    'Mechanic Take That', 'Mechanic Team-Based Game',
    'Mechanic Tile Placement', 'Mechanic Time Track', 'Mechanic Trading',
    'Mechanic Traitor Game', 'Mechanic Trick-taking', 'Mechanic Voting',
    'Mechanic Worker Placement'
]

cat_list = [
    'Category Abstract Strategy', 'Category Adventure',
    'Category American Civil War', 'Category American Indian Wars',
    'Category American West', 'Category Ancient', 'Category Animals',
    'Category Arabian', 'Category Aviation / Flight', 'Category Bluffing',
    'Category Book', 'Category City Building', 'Category Civil War',
    'Category Collectible Components', 'Category Comic Book / Strip',
    'Category Dice', 'Category Educational', 'Category Electronic',
    'Category Environmental', 'Category Exploration', 'Category Fantasy',
    'Category Farming', 'Category Fighting', 'Category Game System',
    'Category Horror', 'Category Industry / Manufacturing', 'Category Mafia',
    'Category Maze', 'Category Memory', 'Category Modern Warfare',
    'Category Movies / TV / Radio theme', 'Category Music',
    'Category Mythology', 'Category Nautical', 'Category Negotiation',
    'Category Novel-based', 'Category Number', 'Category Pirates',
    'Category Political', 'Category Post-Napoleonic', 'Category Prehistoric',
    'Category Print & Play', 'Category Real-time', 'Category Religious',
    'Category Science Fiction', 'Category Space Exploration',
    'Category Spies/Secret Agents', 'Category Sports',
    'Category Transportation', 'Category Travel', 'Category Trivia',
    'Category Video Game Theme', 'Category World War I',
    'Category World War II', 'Category Zombies'
]

In [5]:
df['Mechanic Other'] = df[mech_list].sum(axis = 1) # summing the less impactful mechanics into a single column

In [6]:
df['Mechanic Other'].value_counts()

1    10365
0     8671
Name: Mechanic Other, dtype: int64

In [7]:
df['Category Other'] = df[cat_list].sum(axis = 1) # summing the less impactful categories into a single column

In [8]:
df['Category Other'].value_counts()

0    9646
1    9390
Name: Category Other, dtype: int64

In [9]:
df.drop(columns = mech_list, inplace = True) # dropping the less impactful mechanics and categories
df.drop(columns = cat_list, inplace = True)
df.shape

(19036, 65)

65 features is much more manageable than the 199 that were originally in the model. Hopefully that's enough to find a good production model.

In [10]:
# converting the avgweight feature to categorical data for better modeling
# using the categories found here: https://boardgamegeek.com/wiki/page/Weight
conversion_dict = {1: 'light',
                   2: 'medium light',
                   3: 'medium',
                   4: 'medium heavy',
                   5: 'heavy'}

In [11]:
df['avgweight'] = np.round(df['avgweight']) # rounding avgweight
df['avgweight'] = df['avgweight'].map(conversion_dict) # mapping the conversion dictionary to avgweight

In [12]:
df.drop(columns=[
    'yearpublished', 'boardgamecategory', 'boardgamemechanic', 'languagedependence',
    'description'],
        inplace=True)

In [13]:
# reorganizing the columns to make the flask app a bit more intuitive
df = df[[
    'name', 'minplayers', 'maxplayers', 'minplaytime', 'maxplaytime', 'minage',
    'boardgamecategory_cnt', 'boardgamemechanic_cnt', 'avgweight',
    'Mechanic Acting', 'Mechanic Action Points', 'Mechanic Alliances',
    'Mechanic Area Majority / Influence', 'Mechanic Betting and Bluffing',
    'Mechanic Campaign / Battle Card Driven', 'Mechanic Contracts',
    'Mechanic Cooperative Game', 'Mechanic Crayon Rail System',
    'Mechanic End Game Bonuses', 'Mechanic Hexagon Grid', 'Mechanic Memory',
    'Mechanic Network and Route Building', 'Mechanic Paper-and-Pencil',
    'Mechanic Pattern Recognition', 'Mechanic Player Elimination',
    'Mechanic Push Your Luck', 'Mechanic Rock-Paper-Scissors',
    'Mechanic Roll / Spin and Move', 'Mechanic Storytelling',
    'Mechanic Variable Phase Order', 'Mechanic Variable Player Powers',
    'Mechanic Other', 'Category Action / Dexterity', 'Category Age of Reason',
    'Category American Revolutionary War', 'Category Card Game',
    'Category Civilization', 'Category Deduction', 'Category Economic',
    'Category Expansion for Base-game', 'Category Humor',
    'Category Korean War', 'Category Math', 'Category Mature / Adult',
    'Category Medical', 'Category Medieval', 'Category Miniatures',
    'Category Murder/Mystery', 'Category Napoleonic', 'Category Party Game',
    'Category Pike and Shot', 'Category Puzzle', 'Category Racing',
    'Category Renaissance', 'Category Territory Building', 'Category Trains',
    'Category Vietnam War', 'Category Wargame', 'Category Word Game',
    'Category Other'
]]

In [14]:
X = df._get_numeric_data()
y = df['avgweight']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 1919)

# Modeling

## Base Model

In [16]:
y.value_counts(normalize = True)

medium light    0.441007
light           0.299170
medium          0.204245
medium heavy    0.053215
heavy           0.002364
Name: avgweight, dtype: float64

A base model would be 44% accurate if it guessed "medium light" each time. The goal will be to find a production model that can beat that.

## Logistic Regression Model

In [17]:
pipe = Pipeline(steps = [           # running a pipeline of a Logistic Regression
    ('lr', LogisticRegression())
])

params = {                          # setting parameters
    'lr__penalty' : ['l1'],
    'lr__C' : [1],
    'lr__solver' : ['liblinear']
}

gs_lr = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy',
                    n_jobs = -2)

gs_lr.fit(X_train, y_train)
gs_lr.best_estimator_

Pipeline(memory=None,
         steps=[('lr',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [18]:
gs_lr.score(X_train, y_train) # scoring the model

0.5959235133431393

In [19]:
gs_lr.best_score_ # cross-val score

0.5915810141920165

In [20]:
gs_lr.score(X_test, y_test) #test score

0.5879386425719689

## KNN Model

In [21]:
pipe = Pipeline(steps = [                 # setting a KNN model with a standard scaler
    ('sc', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

params = {                                # setting model paramenters
    'knn__n_neighbors' : [21],
    'knn__p' : [1]
}

gs_knn = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy')

gs_knn.fit(X_train, y_train)
gs_knn.best_estimator_

Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=21, p=1,
                                      weights='uniform'))],
         verbose=False)

In [22]:
gs_knn.score(X_train, y_train) # scoring the model

0.6361980808293058

In [23]:
gs_knn.best_score_ # cross-val score

0.6001975991797771

In [24]:
gs_knn.score(X_test, y_test) # test score

0.6005463332632907

## Decision Tree Model

In [25]:
pipe = Pipeline(steps = [                 # setting a Decision Tree model
    ('tree', DecisionTreeClassifier())
])

params = {                                # setting model paramenters
    'tree__max_depth' : [6, 8],
}

gs_tree = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_tree.fit(X_train, y_train)
gs_tree.best_estimator_

Pipeline(memory=None,
         steps=[('tree',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=8,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'))],
         verbose=False)

In [26]:
gs_tree.score(X_train, y_train) # scoring the model

0.668487777544302

In [27]:
gs_tree.best_score_ # cross-val score

0.6284238178633975

In [28]:
gs_tree.score(X_test, y_test) # test score

0.630594662744274

## Bagging Classifier Model

In [29]:
pipe = Pipeline(steps = [                 # setting up a Bagging Classifier model
    ('bag', BaggingClassifier())
])

params = {                                # setting model paramenters
    'bag__n_estimators' : [200],
}

gs_bag = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_bag.fit(X_train, y_train)
gs_bag.best_estimator_

Pipeline(memory=None,
         steps=[('bag',
                 BaggingClassifier(base_estimator=None, bootstrap=True,
                                   bootstrap_features=False, max_features=1.0,
                                   max_samples=1.0, n_estimators=200,
                                   n_jobs=None, oob_score=False,
                                   random_state=None, verbose=0,
                                   warm_start=False))],
         verbose=False)

In [30]:
gs_bag.score(X_train, y_train) # scoring the model

0.9498494081389648

In [31]:
gs_bag.best_score_ # cross-val score

0.6170079029860631

In [32]:
gs_bag.score(X_test, y_test) # test score

0.6142046648455558

## Random Forest Model

In [33]:
pipe = Pipeline(steps = [                 # setting up a Random Forest model
    ('forest', RandomForestClassifier())
])

params = {                                # setting model paramenters
    'forest__n_estimators' : [150],
    'forest__max_depth' : [15]
}

gs_forest = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_forest.fit(X_train, y_train)
gs_forest.best_estimator_

Pipeline(memory=None,
         steps=[('forest',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=15, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=150, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [34]:
gs_forest.score(X_train, y_train) # scoring the model

0.7755130629684107

In [35]:
gs_forest.best_score_ # cross-val score

0.656230457156593

In [36]:
gs_forest.score(X_test, y_test) # test score

0.6492960706030678

## Extra Trees Model

In [37]:
pipe = Pipeline(steps = [                 # setting up and Extra Trees model
    ('extra', ExtraTreesClassifier())
])

params = {                                # setting model paramenters
    'extra__n_estimators' : [600],
    'extra__max_depth' : [None]
}

gs_extra = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_extra.fit(X_train, y_train)
gs_extra.best_estimator_

Pipeline(memory=None,
         steps=[('extra',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=None, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=600, n_jobs=None,
                                      oob_score=False, random_state=None,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [38]:
gs_extra.score(X_train, y_train) # scoring the model

0.9498494081389648

In [39]:
gs_extra.best_score_ # cross-val score

0.6179176784549197

In [40]:
gs_extra.score(X_test, y_test) # test score

0.6125236394200462

## AdaBoost Model

In [41]:
pipe = Pipeline(steps = [                 # setting up an AdaBoost model
    ('ada', AdaBoostClassifier())
])

params = {                                # setting model paramenters
    'ada__n_estimators' : [10],
}

gs_ada = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_ada.fit(X_train, y_train)
gs_ada.best_estimator_

Pipeline(memory=None,
         steps=[('ada',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=10,
                                    random_state=None))],
         verbose=False)

In [42]:
gs_ada.score(X_train, y_train) # scoring the model

0.5640540729845206

In [43]:
gs_ada.best_score_ # cross-val score

0.5524967500134905

In [44]:
gs_ada.score(X_test, y_test) # test score

0.5578903130909855

## Gradient Boosting Model

In [45]:
pipe = Pipeline(steps = [                 # setting up a Gradient Boost model
        ('grad', GradientBoostingClassifier())
])

params = {                                # setting model paramenters
    'grad__n_estimators' : [300],
    'grad__max_depth'    : [3]
    
}

gs_grad = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_grad.fit(X_train, y_train)
gs_grad.best_estimator_

Pipeline(memory=None,
         steps=[('grad',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=300,
                                            n_iter_no_change=None,
                                            presort='deprecated',
                                            random_sta

In [46]:
gs_grad.score(X_train, y_train) # scoring the model

0.708482174126217

In [47]:
gs_grad.best_score_ # cross-val score

0.6593826497323974

In [48]:
gs_grad.score(X_test, y_test) # test score

0.6505568396722

## SVC Model

In [49]:
pipe = Pipeline(steps = [                 # setting up an SVC model
    ('svc', SVC())
])

params = {                                # setting model paramenters
    'svc__C' : [3],
}

gs_svc = GridSearchCV(pipe,
                    param_grid = params,
                    cv = 5,
                    scoring = 'accuracy'
                    )

gs_svc.fit(X_train, y_train)
gs_svc.best_estimator_

Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=3, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [50]:
gs_svc.score(X_train, y_train) # scoring the model

0.5609021503116901

In [51]:
gs_svc.best_score_ # cross-val score

0.565385583305126

In [52]:
gs_svc.score(X_test, y_test) # test score

0.556209287665476

## Voting Classifier Model

In [53]:
knn_pipe = Pipeline([                          # setting up a pipeline for a KNN model
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

vote = VotingClassifier([                      # setting up a Voting Classifer model with multiple model inside it
    ('rand', RandomForestClassifier()),
    ('grad', GradientBoostingClassifier()),
    ('lr', LogisticRegression()),
    ('tree', DecisionTreeClassifier()),
    ('bag', BaggingClassifier()),
    ('ada', AdaBoostClassifier()),
    ('extra', ExtraTreesClassifier()),
    ('knn_pipe', knn_pipe)
],
                        voting='soft')
vote_params = {                                # setting model paramenters for each model in the Voting Classifier
    'rand__n_estimators' : [150],
    'rand__max_depth' : [15],
    'grad__n_estimators' : [300],
    'tree__max_depth' : [8],
    'bag__n_estimators' : [200],
    'ada__n_estimators' : [10],
    'extra__n_estimators' : [600],
    'knn_pipe__knn__n_neighbors': [21],
    'lr__penalty' : ['l1'],
    'lr__C' : [1],
    'lr__solver' : ['liblinear']    
}
gs_vc = GridSearchCV(vote,
                     param_grid=vote_params,
                     cv=5,
                     scoring='accuracy'
                     )
gs_vc.fit(X_train, y_train)
gs_vc.best_estimator_

VotingClassifier(estimators=[('rand',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=15,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

In [54]:
gs_vc.score(X_train, y_train) # scoring the model

0.8654479232331722

In [55]:
gs_vc.best_score_ # cross-val score

0.6565808670228162

In [56]:
gs_vc.score(X_test, y_test) # test score

0.6495061987812566

# Pickling

In [57]:
pickle.dump(gs_grad, open('model.pkl', 'wb'))