In [1]:
import numpy as np
import pandas as pd
import json
from typing import Dict
from itertools import groupby, chain
from functools import reduce
from collections import Counter, defaultdict
import re
from sklearn.preprocessing import OneHotEncoder 
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import pickle

In [2]:
combined_df = pd.read_pickle("final_data_frame_one_hot")

In [3]:
combined_df.head()

Unnamed: 0,stage,gold_spent,level,rank,item_index1,item_index2,item_index3,item_index4,item_index5,item_index6,...,Starship,Vanguard,stage_1,stage_2,stage_3,stage_4,stage_5,stage_6,stage_7,stage_8
0,1,4.0,2.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0,0,0,0,0,0,0
1,1,4.0,2.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0,0,0,0,0,0,0
2,2,20.0,4.166667,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0,1,0,0,0,0,0,0
3,2,20.0,4.166667,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0,1,0,0,0,0,0,0
4,2,20.0,4.166667,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0,1,0,0,0,0,0,0


In [4]:
combined_df = combined_df.drop(columns = ["stage_1", "stage_2", "stage_3", "stage_4", "stage_5", "stage_6", "stage_7", "stage_8"])

### Create training and test sets

In [5]:
X = combined_df.drop(columns = ['rank'])
Y = combined_df['rank']

In [6]:
X.head()

Unnamed: 0,stage,gold_spent,level,item_index1,item_index2,item_index3,item_index4,item_index5,item_index6,item_index7,...,Mystic,Paragon,Protector,Rebel,Sniper,Sorcerer,SpacePirate,StarGuardian,Starship,Vanguard
0,1,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,20.0,4.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,20.0,4.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,20.0,4.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
Y.head()

0    4
1    4
2    4
3    4
4    4
Name: rank, dtype: object

In [8]:
train_x, test_x, train_y, test_y = train_test_split(X,Y,test_size=0.2)

## Create and run Random Forest Regressor to predict rank

In [9]:
reg = RandomForestRegressor(n_jobs=-1)

In [10]:
reg.fit(train_x,train_y)

RandomForestRegressor(n_jobs=-1)

In [11]:
pred_y = reg.predict(test_x)

In [12]:
mean_squared_error(test_y,pred_y)

1.136398770539936

In [13]:
mean_absolute_error(test_y,np.round(pred_y))

0.4872033666976614

In [14]:
def rank_accuracy(true,pred):
    return np.sum([x == y for x,y in zip(true,pred)])/true.shape[0]

### Accuracy of Random Forest regressor

In [15]:
rank_accuracy(test_y.astype(int), np.round(pred_y).astype(int))

0.7183777258441925

In [16]:
reg.feature_importances_

array([1.04101706e-01, 2.27460754e-01, 1.03289175e-01, 7.76457074e-04,
       1.02490493e-04, 5.48900630e-04, 1.05823942e-04, 3.74600507e-04,
       1.09867892e-04, 3.72042548e-04, 7.41849597e-06, 6.03460827e-04,
       7.21733777e-05, 2.29485385e-04, 9.38264667e-06, 1.83924067e-04,
       1.46966020e-07, 1.87630197e-05, 2.02915226e-07, 2.59685133e-04,
       0.00000000e+00, 1.43943593e-03, 8.88140646e-05, 1.18118920e-03,
       2.98724754e-04, 1.24143613e-03, 4.82689698e-04, 1.03743085e-03,
       1.14587651e-04, 1.17950308e-03, 2.84134154e-04, 8.69186778e-04,
       1.73756017e-04, 7.36577430e-04, 2.42675684e-05, 8.45101230e-05,
       8.89070701e-06, 1.24329688e-03, 0.00000000e+00, 2.80735619e-03,
       7.75441607e-05, 2.60950506e-03, 5.51737165e-04, 3.08550633e-03,
       5.76462542e-04, 3.16198282e-03, 1.01318201e-03, 2.93154199e-03,
       6.45582924e-04, 2.98716588e-03, 6.94366951e-04, 3.39209546e-03,
       8.46595811e-04, 9.47036429e-04, 3.29819592e-04, 3.10063316e-03,
      

## Create and run Random Forest Classifier to predict top 4

In [17]:
test_y_top_4 = np.where(test_y.astype('int') <= 4, 1, 0)

In [18]:
train_y_top_4 = np.where(train_y.astype('int') <= 4, 1, 0)

In [19]:
clf_top_4 = RandomForestClassifier(n_jobs=-1)

In [20]:
clf_top_4.fit(train_x,train_y_top_4)

RandomForestClassifier(n_jobs=-1)

In [21]:
pred_y_top_4 = clf_top_4.predict(test_x)

In [22]:
accuracy_score(test_y_top_4, pred_y_top_4)

0.8302141554864297

In [23]:
clf_top_4.feature_importances_

array([6.31340312e-02, 2.42355875e-01, 1.28086619e-01, 2.35689239e-03,
       2.74279959e-04, 1.61030643e-03, 2.74844478e-04, 1.14999473e-03,
       2.96720452e-04, 1.07968418e-03, 2.79927849e-05, 1.66272270e-03,
       1.83409256e-04, 7.08763815e-04, 3.26319009e-05, 5.55711494e-04,
       1.98005765e-06, 7.64448946e-05, 4.97860361e-07, 7.48043584e-04,
       0.00000000e+00, 3.98948495e-03, 2.63422474e-04, 3.20092855e-03,
       6.94427395e-04, 3.26847580e-03, 1.02412458e-03, 2.73061174e-03,
       3.44836840e-04, 3.24985461e-03, 7.49324669e-04, 2.20854121e-03,
       4.43243718e-04, 1.95401269e-03, 8.59735640e-05, 2.85108464e-04,
       4.32856385e-05, 2.89264053e-03, 0.00000000e+00, 5.89980720e-03,
       2.18607138e-04, 5.81749629e-03, 1.27328979e-03, 6.79968802e-03,
       1.23500921e-03, 6.52032787e-03, 2.25753016e-03, 6.18333192e-03,
       1.42875569e-03, 6.63858211e-03, 1.77345902e-03, 5.93903585e-03,
       1.68812769e-03, 2.31122381e-03, 1.37024809e-03, 6.68687709e-03,
      

In [24]:
filename = 'final_reg_model_no_stage_oh.sav'
pickle.dump(reg, open(filename, 'wb'))
filename2 = 'final_clf_model_no_stage_oh.sav'
pickle.dump(clf_top_4, open(filename2, 'wb'))