# Importing the necessary libraries


In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
import logging
from sklearn.model_selection import GridSearchCV
import numpy as np
%matplotlib inline  

# Initializing the logging format

In [2]:
LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(message)s"
logging.basicConfig(filename='dota_training.log', level=logging.DEBUG, format=LOG_FORMAT)

# Web scrapping in order to get the names of heroes(since the dataset doesn't have any column names)

In [3]:
url = 'https://github.com/kronusme/dota2-api/blob/master/data/heroes.json'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
hero_desc = html_soup.find_all('span', class_ = 'pl-s')
i = 0
heroes = []
try:
    for a in hero_desc:
        name = hero_desc[i+5].get_text()
        name = name.replace('"', "")
        heroes.append(name)
        i = i+5
except:
    pass

# Viewing the heroes

In [5]:
heroes

['Anti-Mage',
 'Axe',
 'Bane',
 'Bloodseeker',
 'Crystal Maiden',
 'Drow Ranger',
 'Earthshaker',
 'Juggernaut',
 'Mirana',
 'Shadow Fiend',
 'Morphling',
 'Phantom Lancer',
 'Puck',
 'Pudge',
 'Razor',
 'Sand King',
 'Storm Spirit',
 'Sven',
 'Tiny',
 'Vengeful Spirit',
 'Windranger',
 'Zeus',
 'Kunkka',
 'Lina',
 'Lich',
 'Lion',
 'Shadow Shaman',
 'Slardar',
 'Tidehunter',
 'Witch Doctor',
 'Riki',
 'Enigma',
 'Tinker',
 'Sniper',
 'Necrophos',
 'Warlock',
 'Beastmaster',
 'Queen of Pain',
 'Venomancer',
 'Faceless Void',
 'Skeleton King',
 'Death Prophet',
 'Phantom Assassin',
 'Pugna',
 'Templar Assassin',
 'Viper',
 'Luna',
 'Dragon Knight',
 'Dazzle',
 'Clockwerk',
 'Leshrac',
 "Nature's Prophet",
 'Lifestealer',
 'Dark Seer',
 'Clinkz',
 'Omniknight',
 'Enchantress',
 'Huskar',
 'Night Stalker',
 'Broodmother',
 'Bounty Hunter',
 'Weaver',
 'Jakiro',
 'Batrider',
 'Chen',
 'Spectre',
 'Doom',
 'Ancient Apparition',
 'Ursa',
 'Spirit Breaker',
 'Gyrocopter',
 'Alchemist',
 'Invo

In [6]:
lst = ["Team Won","Cluster ID","Game Mode","Game Type"]
columns = lst+heroes

In [7]:
df = pd.read_csv("dota2Train.csv",engine="python",names=columns,index_col=False)

# Dropping lina and phoenix because in all of these games no one has chosen them even once as heroes
# Dropping 'game mode', 'game type', 'cluster id' for reasons as discussed in the eda document

In [8]:
df = df.astype('category')
df = df.drop(['Game Mode', 'Game Type','Cluster ID','Lina','Phoenix'], axis=1)
df = df.replace({1: "Team_1", -1: "Team_2"})

In [9]:
col = df.columns

# Changing the data type of all the columns to category 


In [10]:
df[col] = df[col].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92650 entries, 0 to 92649
Columns: 111 entries, Team Won to Arc Warden
dtypes: category(111)
memory usage: 9.8 MB


# Performing get_dummies operation over the dataset and dropping the columns which may create multi-collinearity

In [11]:
dfDummies = pd.get_dummies(df)
dfDummies = dfDummies.drop("Team Won_Team_2", axis=1)
col = dfDummies.columns
dfDummies[col] = dfDummies[col].astype('category')

# Seperating the independent and dependent variables 


In [15]:
y = dfDummies['Team Won_Team_1']
X = dfDummies.iloc[:,1:]

In [18]:
# Multi-collinearity handled

for i in X:
    if "_0" in i:
        X.drop([i],axis=1,inplace=True)

# Splitting the dataset for training and testing

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# LOGISTIC REGRESSION

In [29]:
# Initializing the model
logreg = LogisticRegression()

In [30]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = range(1,4)

solver = ['newton-cg', 'lbfgs']
#'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty, solver = solver)

# Create grid search using 5-fold cross validation
clf = GridSearchCV(logreg, hyperparameters, cv=5, verbose=1, n_jobs=-1)

In [31]:
# Fit grid search
logging.info(clf.fit(X_train, y_train))
best_model = clf.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.2min finished


# Logging the best hyperparameters


In [32]:
logging.debug("Best model:-{}".format(best_model))
logging.debug("Best penalty:-{}".format(best_model.get_params()['penalty']))
logging.debug("Best C:-{}".format(best_model.get_params()['C']))
logging.debug("Best Solver:-{}".format(best_model.get_params()['solver']))

# Viewing the best parameters


In [33]:
print('Best Penalty:', best_model.get_params()['penalty'])
print('Best C:', best_model.get_params()['C'])
print('Best Solver:', best_model.get_params()['solver'])

Best Penalty: l2
Best C: 3
Best Solver: newton-cg


# Predicting the test-set and viewing the accuracy of the model over the test set

In [34]:
# Predict target vector
y_pred = best_model.predict(X_test)
logging.info(best_model.predict(X_test))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(best_model.score(X_test, y_test)))
logging.info('Accuracy of logistic regression classifier on test set: {:.2f}'.format(best_model.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.61


# Accuracy

In [35]:
scores_classification = accuracy_score(y_test, y_pred)
scores_classification

0.6053966540744739

# Printing the classification report


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.53      0.55      4295
           1       0.62      0.67      0.65      4970

    accuracy                           0.61      9265
   macro avg       0.60      0.60      0.60      9265
weighted avg       0.60      0.61      0.60      9265



# Printing the confusion matrix


In [38]:
results = confusion_matrix(y_test, y_pred)
print(results)

[[2258 2037]
 [1619 3351]]


# Saving the model using pickle

In [135]:
pickle.dump(best_model,open('dota_model.pkl','wb'))

# Following are the other models i tried using, I am not going into the details with them because logistic regression was finalized.

# NAIVE BAYES

In [63]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [64]:
y_pred = gnb.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(gnb.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.59


In [65]:
from sklearn.metrics import accuracy_score  

scores_classification = accuracy_score(y_test, y_pred)
scores_classification

0.5859686994063681

# RANDOM FOREST

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

regressor = RandomForestClassifier()
param_grid = { 
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [68]:
regressor = GridSearchCV(estimator=regressor, param_grid=param_grid, cv= 10)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

'regressor = GridSearchCV(estimator=regressor, param_grid=param_grid, cv= 10)\nregressor.fit(X_train, y_train)\n\ny_pred = regressor.predict(X_test)'

In [52]:
predictions = [1 if i >=0.60 else 0 for i in y_pred]

In [53]:
from sklearn.metrics import accuracy_score  
scores_classification = accuracy_score(y_test, predictions)
scores_classification

0.5394495412844037

# RIDGE REGRESSOR

In [67]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
ridge=Ridge()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)
ridge_regressor.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [68]:
y_pred = ridge_regressor.predict(X_test)
y_pred = [1 if i >=0.50 else 0 for i in y_pred]

In [69]:
from sklearn.metrics import accuracy_score  
scores_classification = accuracy_score(y_test, y_pred)
scores_classification

0.6056125202374528

# DECISION TREE REGRESSOR

In [69]:
dtree=DecisionTreeRegressor(criterion="mse")

In [73]:
params={
 "splitter"    : ["best","random"] ,
 "max_depth"        : [ 3, 4, 5, 6],
 "min_samples_leaf" : [ 1,2,3,4],
"min_weight_fraction_leaf":[0.1,0.2,0.3],
 "max_features" : ["auto","log2","sqrt",None ],
    "max_leaf_nodes":[None,10,20,30]
    
}

In [74]:
from sklearn.model_selection import GridSearchCV

random_search=GridSearchCV(dtree,param_grid=params,scoring='neg_mean_squared_error',n_jobs=-1,cv=10,verbose=3)

In [75]:
random_search.fit(X,y)

Fitting 10 folds for each of 1536 candidates, totalling 15360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 6264 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6],
                         'max_features': ['auto', 'log2', 'sqrt', None],
                         'max_leaf_nodes': [None, 10, 20, 30],
                         

In [76]:
random_search.best_params_,random_search.best_score_

({'max_depth': 3,
  'max_features': 'auto',
  'max_leaf_nodes': None,
  'min_samples_leaf': 1,
  'min_weight_fraction_leaf': 0.1,
  'splitter': 'best'},
 -0.2482957635097084)

In [77]:
predictions=random_search.predict(X_test)
predictions = [1 if i >=0.5 else 0 for i in predictions]

In [78]:
scores_classification = accuracy_score(y_test, predictions)
scores_classification

0.5369670804101457

#  LDA

In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_validate

LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()

NameError: name 'X_train' is not defined

In [None]:
LDA_precision

In [63]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()

In [64]:
QDA_accuracy,QDA_precision,QDA_recall,QDA_f1,QDA_roc

(0.5324219313667039,
 0.5267209543278832,
 0.5174397743505532,
 0.48717715264339845,
 0.5333671483665519)

# XGBOOST

In [132]:
from xgboost import XGBClassifier
model = XGBClassifier(max_depth=10, learning_rate=0.0001, n_estimators=100,
                       objective='binary:logistic')


In [133]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.0001,
              max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
              subsample=1)

In [134]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 56.00%


# ENSEMBLE METHODS

In [73]:
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

seed = 7
num_trees = 30
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.559363194819212


# FEATURE SELECTION

In [87]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, 100)
fit = rfe.fit(X, y)
print("Feature Ranking: %s" % fit.ranking_)








Feature Ranking: [139   1 112 162  25   7 126   1   1 156  56  46 176   1   1 191   1  12
 169  48  77 193  57  55 150   1   5 168 102  69 175   1   1 178   1   1
 122   1   1 111  19  72 186   1   1 192  59  81 151   1 113 231   1   1
 203   1   1 182  39  20 213   1   1 180   1   4 173  96 172 149   1   1
 217 202 211 215  14   9 152  51  31 161  15  37 124  45  83 228  13   6
 226   1   1 199  35  22 224   1   1 148  52  85  99   1   1 128   1  11
 219 207 181 198   1   1 230  33  29 119  98  44  80   1   1 222   1   1
 117  58  28 218  10 106 160   1   1 130 210 229 223   1   1 167 114  78
 138  38  67 188 100  74  97   1   1 141   1   1 158 104  66 137  50 142
 129   1   1 132   1   1 145   1   1 127   1 105  93  92  40 164   1   1
 153 208  88 120 121  71 143  47  23  73   1   1 133   1   1  82   1   1
 118  36  75 185   1   1 125  16   1 227  34  30  61   3   1 200 225  84
 179   1   1 197   1   1 196 201 115  65   1  32 189 131 209 155 154   1
 147   1   1 183  76  90  62   1  

In [88]:
best = fit.ranking_
best_index = [i  for (i, j) in enumerate(best) if j==1]

In [89]:
X = X.iloc[:, best_index]

i for i in range(50,100) if i

In [14]:
for num in range(60, 100):
    for i in range(2, num):
        if (num % i) == 0:
            break
    else:
        print(num)

61
67
71
73
79
83
89
97


In [14]:
len(X.columns)

220

In [13]:
X

Unnamed: 0,Anti-Mage_Team_1,Anti-Mage_Team_2,Axe_Team_1,Axe_Team_2,Bane_Team_1,Bane_Team_2,Bloodseeker_Team_1,Bloodseeker_Team_2,Crystal Maiden_Team_1,Crystal Maiden_Team_2,...,Terrorblade_Team_1,Terrorblade_Team_2,Techies_Team_1,Techies_Team_2,Oracle_Team_1,Oracle_Team_2,Winter Wyvern_Team_1,Winter Wyvern_Team_2,Arc Warden_Team_1,Arc Warden_Team_2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
