# Final Project

## Imports

In [42]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBClassifier
%matplotlib inline

### Import End Game Data

In [57]:
# Read the end-game dataset and discard the "Unnamed: 0" column
# (presumably a saved row index -- confirm against final.csv).
data = pd.read_csv("final.csv")
df = data.drop(labels="Unnamed: 0", axis="columns")

# 'bResult' is the prediction target; every other column is a feature.
X = df.drop(labels='bResult', axis="columns")
y = df['bResult']

# Hold out 33% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=.33
)

### Pre-Game Data

In [44]:
# Keep only the columns whose name does not end in a digit
# (presumably the digit-suffixed columns are in-game snapshots -- TODO confirm).
# The columns are collected first and removed in one non-mutating drop, so the
# frame is never modified while it is being iterated; `key[-1:]` also copes
# with an empty column name.
digit_suffixed = [key for key in X.columns if key[-1:].isdigit()]
X = X.drop(digit_suffixed, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

### Time Data

In [50]:
def select_mins(X, mins, max_min=85, step=5):
    """Return a copy of ``X`` without the ``'_min_<i>'`` columns later than ``mins``.

    A column is removed when its name contains ``'_min_' + str(i)`` for any
    ``i`` in ``range(max_min, mins, -step)``; with the defaults and
    ``mins=15`` that is 85, 80, ..., 20.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column labels are strings. It is not modified.
    mins : int
        Exclusive lower bound of the minute markers that get removed.
    max_min : int, default 85
        Largest minute marker considered.
    step : int, default 5
        Spacing between consecutive minute markers.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns in their original order.

    Notes
    -----
    Matching is by substring, so a marker such as ``'_min_5'`` also matches
    ``'_min_50'``.
    """
    markers = ['_min_' + str(i) for i in range(max_min, mins, -step)]
    # Collect every matching column first and drop them in one call instead
    # of re-copying the frame once per dropped column.
    to_drop = [col for col in X.columns if any(marker in col for marker in markers)]
    return X.drop(to_drop, axis=1)

def confidence_over_time(X, y, model, mins=15):
    """Fit a fresh ``model()`` on the columns kept by ``select_mins`` and score it.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is passed through ``select_mins(X, mins)`` first.
    y : array-like
        Target values aligned with the rows of ``X``.
    model : callable
        Zero-argument factory (e.g. an estimator class) returning an object
        with ``fit(X, y)`` and ``score(X, y)``.
    mins : int, default 15
        Minute cutoff forwarded to ``select_mins``.

    Returns
    -------
    list
        A one-element list containing the value returned by ``score``.
    """
    X_cut = select_mins(X, mins)
    m = model()
    m.fit(X_cut, y)
    # The model is scored on the same rows it was fitted on, so this is a
    # training score rather than a held-out estimate.
    return [m.score(X_cut, y)]

# Remove the '_min_<N>' columns for N = 20, 25, ..., 85 from the feature frame.
X = select_mins(X, mins=15)

# Re-split so the train/test frames reflect the trimmed feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=.33
)

### Importing Data with Matchups

In [25]:
# Load the dataset that also carries the matchup columns.
data2 = pd.read_csv("final_and_matchups.csv")
y = data2['bResult']
X = data2.drop('bResult', axis=1)

# X_pre is the feature set without any column whose name ends in a digit.
# It is built as a separate frame (not an alias of X), so X keeps every column.
digit_suffixed = [key for key in X.columns if key[-1:].isdigit()]
X_pre = X.drop(digit_suffixed, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size=.33, random_state=42)

### Scale Pre-Match Data

In [None]:
# Standardise every X_pre column, learning the statistics and applying them
# in a single step.
# NOTE(review): the cells visible below fit on X_pre, not X_pre_scaled.
scaler = StandardScaler()
X_pre_scaled = scaler.fit_transform(X_pre)

## Grid Searching: Logistic Regression

In [22]:
# 3-fold grid search over the regularisation type and strength.
payload = {
            "penalty" : ['l1', 'l2'],
            "C"       : [.001,.01,.1,1,10,100],
          }

# Pin the solver: the grid contains 'l1', which the default solver of newer
# scikit-learn releases does not support. 'liblinear' handles both penalties.
clf = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), param_grid=payload, cv=3)
clf.fit(X_pre, y)
print(clf.best_estimator_)
print(clf.best_score_)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.751183587585


In [58]:
# L1-penalised logistic regression. The solver is pinned because 'l1' is not
# supported by the default solver of newer scikit-learn releases.
# NOTE(review): the recorded execution counts (In [57] then In [58]) suggest
# X_train held the end-game split when this ran, not the pre-game split --
# confirm before comparing this score with the grid-search score.
lgr = LogisticRegression(penalty='l1', solver='liblinear')
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)

# Only the last expression of a cell is displayed, so this matrix is computed
# but not shown.
confusion_matrix(y_true=y_test, y_pred=y_pred)
lgr.score(X_test, y_test)

0.98087649402390442

## Grid Searching: Random Forest

In [5]:
# 3-fold grid search over forest size and split criterion, fitted on X_pre.
payload = dict(
    n_estimators=[5, 10, 20, 50, 100, 1000],
    criterion=['entropy', 'gini'],
)

clf = GridSearchCV(RandomForestClassifier(), payload, cv=3)
clf.fit(X_pre, y)
print(clf.best_estimator_, clf.best_score_, sep="\n")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.735928458706


In [59]:
# Use the configuration reported by the grid search (criterion='entropy',
# n_estimators=100); this also matches the forest used in the voting ensemble.
rfc = RandomForestClassifier(criterion='entropy', n_estimators=100)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Only the last expression of a cell is displayed, so this matrix is computed
# but not shown.
confusion_matrix(y_true=y_test, y_pred=y_pred)
rfc.score(X_test, y_test)

0.9776892430278884

## Grid Searching: Extremely Randomized Trees

In [6]:
# 3-fold grid search over ensemble size and split criterion, fitted on X_pre.
payload = dict(
    n_estimators=[5, 10, 20, 50, 100, 1000],
    criterion=['entropy', 'gini'],
)

clf = GridSearchCV(ExtraTreesClassifier(), payload, cv=3)
clf.fit(X_pre, y)
print(clf.best_estimator_, clf.best_score_, sep="\n")

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
0.736717517096


In [60]:
# Extra-trees model with the settings reported by the grid search.
etc = ExtraTreesClassifier(n_estimators=1000, criterion='gini')
etc.fit(X_train, y_train)

y_pred = etc.predict(X_test)

# Computed but not displayed: only the cell's last expression is shown.
confusion_matrix(y_test, y_pred)
etc.score(X_test, y_test)

0.96892430278884467

## Grid Searching: XGBClassifier

In [7]:
# 3-fold grid search over the number of boosting rounds and the tree depth.
# The booster type ('gbtree', 'gblinear', 'dart') is not part of the search.
payload = dict(
    n_estimators=[5, 10, 20, 50, 100, 500, 1000],
    max_depth=[2, 3, 5, 10],
)

clf = GridSearchCV(XGBClassifier(), payload, cv=3)
clf.fit(X_pre, y)
print(clf.best_estimator_, clf.best_score_, sep="\n")

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
0.733298264072


In [61]:
# Gradient-boosted trees with the settings reported by the grid search.
xgb = XGBClassifier(n_estimators=100, max_depth=3)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

# Computed but not displayed: only the cell's last expression is shown.
confusion_matrix(y_test, y_pred)
xgb.score(X_test, y_test)

0.97290836653386459

## Voting Ensemble

In [62]:
# Base learners, configured with the hyperparameters reported by the grid
# searches above. Only the tuned (or version-sensitive) settings are spelled
# out: the pasted estimator reprs carried arguments that newer scikit-learn /
# xgboost releases have removed or renamed (min_impurity_split,
# max_features='auto', silent, nthread, seed).
lr = LogisticRegression(C=1, penalty='l1', solver='liblinear')
rf = RandomForestClassifier(criterion='entropy', n_estimators=100)
et = ExtraTreesClassifier(criterion='gini', n_estimators=1000)
xg = XGBClassifier(base_score=0.5, learning_rate=0.1, max_depth=3,
                   n_estimators=100, objective='binary:logistic')

estimators = [('LogisticRegression', lr), ('RandomForest', rf), ('ExtraTrees', et), ('XGBC', xg)]

# Hard voting: the ensemble predicts the majority class label of its members.
metaEstimator = VotingClassifier(estimators=estimators, voting='hard')

# Re-create the split from X_pre so this cell does not depend on whichever
# split an earlier cell last left in X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size=.33, random_state=42)

metaEstimator.fit(X_train, y_train)
y_pred = metaEstimator.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
metaEstimator.score(X_test, y_test)

[[268 321]
 [189 477]]


0.59362549800796816