# Machine Learning Madness 2021
**Stage 1**: Use historical data to build and train models  
**Stage 2**: Predict 2021 NCAA championship  
**Evaluated by**: Log loss  
**Predict**: Each matchup 

In [2]:
# NumPy for numerical computing
import numpy as np

import re

# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline 

# Seaborn for easier visualization
import seaborn as sns
# Scikit-Learn for Modeling
import sklearn
# Pickle for saving model files
import pickle

# Import Logistic Regression
from sklearn.linear_model import LogisticRegression
# Import RandomForestClassifier and GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV
# Classification metrics (added later)
from sklearn.metrics import roc_curve, auc

Let's load in the dataset from the processed data folder.

In [3]:
# Load the processed modelling dataset (path is relative to the working directory)
df = pd.read_csv('../../data/processed/model_dataset.csv')

In [4]:
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0.1,Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,LowAst,LowTO,LowStl,LowBlk,LowPF,LowRank,LowSeed,HighScore,HighFGP,HighFGP3,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,24.8,14.2,15.233333,6.433333,2.233333,18.3,239.28125,16.0,71.206897,0.432768,0.363494,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,27.642857,17.642857,14.785714,8.464286,4.214286,17.75,2.676471,1.0,67.793103,0.446372,0.35792,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,23.310345,15.551724,14.0,5.206897,4.241379,19.413793,36.0,10.0,74.517241,0.439457,0.343441,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,23.275862,15.62069,18.241379,7.103448,4.0,20.965517,45.6875,11.0,79.242424,0.498309,0.384914,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,24.37931,16.0,14.172414,6.551724,2.793103,17.103448,36.40625,8.0,72.4,0.458507,0.350564,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


In [5]:
# List the column names to see which columns were read from the CSV
df.columns

Index(['Unnamed: 0', 'Season', 'LowID', 'HighID', 'Win', 'LowScore', 'LowFGP',
       'LowFGP3', 'LowFTP', 'LowOR', 'LowDR', 'LowAst', 'LowTO', 'LowStl',
       'LowBlk', 'LowPF', 'LowRank', 'LowSeed', 'HighScore', 'HighFGP',
       'HighFGP3', 'HighFTP', 'HighOR', 'HighDR', 'HighAst', 'HighTO',
       'HighStl', 'HighBlk', 'HighPF', 'HighRank', 'HighSeed'],
      dtype='object')

In [6]:
# Drop the 'Unnamed: 0' column read from the CSV (it appears to be a saved row
# index). errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,LowAst,LowTO,LowStl,LowBlk,LowPF,LowRank,LowSeed,HighScore,HighFGP,HighFGP3,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,24.8,14.2,15.233333,6.433333,2.233333,18.3,239.28125,16.0,71.206897,0.432768,0.363494,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,27.642857,17.642857,14.785714,8.464286,4.214286,17.75,2.676471,1.0,67.793103,0.446372,0.35792,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,23.310345,15.551724,14.0,5.206897,4.241379,19.413793,36.0,10.0,74.517241,0.439457,0.343441,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,23.275862,15.62069,18.241379,7.103448,4.0,20.965517,45.6875,11.0,79.242424,0.498309,0.384914,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,24.37931,16.0,14.172414,6.551724,2.793103,17.103448,36.40625,8.0,72.4,0.458507,0.350564,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


We need to split the data into the target variable and the input features.

In [7]:
# Create separate object for target variable
# Target: the Win column
y = df['Win']

# Input features: every column except the target
X = df.drop(columns=['Win'])

Split into training and test data with 20% of the observations going to the test set. We also give it a random state so we can reproduce the results.

In [8]:
# Split X and y into train and test sets
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# Report the number of observations in each split
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

892 223 892 223


Ok time for pipelines. I think random forest is going to be the way to go but let's also do gradient boosting.

In [9]:
# Pipeline dictionary
# One pipeline per model family, keyed by a short name.
# Each pipeline standardizes the features and then fits the classifier.
pipelines = dict(
    rf=make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    gb=make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123)),
)

And declare some hyperparameters.

In [10]:
# Random Forest hyperparameters
# Random Forest hyperparameters.
# 'auto' is intentionally not in max_features: for RandomForestClassifier it
# was an alias of 'sqrt' (so the grid fitted the same model twice), it was
# deprecated in scikit-learn 1.1 and is rejected by scikit-learn 1.3+.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['sqrt', 0.33]
}

In [11]:
# Boosted Tree hyperparameters
# Gradient boosting search grid: number of trees, shrinkage, and tree depth
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5],
}

In [12]:
# Create hyperparameters dictionary
# Map each pipeline name to the grid it should be tuned over
hyperparameters = dict(rf=rf_hyperparameters, gb=gb_hyperparameters)

Let's fit and tune the models with 10-fold cross-validation. 

In [13]:
# Create empty dictionary called fitted_models
# Tuned GridSearchCV objects, keyed by the same names as `pipelines`
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # 10-fold cross-validated grid search scored by negative log loss,
    # using all available CPU cores (n_jobs=-1)
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1, scoring='neg_log_loss')

    # Fit on the training split only
    model.fit(X_train, y_train)

    # Keep the fitted search object for later comparison and prediction
    fitted_models[name] = model

    # Report progress (message typo 'beed' corrected to 'been')
    print(name, 'has been fitted')

rf has beed fitted
gb has beed fitted


How did we do?

In [14]:
# Display best_score_ for each fitted model
# Display best_score_ for each fitted model.
# The searches were scored with 'neg_log_loss', so these values are negated
# log loss: closer to 0 is better.
for name, model in fitted_models.items():
    print(name, model.best_score_)

rf -0.57062028302303
gb -0.5560755459022557


In [15]:
# Best hyperparameters found for gradient boosting. Look the search up by key
# instead of relying on whichever object the last loop left bound to `model`.
fitted_models['gb'].best_params_

{'gradientboostingclassifier__learning_rate': 0.05,
 'gradientboostingclassifier__max_depth': 1,
 'gradientboostingclassifier__n_estimators': 200}

We'll need to load in the submission dataset to run the model on.

In [16]:
# Load the dataset of matchups the tuned model will score, then preview it
data = pd.read_csv('../../data/processed/model_dataset2.csv')
data.head()

Unnamed: 0,Season,LowID,HighID,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,LowAst,LowTO,LowStl,LowBlk,LowPF,LowRank,LowSeed,HighScore,HighFGP,HighFGP3,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2015,1107,1112,65.5,0.44403,0.358773,0.728865,10.0,23.625,10.4375,11.6875,5.65625,1.625,16.5,118.87931,14.0,76.441176,0.486694,0.358874,0.699585,10.823529,26.411765,14.205882,11.205882,7.176471,3.588235,17.911765,4.209677,2.0
1,2015,1107,1116,65.5,0.44403,0.358773,0.728865,10.0,23.625,10.4375,11.6875,5.65625,1.625,16.5,118.87931,14.0,78.029412,0.448082,0.347178,0.717672,13.0,22.764706,16.147059,11.735294,7.764706,4.764706,18.970588,22.306452,5.0
2,2015,1107,1124,65.5,0.44403,0.358773,0.728865,10.0,23.625,10.4375,11.6875,5.65625,1.625,16.5,118.87931,14.0,69.125,0.435875,0.379881,0.668958,14.5,24.03125,14.40625,12.375,8.03125,3.84375,16.71875,14.177419,3.0
3,2015,1107,1125,65.5,0.44403,0.358773,0.728865,10.0,23.625,10.4375,11.6875,5.65625,1.625,16.5,118.87931,14.0,73.774194,0.47448,0.378229,0.695699,9.322581,23.967742,15.193548,13.741935,6.354839,1.967742,16.451613,129.724138,15.0
4,2015,1107,1129,65.5,0.44403,0.358773,0.728865,10.0,23.625,10.4375,11.6875,5.65625,1.625,16.5,118.87931,14.0,69.870968,0.455117,0.394584,0.729914,8.516129,24.032258,11.709677,10.225806,5.967742,2.322581,17.225806,46.7,11.0


Let's take a look at the predictions and probabilities for gradient boosting.

In [17]:
# Predicted class for each matchup from the tuned gradient boosting search.
# The model is selected explicitly by key rather than through the leftover
# loop variable `model`, so this cell does not depend on loop/dict ordering.
predictions = fitted_models['gb'].predict(data)
predictions

array([0, 0, 0, ..., 1, 1, 1])

In [18]:
# Class probabilities for each matchup from the tuned gradient boosting search.
# The model is selected explicitly by key rather than through the leftover
# loop variable `model`, so this cell does not depend on loop/dict ordering.
probs = fitted_models['gb'].predict_proba(data)
probs

array([[0.9274778 , 0.0725222 ],
       [0.79980325, 0.20019675],
       [0.89279465, 0.10720535],
       ...,
       [0.39712889, 0.60287111],
       [0.23109482, 0.76890518],
       [0.28569995, 0.71430005]])

In [19]:
# Keep only the second column of the predict_proba output. scikit-learn orders
# the columns by classes_, so with 0/1 labels this should be the probability
# of Win == 1 — TODO confirm against the fitted model's classes_.
prob_of_1 = probs[:,1]
prob_of_1

array([0.0725222 , 0.20019675, 0.10720535, ..., 0.60287111, 0.76890518,
       0.71430005])

In [20]:
def make_id(row):
    """Build a matchup ID of the form ``'<Season>_<LowID>_<HighID>'``.

    Parameters
    ----------
    row : pandas.Series or sequence
        One matchup. When ``row`` is a Series carrying the labels ``'Season'``,
        ``'LowID'`` and ``'HighID'``, the values are read by label, so the
        result does not depend on column order. Otherwise the first three
        positions (``row[0]``, ``row[1]``, ``row[2]``) are used, as before.

    Returns
    -------
    str
        The three values cast to ``int`` and joined with underscores.
    """
    id_columns = ('Season', 'LowID', 'HighID')

    # Label-based lookup avoids integer-position indexing on a label-indexed
    # Series, which pandas 2.x deprecates, and avoids silently picking the
    # wrong columns if the frame's column order changes.
    if isinstance(row, pd.Series) and all(col in row.index for col in id_columns):
        season, low_id, high_id = (row[col] for col in id_columns)
    else:
        season, low_id, high_id = row[0], row[1], row[2]

    # Row-wise apply upcasts mixed int/float rows to float, hence the int() casts.
    return '_'.join(str(int(value)) for value in (season, low_id, high_id))

In [23]:
# Build one submission ID per matchup row
ID = data.apply(make_id, axis='columns')

In [24]:
# Inspect the generated IDs
ID

0        2015_1107_1112
1        2015_1107_1116
2        2015_1107_1124
3        2015_1107_1125
4        2015_1107_1129
              ...      
11385    2019_1449_1459
11386    2019_1449_1463
11387    2019_1458_1459
11388    2019_1458_1463
11389    2019_1459_1463
Length: 11390, dtype: object

In [25]:
# Start from an empty frame; the ID and Pred columns are filled in by the following cell
df_submission = pd.DataFrame()

In [26]:
# Attach the matchup IDs and the class-1 probabilities, then preview the result
df_submission = df_submission.assign(ID=ID, Pred=prob_of_1)
df_submission.head()

Unnamed: 0,ID,Pred
0,2015_1107_1112,0.072522
1,2015_1107_1116,0.200197
2,2015_1107_1124,0.107205
3,2015_1107_1125,0.362181
4,2015_1107_1129,0.211375


In [27]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV
df_submission.to_csv('../../data/predictions/phase1_submissions.csv',index=False)