## Overview ##

A neural network built with TensorFlow and trained to predict the outcomes of the NCAA Women's Basketball Tournament.

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import os
import re
import urllib
import tensorflow as tf

from subprocess import check_output
# Show which datasets are mounted under ../input.
inputListing = check_output(["ls", "../input"])
print(inputListing.decode("utf8"))


## Load the training data ##


In [None]:
# Read the tournament compact results into `df`.
data_dir = '../input/wncaatourneycompactresults/'
resultsCsvPath = data_dir + 'WNCAATourneyCompactResults.csv'
df = pd.read_csv(resultsCsvPath)
df.head()

In [None]:
# Re-orient every game so that "team" is the lower TeamID and "opponent" the
# higher one. Result is 1 when the lower-ID team is the winner, 0 otherwise.
teamIdList = []
oppTeamIdList = []
resultList = []
for winnerId, loserId in zip(df['WTeamID'].values, df['LTeamID'].values):
    lowerIdWon = winnerId < loserId
    teamIdList.append(winnerId if lowerIdWon else loserId)
    oppTeamIdList.append(loserId if lowerIdWon else winnerId)
    resultList.append(1 if lowerIdWon else 0)

In [None]:
# Read the tournament seeds into `dfSeeds`.
data_dir_seeds = '../input/wncaatourneyseeds/'
seedsCsvPath = data_dir_seeds + 'WNCAATourneySeeds.csv'
dfSeeds = pd.read_csv(seedsCsvPath)
dfSeeds.head()

In [None]:
# Build a (season, team id) -> numeric seed lookup once, then resolve both
# sides of every game with dictionary lookups instead of rescanning dfSeeds
# for each game (per-game progress printing is no longer needed).
#
# The numeric seed is whatever digits the Seed string contains. All digits,
# including 0, are kept so that a seed of 10 is not collapsed to 1.
# NOTE(review): any other place that parses Seed strings for this model must
# use the same digits-only rule so all seed features share one scale.
seedBySeasonTeam = {}
for season, teamId, seed in zip(dfSeeds['Season'].values,
                                dfSeeds['TeamID'].values,
                                dfSeeds['Seed'].values):
    # setdefault keeps the first row seen for a (season, team) pair.
    seedBySeasonTeam.setdefault((season, teamId),
                                float(re.sub(r"[^0-9]", "", seed)))

teamIdSeedList = []
oppTeamIdSeedList = []
for indexOut, season in enumerate(df['Season'].values):
    # A (season, team) pair without a seed raises KeyError here rather than
    # silently producing seed lists that are shorter than the game list.
    teamIdSeedList.append(seedBySeasonTeam[(season, teamIdList[indexOut])])
    oppTeamIdSeedList.append(seedBySeasonTeam[(season, oppTeamIdList[indexOut])])

In [None]:
# Attach the derived per-game columns to the results frame, in this order.
derivedColumns = {
    'TeamID': teamIdList,
    'OppTeamID': oppTeamIdList,
    'TeamSeed': teamIdSeedList,
    'OppTeamSeed': oppTeamIdSeedList,
    'Result': resultList,
}
for columnName, columnValues in derivedColumns.items():
    df[columnName] = columnValues
df.head()

In [None]:
# Model inputs (X) and the 0/1 target (y).
featureNames = ['TeamID', 'OppTeamID', 'TeamSeed', 'OppTeamSeed']
X = df[featureNames]
y = df['Result']

In [None]:
# One numeric feature column per column of X.
my_feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in X.keys()
]

In [None]:
# Two hidden layers of 10 nodes each.
hiddenLayerSizes = [10, 10]
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=hiddenLayerSizes,
    # The model chooses between 2 classes.
    n_classes=2)

## Train the model ##

In [None]:
def train_input_fn(features, labels, batch_size):
    """Build the training dataset.

    Slices (dict(features), labels) into examples, shuffles them with a
    buffer of 1000, repeats indefinitely and groups them into batches of
    `batch_size`. Returns the resulting tf.data dataset.
    """
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    shuffled = examples.shuffle(1000)
    # repeat() has no count, so the number of training steps is decided by
    # the caller rather than by the size of the data.
    endless = shuffled.repeat()
    return endless.batch(batch_size)

In [None]:
def _trainingInput():
    """Zero-argument input function: batches of 20 drawn from X and y."""
    return train_input_fn(X, y, 20)


classifier.train(input_fn=_trainingInput, steps=15000)

## Evaluate the model ##

In [None]:
def eval_input_fn(features, labels, batch_size):
    """Build a batched dataset for evaluation or prediction.

    `features` is converted to a dict. When `labels` is None the dataset
    yields features only; otherwise it yields (features, labels) pairs.
    The examples are batched into groups of `batch_size`, which must not be
    None. Returns the resulting tf.data dataset.
    """
    featureMap = dict(features)
    tensors = featureMap if labels is None else (featureMap, labels)

    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)

In [None]:
# Sample submission: its ID column lists the matchups to predict.
data_dir3 = '../input/womens-machine-learning-competition-2018/'
sampleSubmissionPath = data_dir3 + 'WSampleSubmissionStage2.csv'
df_sample_sub = pd.read_csv(sampleSubmissionPath)
ids = df_sample_sub['ID']

# Seeds file used for the prediction features.
data_dir4 = '../input/wncaatourneyseeds2018/'
seeds2018Path = data_dir4 + 'WNCAATourneySeeds.csv'
dfSeeds2018 = pd.read_csv(seeds2018Path)

In [None]:
# Team id -> numeric seed for the prediction data.
# The numeric seed is whatever digits the Seed string contains. All digits,
# including 0, are kept so that a seed of 10 is not collapsed to 1.
# NOTE(review): any other place that parses Seed strings for this model must
# use the same digits-only rule so all seed features share one scale.
seedDict = {
    int(teamId): float(re.sub(r"[^0-9]", "", seed))
    for teamId, seed in zip(dfSeeds2018['TeamID'].values,
                            dfSeeds2018['Seed'].values)
}

## Get Predictions ##

In [None]:
# Build one feature row per submission ID and collect the predicted
# probability of class 1 for each row.
teamIdPred = []
oppTeamIdPred = []
teamSeedPred = []
oppTeamSeedPred = []

for matchupId in ids:
    # The ID is split on '_'; fields 1 and 2 are the two team ids.
    idParts = matchupId.split('_')
    teamIdPred.append(float(idParts[1]))
    oppTeamIdPred.append(float(idParts[2]))
    # Seeds are looked up for the teams of THIS row, not for an index left
    # over from an earlier loop.
    teamSeedPred.append(seedDict[int(idParts[1])])
    oppTeamSeedPred.append(seedDict[int(idParts[2])])

predict = {
    'TeamID' : teamIdPred,
    'OppTeamID' : oppTeamIdPred,
    'TeamSeed': teamSeedPred,
    'OppTeamSeed' : oppTeamSeedPred
}

predictions = classifier.predict(input_fn=lambda:eval_input_fn(predict, labels=None, batch_size=20))
# With two classes, probabilities[1] is the probability of class 1. This is
# the same value as taking the winning class's probability and flipping it
# when the winning class is 0.
predictionList = [pred_dict['probabilities'][1] for pred_dict in predictions]

In [None]:
# Store the predicted probabilities in the submission frame's 'Pred' column
# (one value per row, in row order) and preview the result.
df_sample_sub['Pred'] = predictionList
df_sample_sub.head()

In [None]:
# Write the submission file without the index column.
submissionPath = 'logreg_seed_starter.csv'
df_sample_sub.to_csv(submissionPath, index=False)