In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Lift pandas' display truncation so every column and row of a frame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# March Madness 2022 Men's

Our objective will be to try and predict the probability of a team winning in the Tournament phase of the NCAA.

First, we list all the files we have access to.

In [None]:
# Directory holding the competition CSV files; later cells build their paths from it.
filepath = '../input/mens-march-mania-2022/MDataFiles_Stage2/'

# Print the name of every entry in that directory, one per line.
for entry_name in os.listdir(filepath):
    print(entry_name)
    

## Regular Season Results

We start with the regular-season results. The detailed results are used on the assumption that these box-score statistics have a bearing on the tournament outcome. The focus is on how each team scored its points: how many of those came from free throws and three-pointers, and what was each team's win percentage?

In [None]:
# Read the per-game detailed box scores of the regular season.
season_results_path = os.path.join(filepath, 'MRegularSeasonDetailedResults.csv')
df_season = pd.read_csv(season_results_path)
df_season.head()

In [None]:
# (rows, columns) of the raw regular-season game table.
df_season.shape

With the regular-season results, we group the games by team and by season, combining each team's stats from the games it won and the games it lost, and take the mean.

In [None]:
# Box-score statistics recorded for both sides of every game. In the raw file
# they are prefixed with 'W' (winning side) or 'L' (losing side).
# 'WLoc' is deliberately NOT selected: it is a string column, so it cannot be
# averaged (pandas >= 2.0 raises on it) and it has no slot in `cols`.
stat_names = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
w_cols = ['Season', 'WTeamID'] + ['W' + stat for stat in stat_names]
l_cols = ['Season', 'LTeamID'] + ['L' + stat for stat in stat_names]
cols = ['Season', 'TeamID'] + stat_names

# One row per game from the winner's side and one from the loser's side,
# both renamed to the same neutral column names so they can be stacked.
w_stats = df_season[w_cols].copy()
w_stats.columns = cols

l_stats = df_season[l_cols].copy()
l_stats.columns = cols

# Average over ALL games a team played in a season. Averaging the win-mean and
# the loss-mean instead would give a single loss the same weight as thirty wins.
df_reg_season = (
    pd.concat([w_stats, l_stats], ignore_index=True)
    .groupby(['Season', 'TeamID'])
    .mean()
    .reset_index()
)

# String key "<Season><TeamID>" used to join these stats onto match-ups later.
df_reg_season['id'] = df_reg_season.Season.astype(str) + df_reg_season.TeamID.astype(str)
df_reg_season.head()

In [None]:
# (rows, columns) of the per-team, per-season averages.
df_reg_season.shape

We also compute the success percentage for field goals, three-pointers and free throws. Adding these to the regular-season table gives us additional features.

In [None]:
# New ratio column -> (made column, attempted column) it is derived from.
ratio_features = {
    'FG%': ('FGM', 'FGA'),
    'FG%3': ('FGM3', 'FGA3'),
    'FTM%': ('FTM', 'FTA'),
}

# Each ratio is made / attempted, computed on the season averages.
for ratio_name, (made_col, attempted_col) in ratio_features.items():
    df_reg_season[ratio_name] = df_reg_season[made_col] / df_reg_season[attempted_col]

In [None]:
# Preview the first rows of the regular-season feature table.
df_reg_season.head()

## Massey Ordinals Ratings

The second data set to consider is the rankings per team. We aggregate them so that there is one row of rankings for each team per year, holding the mean ranking as well as the max and min ranking for that year.

In [None]:
# Read the Massey ordinals file from the competition data directory.
ranks_path = os.path.join(filepath, "MMasseyOrdinals_thruDay128.csv")
df_ranks = pd.read_csv(ranks_path)
df_ranks.head()

In [None]:
# (rows, columns) of the raw rankings table.
df_ranks.shape

In [None]:
# Collapse all ranking rows of a (Season, TeamID) pair into three summary
# columns: OrdinalRank_mean, OrdinalRank_min and OrdinalRank_max.
ranks_agg = (
    df_ranks
    .groupby(['Season', 'TeamID'])['OrdinalRank']
    .agg(['mean', 'min', 'max'])
    .add_prefix('OrdinalRank_')
)

# Turn the group keys back into columns and add the "<Season><TeamID>" join key.
df_team_ranks = ranks_agg.reset_index()
df_team_ranks['id'] = df_team_ranks['Season'].astype(str) + df_team_ranks['TeamID'].astype(str)
df_team_ranks.head()

In [None]:
# (rows, columns) of the per-team, per-season ranking summary.
df_team_ranks.shape

# Combine Regular Season and Team Rankings

Making our Master Dataset to train our model

In [None]:
def is_winning(wteam, lteam):
    """Tell whether the winner of a game is the lower-numbered team.

    Parameters
    ----------
    wteam
        ID of the winning team.
    lteam
        ID of the losing team.

    Returns
    -------
    int
        1 when ``wteam`` is strictly smaller than ``lteam``, otherwise 0
        (equal IDs also give 0).
    """
    return 1 if wteam < lteam else 0
    
    
march_madness = pd.read_csv(filepath + 'MNCAATourneyCompactResults.csv')

# Work on a copy so the raw results frame is not modified through an alias.
df_train = march_madness[['Season', 'WTeamID', 'LTeamID']].copy()

# Label: 1 when the lower-numbered team won the game, 0 otherwise.
df_train['is_win'] = [
    is_winning(wteam, lteam)
    for wteam, lteam in zip(df_train['WTeamID'], df_train['LTeamID'])
]

# Orient every game the same way as the label: team_a is the LOWER TeamID and
# team_b the higher one. Putting the winner in team_a (as before) made the
# features describe "winner vs loser" while the label described "lower ID won",
# and did not match the prediction cell, where team_a is the first team of the
# submission ID (presumably the lower TeamID, per competition rules - confirm).
season_str = df_train['Season'].astype(str)
team_ids = df_train[['WTeamID', 'LTeamID']]
df_train['team_a'] = season_str + team_ids.min(axis=1).astype(str)
df_train['team_b'] = season_str + team_ids.max(axis=1).astype(str)
df_train = df_train[['is_win', 'team_a', 'team_b']]

# Feature tables keyed only by 'id'; Season/TeamID are dropped up front so the
# merges below never produce clashing or leftover key columns.
rank_features = df_team_ranks.drop(columns=['Season', 'TeamID'])
season_features = df_reg_season.drop(columns=['Season', 'TeamID'])

# Attach each feature table twice: once for team_a, once for team_b. The first
# merge adds plain column names; the second one suffixes both copies.
# Inner joins drop games for which a team has no rankings or box scores.
for features in (rank_features, season_features):
    df_train = (
        df_train
        .merge(features, left_on='team_a', right_on='id', validate='many_to_one')
        .merge(features, left_on='team_b', right_on='id',
               suffixes=('_teama', '_teamb'), validate='many_to_one')
        .drop(columns=['id_teama', 'id_teamb'])
    )

# Only the label and the numeric features remain.
df_train = df_train.drop(columns=['team_a', 'team_b'])
df_train.head()

In [None]:
# (rows, columns) of the assembled training table.
df_train.shape

# Training the model

We will use a Random Forest to predict which team will win, and also to predict the probability of that team winning.

In [None]:
# Separate the label column from the feature columns.
y = df_train['is_win']
X = df_train.drop(columns='is_win')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a random forest with a fixed seed so repeated runs give the same model.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows.
y_pred = model.predict(X_test)

# Share of held-out rows whose label was predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

Now to predict the probability of a team winning

In [None]:
def return_value(data, data_type=None):
    """Build a "<first field><team field>" key from an underscore-separated ID.

    Parameters
    ----------
    data : str
        Identifier made of underscore-separated fields; the caller passes the
        submission ``ID`` column here.
    data_type : str, optional
        ``'Team A'`` selects the second field; any other value (including the
        default ``None``) selects the third field.

    Returns
    -------
    str
        The first field concatenated with the selected field.
    """
    fields = data.split('_')
    team_position = 1 if data_type == 'Team A' else 2
    return str(fields[0]) + str(fields[team_position])


df_test = pd.read_csv(filepath + 'MSampleSubmissionStage2.csv')

# Join keys for the two teams named in each submission ID.
df_test['team_a'] = df_test['ID'].map(lambda match_id: return_value(match_id, 'Team A'))
df_test['team_b'] = df_test['ID'].map(return_value)

# Feature tables keyed only by 'id'. Dropping Season/TeamID before merging
# avoids re-creating 'Season_teama', 'TeamID_teama', ... a second time when the
# regular-season stats are attached; pandas >= 2.0 refuses such duplicate
# column names with a MergeError.
rank_features = df_team_ranks.drop(columns=['Season', 'TeamID'])
season_features = df_reg_season.drop(columns=['Season', 'TeamID'])

# Attach each feature table for team_a and then for team_b. Left joins keep
# every submission row in its original order: a team without rankings or box
# scores shows up as NaN features instead of silently vanishing from the file.
for features in (rank_features, season_features):
    df_test = (
        df_test
        .merge(features, how='left', left_on='team_a', right_on='id',
               validate='many_to_one')
        .merge(features, how='left', left_on='team_b', right_on='id',
               suffixes=('_teama', '_teamb'), validate='many_to_one')
        .drop(columns=['id_teama', 'id_teamb'])
    )

# Leaves ID, Pred and the feature columns, in the same order as the training table.
df_test = df_test.drop(columns=['team_a', 'team_b'])

df_test.head()

In [None]:
# (rows, columns) of the assembled prediction table.
df_test.shape

In [None]:
# Everything except the identifier and the placeholder prediction is a feature.
X = df_test.drop(columns=['ID', 'Pred'])

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (label 1 when the model was trained on 0/1 labels).
win_probability = model.predict_proba(X)[:, 1]
df_test['Pred'] = win_probability

# Keep only the two columns that go into the submission.
df_test = df_test.loc[:, ['ID', 'Pred']]
df_test.head()

In [None]:
# Write the (ID, Pred) frame to the submission file, leaving out the row index.
df_test.to_csv('submission.csv', index=False)