In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
pd.options.display.max_rows = 200

%matplotlib inline

In [2]:
files = glob.glob('../mens-march-mania-2022/MDataFiles_Stage1/*')
[print(i,'-->',f) for i,f in enumerate(files)];

0 --> ../mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyDetailedResults.csv
1 --> ../mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyCompactResults.csv
2 --> ../mens-march-mania-2022/MDataFiles_Stage1/MSeasons.csv
3 --> ../mens-march-mania-2022/MDataFiles_Stage1/fivethirtyeight_ncaa_forecasts_2018.csv
4 --> ../mens-march-mania-2022/MDataFiles_Stage1/fivethirtyeight_ncaa_forecasts_2019.csv
5 --> ../mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv
6 --> ../mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneySlots.csv
7 --> ../mens-march-mania-2022/MDataFiles_Stage1/MGameCities.csv
8 --> ../mens-march-mania-2022/MDataFiles_Stage1/538ratingsMen.csv
9 --> ../mens-march-mania-2022/MDataFiles_Stage1/MConferenceTourneyGames.csv
10 --> ../mens-march-mania-2022/MDataFiles_Stage1/Cities.csv
11 --> ../mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonCompactResults.csv
12 --> ../mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneySeedRoundSlots.csv
13 --> ../m

# Season Stats

In [3]:
def loc_func(x):
    if x['WLoc'] == 'N':
        return 'N'
    elif x['WLoc'] == 'H' and x['result'] == 1:
        return 'H'
    elif x['WLoc'] == 'A' and x['result'] == 1:
        return 'A'
    elif x['WLoc'] == 'H' and x['result'] == 0:
        return 'A'
    elif x['WLoc'] == 'A' and x['result'] == 0:
        return 'H'
    else:
        return None

In [4]:
SeasonResults = pd.read_csv(files[11])
SeasonResults['GameID'] = np.arange(1,len(SeasonResults)+1)
SeasonResultsWin = SeasonResults[['Season','DayNum','GameID','WTeamID','WScore','LTeamID','LScore','WLoc']].copy()
SeasonResultsWin.columns = ['Season','DayNum','GameID','TeamID','Score','TeamID_OPP','Score_OPP','WLoc']
SeasonResultsWin['result'] = 1
SeasonResultsLose = SeasonResults[['Season','DayNum','GameID','WTeamID','WScore','LTeamID','LScore','WLoc']].copy()
SeasonResultsLose.columns = ['Season','DayNum','GameID','TeamID_OPP','Score_OPP','TeamID','Score','WLoc']
SeasonResultsLose['result'] = 0
SeasonResultsTeams = pd.concat([SeasonResultsWin,SeasonResultsLose],axis=0,sort=True).sort_values('GameID')
SeasonResultsTeams['Loc'] = SeasonResultsTeams.apply(loc_func,axis=1)

seasonAve = (SeasonResultsTeams
             .groupby(['Season','TeamID'])[['Score','Score_OPP','result']]
             .mean()
             .rename({'result':'WinPCT'},axis=1)
             .reset_index()
            )
seasonAve['ScoreDiff'] = seasonAve['Score'] - seasonAve['Score_OPP']

seasonAve.tail(10)

Unnamed: 0,Season,TeamID,Score,Score_OPP,WinPCT,ScoreDiff
12289,2022,1463,72.1,71.5,0.55,0.6
12290,2022,1464,70.681818,69.545455,0.545455,1.136364
12291,2022,1465,70.4,71.15,0.45,-0.75
12292,2022,1466,62.894737,69.789474,0.263158,-6.894737
12293,2022,1467,57.681818,65.227273,0.363636,-7.545455
12294,2022,1468,66.6,70.9,0.45,-4.3
12295,2022,1469,69.526316,77.526316,0.368421,-8.0
12296,2022,1470,63.428571,66.47619,0.380952,-3.047619
12297,2022,1471,67.1,71.1,0.4,-4.0
12298,2022,1472,72.736842,78.105263,0.263158,-5.368421


# Rankings

In [5]:
#Data from 2003-2021 NMasseyOrdinals.csv

ranking = pd.read_csv(files[18])

#rank_methods = ['COL','DOL','MOR','POM','RTH','SAG','WLK','WOL']

rank_methods = ['POM']
team_rank = (ranking[(ranking['RankingDayNum']==133)&(ranking['SystemName'].isin(rank_methods))]
             .groupby(['Season','TeamID','SystemName'])['OrdinalRank']
             .mean()
             .unstack(2)
             .reset_index()
            )
team_rank.tail(10)

SystemName,Season,TeamID,POM
6167,2021,1461,161.0
6168,2021,1462,61.0
6169,2021,1464,227.0
6170,2021,1465,238.0
6171,2021,1466,275.0
6172,2021,1467,270.0
6173,2021,1468,180.0
6174,2021,1469,323.0
6175,2021,1470,260.0
6176,2021,1471,236.0


# Tournament Seeds

In [6]:
TourneySeeds = pd.read_csv(files[21])
TourneySeeds['SeedN'] = TourneySeeds['Seed'].apply(lambda x: int(x[1:3]))
TourneySeeds.drop('Seed',axis=1,inplace=True)
TourneySeeds.tail(10)

Unnamed: 0,Season,TeamID,SeedN
2344,2021,1196,7
2345,2021,1314,8
2346,2021,1458,9
2347,2021,1439,10
2348,2021,1429,11
2349,2021,1457,12
2350,2021,1317,13
2351,2021,1159,14
2352,2021,1331,15
2353,2021,1216,16


# Team Data
Merge all the teams information (per season) into one file

In [7]:
teamData = (seasonAve
            .merge(team_rank,on=['Season','TeamID'],how='left')
            .merge(TourneySeeds,on=['Season','TeamID'],how='left')
           )

teamData

Unnamed: 0,Season,TeamID,Score,Score_OPP,WinPCT,ScoreDiff,POM,SeedN
0,1985,1102,63.083333,68.875000,0.208333,-5.791667,,
1,1985,1103,61.043478,64.086957,0.391304,-3.043478,,
2,1985,1104,68.500000,60.700000,0.700000,7.800000,,7.0
3,1985,1106,71.625000,75.416667,0.416667,-3.791667,,
4,1985,1108,83.000000,75.040000,0.760000,7.960000,,
...,...,...,...,...,...,...,...,...
12294,2022,1468,66.600000,70.900000,0.450000,-4.300000,,
12295,2022,1469,69.526316,77.526316,0.368421,-8.000000,,
12296,2022,1470,63.428571,66.476190,0.380952,-3.047619,,
12297,2022,1471,67.100000,71.100000,0.400000,-4.000000,,


In [8]:
teamData.dropna(subset=['POM'])

Unnamed: 0,Season,TeamID,Score,Score_OPP,WinPCT,ScoreDiff,POM,SeedN
5407,2003,1102,57.250000,57.000000,0.428571,0.250000,160.0,
5408,2003,1103,78.777778,78.148148,0.481481,0.629630,163.0,
5409,2003,1104,69.285714,65.000000,0.607143,4.285714,33.0,10.0
5410,2003,1105,71.769231,76.653846,0.269231,-4.884615,307.0,
5411,2003,1106,63.607143,63.750000,0.464286,-0.142857,263.0,
...,...,...,...,...,...,...,...,...
11936,2021,1467,66.277778,66.666667,0.500000,-0.388889,270.0,
11937,2021,1468,72.555556,68.277778,0.611111,4.277778,180.0,
11938,2021,1469,67.631579,78.210526,0.315789,-10.578947,323.0,
11939,2021,1470,63.866667,68.133333,0.333333,-4.266667,260.0,


# MNCAATourneyCompactResults.csv - Tournament Data - '85-'21
Merge the teams data into the tournement data file.
TeamID1 - always the team with the lower TeamID
TeamID2 - always the team with the higher TeamID
result - for TeamID1

In [9]:
TourneyCompactResults = pd.read_csv(files[1])
TourneyCompactResults['TeamID1'] = np.minimum(TourneyCompactResults['WTeamID'],TourneyCompactResults['LTeamID'])
TourneyCompactResults['TeamID2'] = np.maximum(TourneyCompactResults['WTeamID'],TourneyCompactResults['LTeamID'])
TourneyCompactResults['result'] = np.where(TourneyCompactResults['WTeamID']==TourneyCompactResults['TeamID1'],1,0)
TourneyCompactResults['ID'] = TourneyCompactResults['Season'].astype(str)+ '_' +TourneyCompactResults['TeamID1'].astype(str)+ '_' +TourneyCompactResults['TeamID2'].astype(str)
    
TourneyCompactResults = (TourneyCompactResults
                         .merge(teamData,left_on=['Season','TeamID1'],right_on=['Season','TeamID'],how='left')
                         .drop('TeamID',axis=1)
                         .merge(teamData,left_on=['Season','TeamID2'],right_on=['Season','TeamID'],how='left')
                         .drop('TeamID',axis=1)
                        )

TourneyCompactResults['SeedDiff'] = TourneyCompactResults['SeedN_x'] - TourneyCompactResults['SeedN_y']
TourneyCompactResults['ScoreDiff'] = TourneyCompactResults['ScoreDiff_x']  - TourneyCompactResults['ScoreDiff_y']
TourneyCompactResults['POMDiff'] = TourneyCompactResults['POM_x']  - TourneyCompactResults['POM_y']
TourneyCompactResults['WinPCTDiff'] = TourneyCompactResults['WinPCT_x']  - TourneyCompactResults['WinPCT_y']

TourneyCompactResults.tail().T

Unnamed: 0,2312,2313,2314,2315,2316
Season,2021,2021,2021,2021,2021
DayNum,148,148,152,152,154
WTeamID,1211,1417,1124,1211,1124
WScore,85,51,78,93,86
LTeamID,1425,1276,1222,1417,1211
LScore,66,49,59,90,70
WLoc,N,N,N,N,N
NumOT,0,0,0,1,0
TeamID1,1211,1276,1124,1211,1124
TeamID2,1425,1417,1222,1417,1211


# TRAIN

In [10]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.inspection import permutation_importance
from sklearn.ensemble import StackingClassifier

In [11]:
cols = ['Score_x',
 'Score_OPP_x',
 'ScoreDiff_x',
 'POM_x',
 'SeedN_x',
 'Score_y',
 'Score_OPP_y',
 'ScoreDiff_y',
 'POM_y',
 'SeedN_y',
 'SeedDiff',
 'ScoreDiff',
 'POMDiff',
 'WinPCT_x',
 'WinPCT_y',
 'WinPCTDiff']

In [12]:
X = TourneyCompactResults.loc[:,cols]
y = TourneyCompactResults[['result']].values.ravel()

min_year = 2002

# Train Models
Method 1
use all the previous seasons to predict a particular season

In [13]:
rf = RandomForestClassifier(n_estimators=100,random_state=42,min_samples_split=100)
lr = LogisticRegression(solver='lbfgs',C=0.1,random_state=42,max_iter=1000)
lsvc = SVC(random_state=42,probability=True,kernel='linear')
nnb = KNeighborsClassifier(50)

estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('lr',LogisticRegression(solver='lbfgs',C=0.1,random_state=42,max_iter=500)),
    ('nnb',KNeighborsClassifier(50)),
    ('lsvc',SVC(random_state=42,probability=True,kernel='linear'))]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

scores = np.zeros((5,5))
for ii,s in enumerate(range(2015,2020)):
    idxTrain = ((TourneyCompactResults['Season'] < s) & (TourneyCompactResults['Season'] > min_year))
    idxTest = (TourneyCompactResults['Season'] == s)
    # fit all models
    rf.fit(X.loc[idxTrain],y[idxTrain])
    lr.fit(X.loc[idxTrain],y[idxTrain])
    lsvc.fit(X.loc[idxTrain],y[idxTrain])
    nnb.fit(X.loc[idxTrain],y[idxTrain])
    clf.fit(X.loc[idxTrain],y[idxTrain])
    
    ypred_rf = rf.predict_proba(X.loc[idxTest])
    ypred_lr = lr.predict_proba(X.loc[idxTest])
    ypred_lsvc = lsvc.predict_proba(X.loc[idxTest])
    ypred_nnb = nnb.predict_proba(X.loc[idxTest])
    ypred_clf = clf.predict_proba(X.loc[idxTest])
    
    scores[ii,0] = log_loss(y[idxTest],ypred_rf[:,1])
    scores[ii,1] = log_loss(y[idxTest],ypred_lr[:,1])
    scores[ii,2] = log_loss(y[idxTest],ypred_lsvc[:,1])
    scores[ii,3] = log_loss(y[idxTest],ypred_nnb[:,1])
    scores[ii,4] = log_loss(y[idxTest],ypred_clf[:,1])

In [14]:
scores_df = pd.DataFrame(scores,
                         index = range(2015,2020),
                         columns=['random forest','logistic regression','linear svc','nearest neighbor','stacking']).T

scores_df['average score'] = scores_df.mean(axis=1)
scores_df['score std'] = scores_df.std(axis=1)

scores_df

Unnamed: 0,2015,2016,2017,2018,2019,average score,score std
random forest,0.514897,0.560852,0.527885,0.602276,0.496948,0.540571,0.037261
logistic regression,0.509913,0.563506,0.521742,0.611931,0.511367,0.543692,0.039286
linear svc,0.4939,0.588295,0.534624,0.606044,0.54642,0.553857,0.039823
nearest neighbor,0.557053,0.536348,0.533607,1.07483,0.49575,0.639517,0.218553
stacking,0.520554,0.560272,0.529783,0.592155,0.523941,0.545341,0.027309


# Create groups by seasons

In [15]:
idxTrain = ((TourneyCompactResults['Season'] < 2021) & (TourneyCompactResults['Season'] > min_year))
group_kfold = GroupKFold(n_splits=6)

scores = []
for model in [rf,lr,lsvc,nnb,clf]:
    scores.append(cross_val_score(model,
                            X[idxTrain],
                            y[idxTrain],
                            groups=TourneyCompactResults.loc[idxTrain,'Season'],
                            cv=group_kfold,
                            scoring='neg_log_loss'))

In [16]:
pd.DataFrame([-1*np.mean(np.array(scores),axis=1),np.std(np.array(scores),axis=1)],
            columns=['random forest','logistic regression','linear svc','nearest neighbor','stacking'],
            index = ['average score','score std']).T

Unnamed: 0,average score,score std
random forest,0.545789,0.020195
logistic regression,0.547776,0.020777
linear svc,0.550296,0.025467
nearest neighbor,0.597653,0.094532
stacking,0.553119,0.019956


In [17]:
idxTrain = ((TourneyCompactResults['Season'] < 2021) & (TourneyCompactResults['Season'] > min_year))
lr.fit(X.loc[idxTrain],y[idxTrain]);
rf.fit(X.loc[idxTrain],y[idxTrain]);

In [22]:
testDF = pd.read_csv('../mens-march-mania-2022/MDataFiles_Stage1/MSampleSubmissionStage1.csv').drop('Pred',axis=1)
testDF = pd.concat([testDF,testDF['ID'].str.split('_',expand=True).rename({0:'Season',1:'TeamID1',2:'TeamID2'},axis=1)],axis=1)
testDF['Season'] = testDF['Season'].astype('int64')
testDF['TeamID1'] = testDF['TeamID1'].astype('int64')
testDF['TeamID2'] = testDF['TeamID2'].astype('int64')
testDF.tail(10)

Unnamed: 0,ID,Season,TeamID1,TeamID2
11380,2021_1439_1452,2021,1439,1452
11381,2021_1439_1455,2021,1439,1455
11382,2021_1439_1457,2021,1439,1457
11383,2021_1439_1458,2021,1439,1458
11384,2021_1452_1455,2021,1452,1455
11385,2021_1452_1457,2021,1452,1457
11386,2021_1452_1458,2021,1452,1458
11387,2021_1455_1457,2021,1455,1457
11388,2021_1455_1458,2021,1455,1458
11389,2021_1457_1458,2021,1457,1458


In [21]:
subFile = (testDF
             .merge(teamData,left_on=['Season','TeamID1'],right_on=['Season','TeamID'],how='left')
             .drop('TeamID',axis=1)
             .merge(teamData,left_on=['Season','TeamID2'],right_on=['Season','TeamID'],how='left')
             .drop('TeamID',axis=1)
          )

subFile['SeedDiff'] = subFile['SeedN_x'] - subFile['SeedN_y']
subFile['ScoreDiff'] = subFile['ScoreDiff_x']  - subFile['ScoreDiff_y']
subFile['POMDiff'] = subFile['POM_x']  - subFile['POM_y']
subFile['WinPCTDiff'] = subFile['WinPCT_x']  - subFile['WinPCT_y']

subFile['Pred1'] = lr.predict_proba(subFile.loc[:,cols])[:,1]
subFile['Pred2'] = rf.predict_proba(subFile.loc[:,cols])[:,1]

subFile['Pred'] = 0.5*(subFile['Pred1']+subFile['Pred2'])
subFile = subFile[['ID','Pred']]


#Pushes a csv file creating a predictability score metric by ID (ID= TeamID1 & TeamID from merge)
#subFile.to_csv('results_2.csv',index=False)