In [1]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd

from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Preparation

### Seeds

In [2]:
# Read the tournament seeds table and preview its first rows.
seeds_csv = "../data/2022_Stage1/MNCAATourneySeeds.csv"
df_seeds = pd.read_csv(seeds_csv)
df_seeds.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/2022_Stage1/MNCAATourneySeeds.csv'

### Regular Season Results

In [None]:
# Load the regular-season game results. The path uses the same "../data" prefix
# as the seeds file; the bare "data/..." form is the one that raised
# FileNotFoundError in the seeds cell.
# WLoc and NumOT are not used by any later cell, so they are dropped right away
# (without inplace, so re-running this cell always starts from the raw file).
df_season_results = (
    pd.read_csv("../data/2022_Stage1/MRegularSeasonCompactResults.csv")
    .drop(columns=['WLoc', 'NumOT'])
)

In [None]:
# Margin of each regular-season game: winner's score minus loser's score.
df_season_results['ScoreDiff'] = df_season_results['WScore'].sub(df_season_results['LScore'])

In [None]:
df_season_results

### Tournament Results

In [None]:
# Load the tournament game results. The path uses the same "../data" prefix as
# the seeds file; the bare "data/..." form is the one that raised
# FileNotFoundError in the seeds cell.
df_tourney_results = (
    pd.read_csv("../data/2022_Stage1/MNCAATourneyCompactResults.csv")
    .drop(columns=['NumOT', 'WLoc'])
)
#Remove results before 2016 since FiveThirtyEight ratings don't start until 2016
df_tourney_results = df_tourney_results[df_tourney_results['Season'] >= 2016].reset_index(drop=True)

### FiveThirtyEight Ratings

In [None]:
# Load the FiveThirtyEight ratings. The path uses the same "../data" prefix as
# the seeds file; the bare "data/..." form is the one that raised
# FileNotFoundError in the seeds cell.
# TeamName is dropped because later joins use (Season, TeamID) only.
df_538 = pd.read_csv("../data/538ratingsMen.csv").drop(columns='TeamName')
df_538

## Game Result Engineering

For each team and each season, we first compute:

<ul>
<li>Number of wins</li>
<li>Number of losses</li>
<li>Average margin of victory</li>
<li>Average margin of losses</li>
</ul>

These are then combined into the following features:
<ul>
<li>Win percentage</li>
<li>Average margin of victory/loss</li>
</ul>

In [None]:
# Wins per (Season, team): count the DayNum entries of the games each team won.
win_count = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "WinCount", "WTeamID": "TeamID"})
)
win_count

In [None]:
# Losses per (Season, team): count the DayNum entries of the games each team lost.
loss_count = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "LossCount", "LTeamID": "TeamID"})
)
loss_count

In [None]:
# Mean ScoreDiff over the games each team won, per season.
win_margin = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['ScoreDiff']
    .mean()
    .reset_index()
    .rename(columns={"ScoreDiff": "AverageWinMargin", "WTeamID": "TeamID"})
)
win_margin

In [None]:
# Mean ScoreDiff over the games each team lost, per season.
loss_margin = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['ScoreDiff']
    .mean()
    .reset_index()
    .rename(columns={"ScoreDiff": "AverageLossMargin", "LTeamID": "TeamID"})
)
loss_margin

In [None]:
# Distinct (Season, TeamID) pairs of teams that won at least one game.
df_features_season_wins = (
    df_season_results
    .groupby(['Season', 'WTeamID'])
    .size()
    .reset_index()[['Season', 'WTeamID']]
    .rename(columns={"WTeamID": "TeamID"})
)
df_features_season_wins

In [None]:
# Distinct (Season, TeamID) pairs of teams that lost at least one game.
df_features_season_losses = (
    df_season_results
    .groupby(['Season', 'LTeamID'])
    .size()
    .reset_index()[['Season', 'LTeamID']]
    .rename(columns={"LTeamID": "TeamID"})
)
df_features_season_losses

In [None]:
# Union of the winner and loser key frames: one row per (Season, TeamID) that played.
season_team_keys = pd.concat([df_features_season_wins, df_features_season_losses], axis=0)
df_features_season = (
    season_team_keys
    .drop_duplicates()
    .sort_values(['Season', 'TeamID'])
    .reset_index(drop=True)
)

In [None]:
#Join all of the dataframes into one dataframe
for team_stats in (win_count, loss_count, win_margin, loss_margin):
    df_features_season = df_features_season.merge(team_stats, on=['Season', 'TeamID'], how='left')

# Teams missing from one of the aggregates (no wins or no losses) get 0 instead of NaN.
df_features_season = df_features_season.fillna(0)
df_features_season

In [None]:
#Calculate win percentage from win and loss count
games_played = df_features_season['WinCount'] + df_features_season['LossCount']
df_features_season['WinPercentage'] = df_features_season['WinCount'] / games_played

In [None]:
#Calculate average margin of victory/defeat
# Total points won by minus total points lost by, divided by the number of games played.
total_win_margin = df_features_season['WinCount'] * df_features_season['AverageWinMargin']
total_loss_margin = df_features_season['LossCount'] * df_features_season['AverageLossMargin']
total_games = df_features_season['WinCount'] + df_features_season['LossCount']
df_features_season['GapAvg'] = (total_win_margin - total_loss_margin) / total_games

In [None]:
#Drop values we dont need after calculating win percentage and average margin of victory/defeat
intermediate_columns = ['WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin']
df_features_season = df_features_season.drop(columns=intermediate_columns)

## Feature Engineering

### Training Data

In [None]:
df = df_tourney_results.copy()
df

In [None]:
#Add SeedW column
df = (
    df
    .merge(df_seeds, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedW'})
)


In [None]:
#Add SeedL column
df = (
    df
    .merge(df_seeds, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedL'})
)


In [None]:
#Remove region and play in tournament marker from seed (Convert seed into ints)
def seed_string_to_int(seed):
    """Return the numeric part of a seed string as an int.

    Every character other than the ASCII digits 0-9 is discarded and the
    remaining digits are parsed together as one integer.

    Raises
    ------
    ValueError
        If ``seed`` contains no digits.
    TypeError
        If ``seed`` is not a string (for example NaN from an unmatched join).
    """
    digits = re.findall("[0-9]", seed)
    return int("".join(digits))

In [None]:
# Convert both seed columns from seed strings to plain integers.
for seed_col in ('SeedW', 'SeedL'):
    df[seed_col] = df[seed_col].map(seed_string_to_int)

In [None]:
df

In [None]:
#Add WinPercentageW and GapAvgW columns
# Every season-feature column that is present gets a "W" suffix.
winner_feature_names = {
    name: name + 'W'
    for name in ('WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin', 'WinPercentage', 'GapAvg')
}
df = (
    df
    .merge(df_features_season, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns=winner_feature_names)
    .drop(columns='TeamID')
)

In [None]:
#Add WinPercentageL and GapAvgL columns
# Every season-feature column that is present gets an "L" suffix.
loser_feature_names = {
    name: name + 'L'
    for name in ('WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin', 'WinPercentage', 'GapAvg')
}
df = (
    df
    .merge(df_features_season, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns=loser_feature_names)
    .drop(columns='TeamID')
)

In [None]:
df

In [None]:
#Add 538ratingW column
df = (
    df
    .merge(df_538, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'538rating': '538ratingW'})
)

In [None]:
#Add 538ratingL column
df = (
    df
    .merge(df_538, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'538rating': '538ratingL'})
)

In [None]:
df

In [None]:
def concact_win_loss_df(df):
    """Stack each game twice: once with the winner as team A, once as team B.

    In the first copy the winner's columns become team A and the loser's
    become team B; the second copy swaps the roles. ``WTeamID``/``WScore`` and
    ``LTeamID``/``LScore`` are renamed to ``TeamId*``/``Score*``, and any column
    whose name ends in ``W`` or ``L`` has that last letter replaced by the
    matching team letter.

    Returns
    -------
    pandas.DataFrame
        The two relabelled copies concatenated row-wise. The original index
        is kept, so every index label appears twice.
    """
    def relabel(winner_tag, loser_tag):
        # Explicit id/score renames first, then the suffix-based ones.
        mapping = {
            "WTeamID": "TeamId" + winner_tag,
            "WScore": "Score" + winner_tag,
            "LTeamID": "TeamId" + loser_tag,
            "LScore": "Score" + loser_tag,
        }
        for name in df.columns:
            if name.endswith('W'):
                mapping[name] = name[:-1] + winner_tag
        for name in df.columns:
            if name.endswith('L'):
                mapping[name] = name[:-1] + loser_tag
        return df.rename(columns=mapping)

    winner_as_a = relabel("A", "B")
    winner_as_b = relabel("B", "A")
    return pd.concat([winner_as_a, winner_as_b], axis=0, sort=False)

In [None]:
df = concact_win_loss_df(df)
df

#### Feature Differences

In [None]:
# Targets: signed score margin from team A's view, and a 0/1 flag for a team A win.
df['ScoreDiff'] = df['ScoreA'].sub(df['ScoreB'])
df['WinA'] = df['ScoreDiff'].gt(0).astype(int)

In [None]:
diff_cols = ['Seed', 'WinPercentage', 'GapAvg', '538rating']
#Compute difference between team A and B for each feature
for feature_name in diff_cols:
    df[f'{feature_name}Diff'] = df[f'{feature_name}A'].sub(df[f'{feature_name}B'])

In [None]:
df

### Test Data

In [None]:
# Load the sample submission, which lists the matchups to predict. The path
# uses the same "../data" prefix as the seeds file; the bare "data/..." form
# is the one that raised FileNotFoundError in the seeds cell.
df_test = pd.read_csv("../data/2022_Stage1/MSampleSubmissionStage1.csv")
df_test

In [None]:
# Each ID is "<season>_<team A id>_<team B id>"; split once and reuse the pieces.
id_parts = [game_id.split('_') for game_id in df_test['ID']]
df_test['Season'] = [int(parts[0]) for parts in id_parts]
df_test['TeamIdA'] = [int(parts[1]) for parts in id_parts]
df_test['TeamIdB'] = [int(parts[2]) for parts in id_parts]

In [None]:
df_test

In [None]:
#Add SeedA column
df_test = (
    df_test
    .merge(df_seeds, how='left', left_on=['Season', 'TeamIdA'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedA'})
)
df_test

In [None]:
#Add SeedB column
df_test = (
    df_test
    .merge(df_seeds, how='left', left_on=['Season', 'TeamIdB'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedB'})
)
df_test

In [None]:
#Convert seed strings into ints
for seed_col in ('SeedA', 'SeedB'):
    df_test[seed_col] = df_test[seed_col].map(seed_string_to_int)

In [None]:
#Add WinPercentageA and GapAvgA columns
# Every season-feature column that is present gets an "A" suffix.
team_a_feature_names = {
    name: name + 'A'
    for name in ('WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin', 'WinPercentage', 'GapAvg')
}
df_test = (
    df_test
    .merge(df_features_season, how='left', left_on=['Season', 'TeamIdA'], right_on=['Season', 'TeamID'])
    .rename(columns=team_a_feature_names)
    .drop(columns='TeamID')
)
df_test

In [None]:
#Add WinPercentageB and GapAvgB columns
# Every season-feature column that is present gets a "B" suffix.
team_b_feature_names = {
    name: name + 'B'
    for name in ('WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin', 'WinPercentage', 'GapAvg')
}
df_test = (
    df_test
    .merge(df_features_season, how='left', left_on=['Season', 'TeamIdB'], right_on=['Season', 'TeamID'])
    .rename(columns=team_b_feature_names)
    .drop(columns='TeamID')
)
df_test

In [None]:
#Add 538ratingA column
df_test = (
    df_test
    .merge(df_538, how='left', left_on=['Season', 'TeamIdA'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'538rating': '538ratingA'})
)
df_test

In [None]:
#Add 538ratingB column
df_test = (
    df_test
    .merge(df_538, how='left', left_on=['Season', 'TeamIdB'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'538rating': '538ratingB'})
)
df_test

In [None]:
#Compute difference between team A and B for each feature
for feature_name in diff_cols:
    df_test[f'{feature_name}Diff'] = df_test[f'{feature_name}A'].sub(df_test[f'{feature_name}B'])

In [None]:
df_test

## Modeling

In [None]:
features = ['SeedDiff', '538ratingDiff', 'WinPercentageDiff', 'GapAvgDiff']

In [None]:
#Rescale train, test, and val data w/ features
def rescale(features, df_train, df_val, df_test=None):
    """Min-max scale the feature columns using statistics from the training frame.

    The per-feature minimum and range are computed on ``df_train`` only and
    then applied to every frame, so validation/test values may fall outside
    [0, 1]. The frames passed in are modified in place.

    Parameters
    ----------
    features : list of str
        Names of the columns to rescale.
    df_train : pandas.DataFrame
        Training frame; supplies the minimum and maximum of each feature.
    df_val : pandas.DataFrame
        Validation frame, scaled with the training statistics.
    df_test : pandas.DataFrame, optional
        Test frame, scaled with the training statistics when given.

    Returns
    -------
    tuple
        ``(df_test, df_val, df_train)`` in that order; ``df_test`` is ``None``
        when no test frame was passed.
    """
    min_value = df_train[features].min()
    value_range = df_train[features].max() - min_value
    # A feature that is constant in the training frame has a zero range.
    # Dividing by 1 instead maps it to 0 rather than producing NaN/inf.
    value_range = value_range.replace(0, 1)

    for frame in (df_train, df_val, df_test):
        if frame is not None:
            frame[features] = (frame[features] - min_value) / value_range

    return df_test, df_val, df_train

In [None]:
#Cross-validates by season: each season is scored by a model trained on all earlier seasons
def _scale_to_unit_interval(values):
    """Min-max scale an array of regression outputs onto [0, 1]."""
    return (values - values.min()) / (values.max() - values.min())


def kfold_validation(df, df_test=None, verbose=0, mode="reg"):
    """Validate a linear model season by season and collect test predictions.

    Despite the name this is not a shuffled k-fold: for every season except the
    earliest, a model is fit on all rows from earlier seasons and scored with
    log loss on that season's rows. The feature columns come from the
    module-level ``features`` list and are rescaled per fold with ``rescale``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must contain ``Season``, ``WinA``, ``ScoreDiff`` and the
        columns listed in ``features``.
    df_test : pandas.DataFrame, optional
        Rows to predict with each fold's model. It is never modified; each fold
        rescales a fresh copy.
    verbose : int
        Any truthy value prints the per-season and average log loss.
    mode : {"reg", "cls"}
        ``"reg"`` fits ``LinearRegression`` on ``ScoreDiff`` and min-max scales
        its outputs onto [0, 1] (so the extreme predictions become exactly 0
        and 1); ``"cls"`` fits ``LogisticRegression`` on ``WinA`` and uses the
        predicted probability of class 1.

    Returns
    -------
    list
        One array of test predictions per fold; empty when ``df_test`` is None.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"reg"`` nor ``"cls"``.
    """
    if mode not in ("reg", "cls"):
        raise ValueError(f'mode must be "reg" or "cls", got {mode!r}')

    # Sorted so that dropping the first entry always skips the earliest season,
    # which has no earlier data to train on.
    seasons = np.sort(df['Season'].unique())
    log_loss_scores = []
    pred_win_probabilities = []
    target_y = "ScoreDiff" if mode == "reg" else "WinA"

    for season in seasons[1:]:
        #Training data uses every earlier season
        df_train = df[df['Season'] < season].reset_index(drop=True).copy()

        #Validation data uses the current season
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()

        # Start from the caller's unscaled test frame in every fold. Re-binding
        # df_test here would feed already-rescaled features into the next fold.
        fold_test = None if df_test is None else df_test.copy()

        fold_test, df_val, df_train = rescale(features, df_train, df_val, fold_test)

        model = LinearRegression() if mode == "reg" else LogisticRegression()
        model.fit(df_train[features], df_train[target_y])

        if mode == "reg":
            pred = _scale_to_unit_interval(model.predict(df_val[features]))
        else:
            pred = model.predict_proba(df_val[features])[:, 1]

        if fold_test is not None:
            if mode == "reg":
                pred_win_probability = _scale_to_unit_interval(model.predict(fold_test[features]))
            else:
                pred_win_probability = model.predict_proba(fold_test[features])[:, 1]

            pred_win_probabilities.append(pred_win_probability)

        log_loss_score = log_loss(df_val['WinA'].values, pred)
        log_loss_scores.append(log_loss_score)

        #Log loss score for the current season, prints if verbose value truthy
        if verbose:
            print(f'Log loss score for season {season}: {round(log_loss_score, 3)}')

    #Prints summary results if verbose value truthy (skipped when no fold was run)
    if verbose and log_loss_scores:
        print(f'Average log loss score: {np.mean(log_loss_scores):.3f}')

    return pred_win_probabilities

In [None]:
kfold_validation(df, df_test, verbose=1, mode="reg")

In [None]:
kfold_validation(df, df_test, verbose=1, mode="cls")

In [None]:
# Average the per-fold test predictions element-wise into one prediction per matchup.
pred_tests = kfold_validation(df, df_test, mode="cls", verbose=0)
pred_test = np.mean(pred_tests, axis=0)

In [None]:
# Keep the identifying columns and overwrite Pred with the averaged predictions.
submission = (
    df_test[['ID', 'Season', 'Pred', 'TeamIdA', 'TeamIdB', 'SeedA', 'SeedB']]
    .assign(Pred=pred_test)
)

In [None]:
# Load the team table with the same "../data" prefix as the seeds file; the
# bare "data/..." form is the one that raised FileNotFoundError in the seeds cell.
df_teams = pd.read_csv("../data/2022_Stage1/MTeams.csv")

# Join only the id and name. Any other column in the teams file would be
# brought in by both merges and come out duplicated with _x/_y suffixes.
team_names = df_teams[['TeamID', 'TeamName']]

# Left joins keep every submission row even if a team id has no match in the
# teams table (an inner join would silently drop those predictions).
submission = (
    submission
    .merge(team_names, how='left', left_on="TeamIdA", right_on="TeamID")
    .drop(columns='TeamID')
    .rename(columns={"TeamName": "TeamA"})
)
submission = (
    submission
    .merge(team_names, how='left', left_on="TeamIdB", right_on="TeamID")
    .drop(columns='TeamID')
    .rename(columns={"TeamName": "TeamB"})
)

In [None]:
# The first character of each seed string is used as the team's region.
# It is derived into a separate frame instead of overwriting df_seeds['Seed'],
# so the seed cells earlier in the notebook still work when re-run.
df_regions = df_seeds.assign(Region=df_seeds['Seed'].str[0])[['Season', 'TeamID', 'Region']]

# Left joins keep every submission row even if a team has no seed entry.
submission = (
    submission
    .merge(df_regions, how='left', left_on=["TeamIdA", "Season"], right_on=["TeamID", "Season"])
    .drop(columns='TeamID')
    .rename(columns={"Region": "RegionA"})
)
submission = (
    submission
    .merge(df_regions, how='left', left_on=["TeamIdB", "Season"], right_on=["TeamID", "Season"])
    .drop(columns='TeamID')
    .rename(columns={"Region": "RegionB"})
)
submission

In [None]:
final_submission = submission[['ID', 'Pred']].copy()

In [None]:
final_submission

In [None]:
# Write only the ID and Pred columns. index=False keeps the DataFrame's row
# index from being written as an extra unnamed first column.
final_submission.to_csv("../submission/FirstModel_submission.csv", index=False)