## Starter: feature engineering and modeling

Hat-tip to the following notebooks for inspiration:
* https://www.kaggle.com/theoviel/ncaa-starter-the-simpler-the-better
* https://www.kaggle.com/svyatoslavsokolov/2021-ncaam-first-step
* https://www.kaggle.com/byfone/ncaa-m-2021-logistic-regression


### Dataframes

The following dataframes are built up in steps 1-4 below, before step 5 (train model and predict).

|#|dataframe|original CSV|years|description|
|-|---------|------------|-----|-----------|
|1|`df_detailed_results`|`MRegularSeasonDetailedResults.csv`|2003-2020|Team level box scores for all games between 2003 and 2020, used to populate #2 below|
|2|`df_teams`|none (manually created from #1)|2003-2020|Team level statistics by year, so we can understand team relative strength when comparing two teams in a game|
|3|`df_tourney_results`|`MNCAATourneyCompactResults.csv`|1985-2019|TRAIN data to train model and predict #5 `df_test`|
|4|`df_seeds`|`MNCAATourneySeeds.csv`|1985-2019|Merged onto df_tourney_results and df_test, so we can understand teams' relative seeds when comparing two teams in a game|
|5|`df_test`|`MSampleSubmissionStage1.csv`|2015-2019|Submission TEST data to predict (populate `Pred` column for upload)|


## Step 1 - data load and transform

Load `MRegularSeasonDetailedResults.csv` into `df_detailed_results` to populate team-level performance metrics in `df_teams`. Read the CSVs, tally per-game statistics into the season-level `win_loss` dictionary, then transform the `win_loss` dictionary into the `df_teams` dataframe.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Regular-season detailed results: one row per game, tallied into per-team
# season totals further down.
df_detailed_results = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MRegularSeasonDetailedResults.csv')
# Team reference table; team_id_to_team_name() looks up TeamName by TeamID in it.
teams = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MTeams.csv')

# Do not truncate columns when a dataframe is displayed.
pd.set_option('display.max_columns', None)

In [None]:
def team_id_to_team_name(team_id):
    """Look up a team's name in the module-level `teams` table.

    Returns the `TeamName` value of the first `teams` row whose `TeamID`
    equals `team_id`. Raises IndexError when no row matches.
    """
    names_for_id = teams.loc[teams['TeamID'] == team_id, 'TeamName']
    return names_for_id.iloc[0]


# Preview the first rows of the regular-season game results.
df_detailed_results.head()

In [None]:
def initialize_team_row(team_id):
    """Build a zeroed season-totals record for one team.

    The season is not a parameter: it is read from the module-level `season`
    variable, which the game-tallying loop assigns on every iteration before
    calling this function.

    Returns a dict holding the season, the team's id and name, and zero-valued
    counters for wins, losses, point differential, each box-score statistic,
    and the matching 'Opp'-prefixed statistic for the team's opponents.
    """
    box_stats = ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                 'OR', 'DR', 'AST', 'TO', 'STL', 'BLK')

    team_row = {
        'Season': season,
        'Wins': 0,
        'Losses': 0,
        'TeamName': team_id_to_team_name(team_id),
        'TeamID': team_id,
        'PointDifferential': 0,
    }
    # Own totals first, then the opponents' totals, so the key order stays
    # Season .. PointDifferential, FGM .. BLK, OppFGM .. OppBLK.
    team_row.update(dict.fromkeys(box_stats, 0))
    team_row.update(('Opp' + stat, 0) for stat in box_stats)
    return team_row

# (season-total key, box-score column suffix) pairs. The results file names its
# columns 'W<suffix>' for the winner and 'L<suffix>' for the loser; three of
# the suffixes are mixed-case there while the season totals are upper-case.
box_score_fields = [('FGM', 'FGM'), ('FGA', 'FGA'), ('FGM3', 'FGM3'), ('FGA3', 'FGA3'),
                    ('FTM', 'FTM'), ('FTA', 'FTA'), ('OR', 'OR'), ('DR', 'DR'),
                    ('AST', 'Ast'), ('TO', 'TO'), ('STL', 'Stl'), ('BLK', 'Blk')]

# win_loss[season][team_id] -> running season totals for that team
win_loss = {}

n_rows = len(df_detailed_results)
print("iterating through " + str(n_rows) + " rows")
for index, row in df_detailed_results.iterrows():
    if index % 10000 == 0:
        print("row " + str(index) + " of " + str(n_rows))

    # `season` has to stay a module-level name: initialize_team_row() reads it.
    season = row['Season']
    w_team_id = row['WTeamID']
    l_team_id = row['LTeamID']
    point_differential = row['WScore'] - row['LScore']

    # Create the season bucket and both teams' records on first sight.
    season_totals = win_loss.setdefault(season, {})
    for team_id in (w_team_id, l_team_id):
        if team_id not in season_totals:
            season_totals[team_id] = initialize_team_row(team_id)
    winner = season_totals[w_team_id]
    loser = season_totals[l_team_id]

    winner['Wins'] += 1
    winner['PointDifferential'] += point_differential
    loser['Losses'] += 1
    loser['PointDifferential'] -= point_differential

    # Each team's own numbers go to its plain totals and to the other
    # team's 'Opp' totals.
    for total_key, suffix in box_score_fields:
        winner_value = row['W' + suffix]
        loser_value = row['L' + suffix]
        winner[total_key] += winner_value
        winner['Opp' + total_key] += loser_value
        loser[total_key] += loser_value
        loser['Opp' + total_key] += winner_value

print("FINISHED")

In [None]:
# *** NEW DF: df_teams ***
# years 2003-2020
# One row per (season, team), flattened from the nested win_loss dictionary.
cols = ['Season', 'TeamID', 'TeamName', 'Wins', 'Losses', 'PointDifferential',
        'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'AST', 'TO', 'STL', 'BLK',
        'OppFGM', 'OppFGA', 'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR', 'OppAST', 'OppTO',
        'OppSTL', 'OppBLK']

# Gather every team-season record first and build the frame in one call.
# Growing a DataFrame row by row with DataFrame.append copies the whole frame
# on each call, and that method no longer exists in pandas 2.x.
team_rows = []
n_years = len(win_loss)
print("iterating through " + str(n_years) + " years of results")
for i, season_rows in enumerate(win_loss.values()):
    print("year " + str(i+1) + " of " + str(n_years))
    team_rows.extend(season_rows.values())

# `columns=cols` fixes the column order (and still yields an empty frame with
# these columns if win_loss is empty).
df_teams = pd.DataFrame(team_rows, columns=cols)

df_teams['Season'] = df_teams['Season'].astype(int)
df_teams['TeamID'] = df_teams['TeamID'].astype(int)
df_teams['TeamName'] = df_teams['TeamName'].astype(str)

print("FINISHED")

In [None]:
# Preview the first rows of the per-team, per-season totals.
df_teams.head()

## Step 2 - feature engineering

Add rate statistic dimensions to `df_teams`. For example:
* FG percentage and 3-point FG percentage
* Per game rates like assists/game, TO/game, and blocks/game
* Other ratios like assist-to-TO and assisted FG rate

In [None]:
# Games played per team-season; denominator for every per-game rate.
n_games = df_teams['Wins'] + df_teams['Losses']


def _per_game(total_col):
    """Season total in `total_col` divided by games played."""
    return df_teams[total_col] / n_games


def _ratio(numerator_col, denominator_col):
    """Element-wise quotient of two season-total columns."""
    return df_teams[numerator_col] / df_teams[denominator_col]


# All derived columns are computed from the raw totals only, so they can be
# evaluated up front and then attached in the listed order.
derived_columns = {
    'WinningPct': _per_game('Wins'),

    'FGPct': _ratio('FGM', 'FGA'),
    'FG3Pct': _ratio('FGM3', 'FGA3'),
    'FTPct': _ratio('FTM', 'FTA'),
    'ORPerGame': _per_game('OR'),
    'DRPerGame': _per_game('DR'),
    'ASTPerGame': _per_game('AST'),
    'TOPerGame': _per_game('TO'),
    'ASTtoTO': _ratio('AST', 'TO'),
    'STLPerGame': _per_game('STL'),
    'BLKPerGame': _per_game('BLK'),
    'FGAPerGame': _per_game('FGA'),
    'PointDifferentialPerGame': _per_game('PointDifferential'),
    'AssistedFGPct': _ratio('AST', 'FGM'),

    'OppFGPct': _ratio('OppFGM', 'OppFGA'),
    'OppFG3Pct': _ratio('OppFGM3', 'OppFGA3'),
    'OppFTPct': _ratio('OppFTM', 'OppFTA'),
    'OppORPerGame': _per_game('OppOR'),
    'OppDRPerGame': _per_game('OppDR'),
    'OppASTPerGame': _per_game('OppAST'),
    'OppTOPerGame': _per_game('OppTO'),
    'OppASTtoTO': _ratio('OppAST', 'OppTO'),
    'OppSTLPerGame': _per_game('OppSTL'),
    'OppBLKPerGame': _per_game('OppBLK'),
    'OppFGAPerGame': _per_game('OppFGA'),
    'OppAssistedFGPct': _ratio('OppAST', 'OppFGM'),
}

for column_name, values in derived_columns.items():
    df_teams[column_name] = values

In [None]:
# Raw season totals: record columns, own box-score totals, opponents' totals.
_box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'AST', 'TO', 'STL', 'BLK']
count_cols = (['Wins', 'Losses', 'PointDifferential']
              + _box_stats
              + ['Opp' + stat for stat in _box_stats])

# NOTE(review): 'OppASTtoTO' (a ratio, not a per-game rate) is listed here while
# 'OppORPerGame' is only added in rate_cols below -- the two look swapped.
# Left as-is because later uses of per_game_cols are not visible here; confirm.
per_game_cols = [
    'WinningPct', 'ORPerGame', 'DRPerGame', 'ASTPerGame', 'TOPerGame', 'STLPerGame', 'BLKPerGame',
    'FGAPerGame', 'PointDifferentialPerGame',
    'OppDRPerGame', 'OppASTPerGame', 'OppTOPerGame', 'OppASTtoTO', 'OppSTLPerGame', 'OppBLKPerGame',
    'OppFGAPerGame',
]

# Every derived column: the per-game list plus the remaining ratios.
rate_cols = [
    *per_game_cols,
    'FGPct', 'FG3Pct', 'FTPct', 'ASTtoTO', 'OppFGPct', 'OppFG3Pct', 'OppFTPct',
    'OppORPerGame', 'AssistedFGPct', 'OppAssistedFGPct',
]

numeric_cols = count_cols + rate_cols

# Coerce all count and rate columns to numeric dtypes in one assignment.
converted = df_teams[numeric_cols].apply(pd.to_numeric)
df_teams[numeric_cols] = converted

# Displayed as the cell result: the columns that were not converted.
set(df_teams.columns) - set(numeric_cols)

## Step 3 - prep TRAINING data (previous tourney results)

Load `MNCAATourneyCompactResults.csv` and `MNCAATourneySeeds.csv`, merge together so tournament results include seeding, clean seed fields, rename W and L team columns to match other dataframes (teams A and B).

In [None]:
# *** NEW DF: df_tourney_results ***
# *** NEW DF: df_seeds ***
# (years 1985-2019)

df_tourney_results = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MNCAATourneyCompactResults.csv')
df_seeds = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MNCAATourneySeeds.csv')

# Look up the seed of the winning team, then of the losing team, by
# (Season, TeamID); the helper TeamID column from df_seeds is dropped each time.
for team_col, seed_col in (('WTeamID', 'WSeed'), ('LTeamID', 'LSeed')):
    with_seed = df_tourney_results.merge(df_seeds,
                                         how='left',
                                         left_on=['Season', team_col],
                                         right_on=['Season', 'TeamID'])
    df_tourney_results = with_seed.drop('TeamID', axis=1).rename(columns={'Seed': seed_col})

# Remove every ASCII letter from the seed string and keep the number as an int.
for seed_col in ('WSeed', 'LSeed'):
    seed_digits = df_tourney_results[seed_col].str.replace(r'[A-Za-z]', '', regex=True)
    df_tourney_results[seed_col] = seed_digits.astype('int')

# Drop columns that are not used and switch from winner/loser (W/L) naming to
# the Team A / Team B naming used by the other dataframes.
winner_loser_to_a_b = {'WTeamID': 'ATeamID',
                       'LTeamID': 'BTeamID',
                       'WScore': 'AScore',
                       'LScore': 'BScore',
                       'WSeed': 'ASeed',
                       'LSeed': 'BSeed'}
df_tourney_results = (df_tourney_results
                      .drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
                      .rename(columns=winner_loser_to_a_b))

In [None]:
# TRAIN data: attach each side's regular-season profile from df_teams.
# (The df_test TEST data gets the same treatment in step 4.)
# For Team A and then Team B: left-join df_teams on (Season, <side>TeamID),
# prefix every df_teams column other than the join keys with the side letter,
# and drop the helper TeamID column brought in by the join.
for side in ('A', 'B'):
    prefixed_names = {col: side + col
                      for col in df_teams.columns
                      if col not in ('Season', 'TeamID')}
    merged = df_tourney_results.merge(df_teams,
                                      how='left',
                                      left_on=['Season', side + 'TeamID'],
                                      right_on=['Season', 'TeamID'])
    df_tourney_results = merged.rename(columns=prefixed_names).drop("TeamID", axis=1)

# Targets: Team A's score margin and whether Team A won.
score_margin = df_tourney_results['AScore'] - df_tourney_results['BScore']
df_tourney_results['_targetScoreDiff'] = score_margin
df_tourney_results['_targetWinA'] = (score_margin > 0).astype(int)

# Matchup features: Team A's value minus Team B's value.
for diff_col, stat in (('SeedDiff', 'Seed'),
                       ('PointDiff', 'PointDifferentialPerGame'),
                       ('WinPctDiff', 'WinningPct')):
    df_tourney_results[diff_col] = df_tourney_results['A' + stat] - df_tourney_results['B' + stat]

# df_teams only has data after 2003, so disregard tourney results before 2003
from_2003_on = df_tourney_results["Season"] >= 2003
df_tourney_results = df_tourney_results[from_2003_on]

df_tourney_results.shape

In [None]:
# Duplicate the df_tourney_results rows with Team A and Team B inverted.
# Up to this point Team A is always the winner, so the WIN target would be 1
# on every row; adding the mirrored copy gives the target both values.

print("The shape of `data_tourney_results` BEFORE is: " + str(df_tourney_results.shape))

# Every 'A<suffix>' column that has a 'B<suffix>' counterpart is a per-team
# column and must trade places with it. Deriving the pairs from the frame
# itself also covers ASeed/BSeed, which a hand-written list can easily miss.
_columns = set(df_tourney_results.columns)
_ab_suffixes = [col[1:] for col in df_tourney_results.columns
                if col.startswith('A') and 'B' + col[1:] in _columns]
_swap_sides = {}
for _suffix in _ab_suffixes:
    _swap_sides['A' + _suffix] = 'B' + _suffix
    _swap_sides['B' + _suffix] = 'A' + _suffix

_cp = df_tourney_results.copy(deep=True).rename(columns=_swap_sides)

# The A-minus-B columns were computed before the swap, so in the mirrored copy
# they still describe the un-swapped matchup. Recompute them (and the targets)
# from the swapped columns; otherwise both halves carry identical diff values
# with opposite outcomes.
_cp['_targetScoreDiff'] = _cp['AScore'] - _cp['BScore']
_cp['_targetWinA'] = (_cp['_targetScoreDiff'] > 0).astype(int)
_cp['SeedDiff'] = _cp['ASeed'] - _cp['BSeed']
_cp['PointDiff'] = _cp['APointDifferentialPerGame'] - _cp['BPointDifferentialPerGame']
_cp['WinPctDiff'] = _cp['AWinningPct'] - _cp['BWinningPct']

# `axis` is passed by keyword: pandas 2.x rejects it as a positional argument.
# concat aligns the two frames by column name, so the reordered columns of the
# mirrored copy line up with the originals.
df_tourney_results = pd.concat([df_tourney_results, _cp], axis=0, sort=False)
print("The shape of `data_tourney_results` AFTER is: " + str(df_tourney_results.shape))

In [None]:
# Recompute both targets over the doubled frame so the mirrored rows get their
# own margin (Team A score minus Team B score) and win flag.
_a_margin = df_tourney_results['AScore'] - df_tourney_results['BScore']
df_tourney_results['_targetScoreDiff'] = _a_margin
df_tourney_results['_targetWinA'] = (_a_margin > 0).astype(int)
df_tourney_results.shape

## Step 4 - prep TEST data (submission file with `Pred` column)

Load `MSampleSubmissionStage1.csv` file, split ID into `Season`/`ATeamID`/`BTeamID`, merge seed data, merge team performance data.

In [None]:
# *** NEW DF: test ***
# (years 2015-2019)

df_test = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MSampleSubmissionStage1.csv')

# Each ID is split on '_' into its three parts: Season, Team A id, Team B id.
id_parts = df_test['ID'].str.split('_', expand=True)
df_test[['Season', 'ATeamID', 'BTeamID']] = id_parts
df_test = df_test.drop(columns=['Pred'])

# The split yields strings; the merge keys below need integers.
for id_col in ('Season', 'ATeamID', 'BTeamID'):
    df_test[id_col] = df_test[id_col].astype(int)

# Look up the seed for Team A, then Team B, by (Season, TeamID).
for side in ('A', 'B'):
    with_seed = df_test.merge(df_seeds,
                              how='left',
                              left_on=['Season', side + 'TeamID'],
                              right_on=['Season', 'TeamID'])
    df_test = with_seed.rename(columns={'Seed': side + 'Seed'}).drop("TeamID", axis=1)

# Remove every ASCII letter from the seed string and keep the number as an int.
for seed_col in ('ASeed', 'BSeed'):
    seed_digits = df_test[seed_col].str.replace(r'[A-Za-z]', '', regex=True)
    df_test[seed_col] = seed_digits.astype('int')
df_test.head()

In [None]:
# TEST data: same treatment as the df_tourney_results TRAIN data in step 3.
# Left-join Team A's regular-season profile from df_teams on (Season, ATeamID),
# prefix every df_teams column other than the join keys with 'A', and drop the
# helper TeamID column brought in by the join.
a_column_names = {col: 'A' + col
                  for col in df_teams.columns
                  if col not in ('Season', 'TeamID')}
with_a_stats = df_test.merge(df_teams,
                             how='left',
                             left_on=['Season', 'ATeamID'],
                             right_on=['Season', 'TeamID'])
df_test = with_a_stats.rename(columns=a_column_names).drop("TeamID", axis=1)

df_test = df_test.merge(df_teams, how='left', left_on=['Season', 'BTeamID'], right_on=['Season', 'TeamID'])
df_test = df_test.rename(columns={'Wins': 'BWins',
                                  'Losses': 'BLosses',
                                  'PointDifferential': 'BPointDifferential',
                                  'WinningPct': 'BWinningPct',
                                  'TeamName': 'BTeamName',
                                  'FGM': 'BFGM',
                                  'FGA': 'BFGA',
                                  'FGM3': 'BFGM3',
                                  'FGA3': 'BFGA3',
                                  'FTM': 'BFTM',
                                  'FTA': 'BFTA',
                                  'OR': 'BOR',
                                  'DR': 'BDR',
                                  'AST': 'BAST',
                                  'TO': 'BTO',
                                  'STL': 'BSTL',
                                  'BLK': 'BBLK',
                                  'OppFGM': 'BOppFGM',
                                  'OppFGA': 'BOppFGA',
                                  'OppFGM3': 'BOppFGM3',
                                  'OppFGA3': 'BOppFGA3',
                                  'OppFTM': 'BOppFTM',
                                  'OppFTA': 'BOppFTA',
                                  'OppOR': 'BOppOR',
                                  'OppDR': 'BOppDR',
                                  'OppAST': 'BOppAST',
                                  'OppTO': 'BOppTO',
                                  'OppSTL': 'BOppSTL',
                                  'OppBLK': 'BOppBLK',
                                  'FGPct': 'BFGPct',
                                  'FGAPerGame': 'BFGAPerGame',
                                  'FG3Pct': 'BFG3Pct',
                                  'FTPct': 'BFTPct',
                                  'PointDifferentialPerGame': 'BPointDifferentialPerGame',
                                  'ORPerGame': 'BORPerGame',
                                  'DRPerGame': 'BDRPerGame',
                                  'ASTPerGame': 'BASTPerGame',
                                  'TOPerGame': 'BTOPerGame',
                                  'ASTtoTO': 'BASTtoTO',
                                  'AssistedFGPct': 'BAssistedFGPct',
                                  'STLPerGame': 'BSTLPerGame',
                                  'BLKPerGame': 'BBLKPerGame',
                                  'OppFGPct': 'BOppFGPct',
                                  'OppFG3Pct': 'BOppFG3Pct',
                                  'OppFTPct': 'BOppFTPct',
                                  'OppORPerGame': 'BOppORPerGame',
                                  'OppDRPerGame': 'BOppDRPerGame',
                                  'OppASTPerGame': 'BOppASTPerGame',
                                  'OppTOPerGame': 'BOppTOPerGame',
                                  'OppASTtoTO': 'BOppASTtoTO',
                                  'OppSTLPerGame': 'BOppSTLPerGame',
                                  'OppBLKPerGame': 'BOppBLKPerGame',
                                  'OppFGAPerGame': 'BOppFGAPerGame',
                                  'OppAssistedFGPct': 'BOppAssistedFGPct'})
df_test = df_test.drop("TeamID", axis=1)
df_test['_targetPred'] = 0
# df_test.sample(20)[['Season','ATeamName', 'BTeamName','ASeed','BSeed','Pred']]
df_test.sample(10)

In [None]:
# Matchup features: team A's value minus team B's value.
for _diff_col, _a_col, _b_col in (
    ('SeedDiff', 'ASeed', 'BSeed'),
    ('PointDiff', 'APointDifferentialPerGame', 'BPointDifferentialPerGame'),
    ('WinPctDiff', 'AWinningPct', 'BWinningPct'),
):
    df_test[_diff_col] = df_test[_a_col] - df_test[_b_col]

# Zero-filled target placeholders.
# NOTE(review): presumably these mirror the target columns of the training
# frame so both frames share a schema - confirm.
for _placeholder in ('_targetScoreDiff', '_targetWinA'):
    df_test[_placeholder] = 0

In [None]:
# Columns fed to the models; kfold_reg() reads this module-level list and
# passes it on to rescale().
#
# Other candidate columns, currently disabled:
#   ASeed, BSeed, AWinningPct, BWinningPct, AFGPct, BFGPct,
#   AFG3Pct, BFG3Pct, AOppASTtoTO, BOppASTtoTO, PointDiff
features = [
    'APointDifferentialPerGame',
    'BPointDifferentialPerGame',
    'WinPctDiff',
    'SeedDiff',
]

In [None]:
# Notebook cell output: display the tournament-results frame that is later
# passed to kfold_reg() as the labelled data.
df_tourney_results

## Step 5 - Logistic Regression model to predict `Pred`

Adapted from:
* https://www.kaggle.com/theoviel/ncaa-starter-the-simpler-the-better
* https://scikit-learn.org/stable/auto_examples/calibration/plot_compare_calibration.html

Rescale the features to the training range, then validate on each season from 2013 through 2019 in turn (training on all remaining seasons), and predict class (team A wins or loses).

**WORK IN PROGRESS**: try Logistic Regression, Random Forest, Gaussian Naive Bayes, Linear SVC.

In [None]:
def rescale(features, df_train, df_val, df_test):
    """Min-max scale ``features`` in all three frames using training statistics.

    The per-column minimum and maximum are taken from ``df_train`` only and
    then applied to ``df_train``, ``df_val`` and ``df_test``, so validation
    and test values can fall outside ``[0, 1]``.

    The frames are modified in place (pass copies if the originals must stay
    untouched) and are also returned for convenience.

    Args:
        features: list of column names to scale; other columns are untouched.
        df_train: frame the scaling statistics are computed from.
        df_val: validation frame, scaled with the training statistics.
        df_test: test frame, scaled with the training statistics.

    Returns:
        The tuple ``(df_train, df_val, df_test)`` (the same objects passed in).
    """
    min_ = df_train[features].min()
    range_ = df_train[features].max() - min_

    # A column that is constant in the training frame has zero range; dividing
    # by it would produce NaN/inf and break model fitting.  Use 1 instead, so
    # such a column maps to 0 for every training row.
    range_ = range_.mask(range_ == 0, 1)

    for frame in (df_train, df_val, df_test):
        frame[features] = (frame[features] - min_) / range_

    return df_train, df_val, df_test

def _positive_class_scores(model, X):
    """Return one score in [0, 1] per row of ``X`` from a fitted ``model``.

    Uses ``predict_proba`` (column 1) when the model offers it.  Otherwise the
    ``decision_function`` output is min-max scaled over the rows of ``X``.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]

    decision = model.decision_function(X)
    low = decision.min()
    spread = decision.max() - low
    if spread == 0:
        # All rows received the same margin; 0/0 would yield NaN, so fall
        # back to an uninformative 0.5 for every row.
        return np.full(decision.shape, 0.5)
    # NOTE(review): min-max scaling is not a calibrated probability and always
    # produces an exact 0 and an exact 1, which log loss penalises heavily -
    # consider a sigmoid or CalibratedClassifierCV instead.
    return (decision - low) / spread


def kfold_reg(df, df_test_, plot=False, val_seasons=range(2013, 2020)):
    """Validate several classifiers season by season and predict the test set.

    For every season in ``val_seasons`` the rows of that season are held out,
    the remaining rows of ``df`` form the training set, the module-level
    ``features`` columns are rescaled with ``rescale`` and each model is
    fitted, scored with log loss on the held-out season, and used to predict
    ``df_test_``.

    Args:
        df: labelled games; must contain ``Season``, the ``features`` columns
            and the ``_targetWinA`` target column.
        df_test_: rows to predict.  A copy is rescaled in every fold, so the
            frame passed in is not modified.
        plot: accepted for backward compatibility; currently unused.
        val_seasons: iterable of seasons to hold out one at a time.  Defaults
            to 2013 through 2019.  Seasons without any rows in ``df`` are
            skipped.

    Returns:
        A list with one prediction array over ``df_test_`` per fitted model,
        in (season, model) loop order.
    """
    target = '_targetWinA'
    cvs = []
    pred_tests = []

    for season in val_seasons:
        print(f'\nValidating on season {season}')

        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        if df_val.empty:
            # Nothing to score on; fitting and predicting would fail.
            print('\t -> No games for this season, skipped')
            continue

        df_train = df[df['Season'] != season].reset_index(drop=True).copy()
        df_test = df_test_.copy()

        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)

        # Fresh estimators for every fold.  RandomForestClassifier is not part
        # of the model list yet (work in progress).
        models = [LogisticRegression(), GaussianNB(), LinearSVC(C=10)]

        for model in models:
            model.fit(df_train[features], df_train[target])

            pred = _positive_class_scores(model, df_val[features])
            pred_test = _positive_class_scores(model, df_test[features])
            pred_tests.append(pred_test)

            loss = log_loss(df_val[target].values, pred)
            cvs.append(loss)
            print(f'\t -> Scored {loss:.3f}')

    print(f'\n Local CV is {np.mean(cvs):.3f}')

    return pred_tests

In [None]:
# Run the season-by-season validation and collect the test-set predictions
# from every fitted model.
pred_tests = kfold_reg(df=df_tourney_results, df_test_=df_test, plot=True)

In [None]:
# Notebook cell output: inspect the list of test-set prediction arrays
# returned by kfold_reg().
pred_tests

In [None]:
# Average the collected prediction arrays element-wise (along axis 0), giving
# one blended prediction per test row.
pred_test = np.mean(pred_tests, axis=0)
pred_test

In [None]:
# Submission frame: the ID column of df_test plus the blended predictions.
sub = df_test[['ID']].assign(Pred=pred_test)
sub.to_csv('submission.csv', index=False)

# Histogram of the blended predictions; the returned Axes is discarded so the
# notebook does not echo it.
_ = sns.histplot(pred_test)

In [None]:
# Notebook cell output: preview the first rows of the submission frame.
sub.head()