In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Thinking About Different Options**
Here we will explore some potentially useful features for our model.

Some ideas:
1. Is it possible to get individual player stats for each season?
    - cumulative stats prior to the start of each game would be ideal because it would be independent of the team id
    - possible pitfall is that we must predict the entire tournament prior to its start
    - this would be less history dependent (less overfitting to the past?)
    - this does not currently seem possible (outside of top 100 players per category)
    - maybe can be used as a feature (i.e. # of players in top 100 for each category)
2. Some useful outside data can be found here: https://www.teamrankings.com/ncb/rankings/
    - try building a random forest model using all the different kinds of ratings, only
    - need to download data to notebook
3. Develop new features for each team for each season leading up to the tournament
    - win percentage
    - field goal percentage
    - 3-point percentage
    - free throw percentage
    - offensive rebounds
    - defensive rebounds
    - assists
    - turnovers (assist/turnover ratio seems relevant)
    - steals
    - blocks
    - personal fouls
4. Do the same thing as 3. but use the team rank in each of these categories instead
    - i.e. rank 1 Eastern Ky. vs rank 9 Alabama in 3-pt FG attempts
    - data source https://www.ncaa.com/stats/basketball-men/d1/current/team/625
5. Develop features based on $n$ previous games
    - this is the idea that recent performance is most important in determining who wins
    - for each game, features are computed for both teams based on previous $n$ games (need to handle season overlap)
    - this might be bad for a tournament since you won't know how a team plays throughout the tournament, only the last games of the season are available
    - game data is available in `MRegularSeasonDetailedResults.csv`
    
Current Plan:
- Create a dataframe that has the features described below of each team indexed by (team, season)
- For the year to be predicted, we will weight the results of the previous $n$ years by exponential decay factor $\lambda$. I.e. for season Y
    $$\hat{F}_{Y} = (1 - \lambda) F_{Y} + (1 - \lambda)\lambda F_{Y - 1} + (1 - \lambda)\lambda^2 F_{Y - 2} + \cdots = \sum_{t = 0}^{n - 1} (1 - \lambda) \lambda^t F_{Y - t}$$
- Build a random forest (or xgboost) model and cross-validate to select hyperparameters
- List of features (per team per season):
    - "AverageRank": average ordinal ranking
    - "PPG": points per game
    - "W-L%": Win percentage
    - "SOS": strength of schedule
    - "FTr": free throw attempt rate
    - "3PAr": 3-point attempt rate
    - "TRB%": Total rebound percentage
    - "AST%": Total assist percentage
    - "BLK%": Total block percentage
    - "eFG%": Effective field goal percentage
    - "TOV%": Turnover percentage
    - "FT/FGA": Free throw per field goal attempt

Data Source: https://www.sports-reference.com/cbb/seasons/ or https://www.kaggle.com/thomashopkins32/mncaa-additional-data-2022

In [None]:
# Input locations: the competition's Stage 1 files and the additional
# per-season team statistics dataset.
STAGE_1_DIR = '/kaggle/input/mens-march-mania-2022/MDataFiles_Stage1/'
SEASON_DIR = '/kaggle/input/mncaa-additional-data-2022/'
# Load the team table and preview its first rows.
teams_df = pd.read_csv(STAGE_1_DIR + 'MTeams.csv')
teams_df.head()

Let's get into cleaning the data! Note: there was a weird encoding error when reading `MTeamSpellings.csv`. Adding `encoding='cp1252'` seems to fix it.

## Processing Data

In [None]:
import glob


def compute_average_rank(teamID, season, rankings_df=None):
    '''
    Mean ordinal rank of one team over every ranking row of one season.

    Parameters
    ----------
    teamID : int
        Team identifier matched against the `TeamID` column.
    season : int
        Season matched against the `Season` column.
    rankings_df : pd.DataFrame, optional
        Table with `TeamID`, `Season` and `OrdinalRank` columns. Read from
        `MMasseyOrdinals.csv` in `STAGE_1_DIR` when omitted.

    Returns
    -------
    float
        Mean of `OrdinalRank` over the matching rows (NaN when none match).
    '''
    if rankings_df is None:
        rankings_df = pd.read_csv(STAGE_1_DIR + 'MMasseyOrdinals.csv')
    same_team = rankings_df.TeamID == teamID
    same_season = rankings_df.Season == season
    return rankings_df.loc[same_team & same_season, 'OrdinalRank'].mean()


def clean_season_data(dataframe, season, teamIDs=None, rankings=None):
    '''
    Cleans one season of team statistics and attaches team ID, season and rank.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw season table. Must contain a `School` column and every column
        named in the drop list below.
    season : str or int
        Season the table belongs to (converted with `int`).
    teamIDs : pd.DataFrame, optional
        Table with `TeamNameSpelling` and `TeamID` columns. Read from
        `MTeamSpellings.csv` when omitted.
    rankings : pd.DataFrame, optional
        Table with `Season`, `TeamID` and `OrdinalRank` columns. Read once
        from `MMasseyOrdinals.csv` when omitted.

    Returns
    -------
    pd.DataFrame
        New frame (the input is left untouched) without the dropped columns,
        with `School` lower-cased and stripped of 'ncaa', plus three columns:
        `ID` (team ID, NaN when the school has no known ID), `Season`, and
        `Rank` (mean `OrdinalRank` of the team in that season, NaN when the
        team has no ranking rows).
    '''
    print(f'Processing data for {season} season')
    season = int(season)

    # drop columns that we do not need
    dataframe = dataframe.drop(['Rk', 'G', 'W', 'L', 'SRS', 'Unnamed: 8', 'W.1',
                                'L.1', 'Unnamed: 11', 'W.2', 'L.2', 'Unnamed: 14',
                                'W.3', 'L.3', 'Unnamed: 17', 'Tm.', 'Opp.',
                                'Unnamed: 20', 'Pace', 'ORtg', 'TS%', 'STL%',
                                'ORB%'], axis=1)

    # normalise school names so they can be matched against the spellings table
    dataframe['School'] = (dataframe['School'].str.lower()
                           .str.replace('ncaa', '', regex=False)
                           .str.strip())

    # Build the spelling -> ID lookup once instead of lower-casing and scanning
    # the whole spellings table for every school. setdefault keeps the first
    # occurrence, which is the row the original `.iloc[0]` selected.
    if teamIDs is None:
        teamIDs = pd.read_csv(STAGE_1_DIR + 'MTeamSpellings.csv', encoding='cp1252')
    id_by_spelling = {}
    for spelling, team_id in zip(teamIDs.TeamNameSpelling.str.lower(), teamIDs.TeamID):
        if isinstance(spelling, str):
            id_by_spelling.setdefault(spelling, team_id)

    # Schools missing from the spellings table, resolved by hand.
    manual_ids = {
        'purdue-fort wayne': 1236,
        'cal state long beach': np.nan,
        'kansas city': 1282,
        'st. thomas (mn)': 1472,
    }

    # Average the season's ordinal ranks per team in a single pass; loading the
    # ordinals here also avoids re-reading the CSV once per team.
    if rankings is None:
        rankings = pd.read_csv(STAGE_1_DIR + 'MMasseyOrdinals.csv')
    season_rankings = rankings[rankings.Season == season]
    rank_by_team = season_rankings.groupby('TeamID').OrdinalRank.mean().to_dict()

    dfIDs = []
    ranks = []
    for sch in dataframe.School:
        if sch in id_by_spelling:
            schid = int(id_by_spelling[sch])
        elif sch in manual_ids:
            schid = manual_ids[sch]
        else:
            # NOTE(review): interactive fallback kept from the original; it
            # blocks any non-interactive run when a new unknown school appears.
            schid = int(input(sch))
        dfIDs.append(schid)
        ranks.append(rank_by_team.get(schid, np.nan))

    dataframe['ID'] = dfIDs
    dataframe['Season'] = season
    dataframe['Rank'] = ranks

    return dataframe


def process_season_data(datadir):
    '''
    Reads every season CSV in `datadir`, cleans each one, and stacks them.

    The season is derived from the file name: the two characters directly
    before the `.csv` extension are taken as the last two digits of a
    20xx year.

    Parameters
    ----------
    datadir : str
        Directory prefix (including the trailing separator) to search for
        `*.csv` files.

    Returns
    -------
    pd.DataFrame
        Concatenation of the cleaned per-season frames, in sorted file-name
        order.
    '''
    # Load the lookup tables once and share them across all seasons.
    teamIDs = pd.read_csv(STAGE_1_DIR + 'MTeamSpellings.csv', encoding='cp1252')
    rankings_df = pd.read_csv(STAGE_1_DIR + 'MMasseyOrdinals.csv')
    dataframes = []
    # glob returns files in arbitrary order; sort so the result is reproducible.
    for path in sorted(glob.glob(datadir + '*.csv')):
        # Slice the extension off. str.rstrip('.csv') strips any trailing run of
        # the characters '.', 'c', 's', 'v', which only works by accident when
        # the stem ends in digits.
        stem = path[:-len('.csv')]
        season = '20' + stem[-2:]
        df = pd.read_csv(path, header=1)
        dataframes.append(clean_season_data(df, season, teamIDs=teamIDs,
                                            rankings=rankings_df))
    return pd.concat(dataframes)

In the additional data I gathered for my model, some of the names are not present in the `MTeamSpellings.csv` so we have to manually search for them.

Here is a list of ones that I found so far:

|Team Name | ID|
|------------|---------------|
|purdue-fort wayne | 1236|
|cal state long beach | NA|
|kansas city | 1282|
| st. thomas (mn) | 1472 |

In [None]:
# Build the per-team, per-season statistics table from the CSV files in
# SEASON_DIR, then preview its first rows.
team_season_stats = process_season_data(SEASON_DIR)
team_season_stats.head()

Now let's see some statistics about all of the teams across all of the seasons. Some interesting ones are the features with high standard deviation (`std`). Assists and strength of schedule (SOS) seem to have the greatest deviation. The standard deviation among free throw and 3-point attempt rate is low indicating that all of the teams are similar in the number of attempts for these categories.

In [None]:
# Summary statistics (count, mean, std, quartiles) pooled over all teams and seasons.
team_season_stats.describe()

With that out of the way, we can get to building our model. Every time we encounter a (season, team, team) trio in the training data, we will query the `team_season_stats` data frame to get each team's statistics over the past $n$ seasons. The features for a team in a particular season will be a weighted sum of the previous $n$ seasons with a weight factor that decays exponentially. If you are familiar with reinforcement learning, this is similar to the idea of $n$-step TD returns.

We will then concatenate the two feature vectors and use them as input for our model.

There are some interesting considerations to make about the training/validation split:
- Should we only train using the Regular Season games and validate on Tournament games?
- Should we use a random set split derived from Regular Season, Tournament, and Conference games?
    - For stage 1, we cannot use Tournament games from 2016-2021 to train since this is what we will submit as predictions
    - For stage 2, this option looks good
- Is there data leakage in our feature space?
    - If we only use data from previous seasons, I don't think this will be an issue.
    
Current Plan:
- Train the model on all regular season and tournament games
- Concatenate season stats (maybe try difference?)
- If a game is in regular season, feature vector is created only from previous seasons
- If a game is a tournament game, feature vector includes the regular season that occurred right before it in the same year
- We don't have team stats from before 2003, so we are restricted to training on seasons after 2003.

In [None]:
def get_features(stats_df, team_id, season, n=4, discount=0.5):
    '''
    Computes the exponentially weighted feature vector of a team over the
    n seasons preceding `season`.

    Each season in the window [season - n, season - 1] is weighted by
    (1 - discount) * discount ** lag, where lag = (season - 1) - Season, so
    the most recent season has lag 0. The weight depends on the actual lag,
    not on the row's position, so a missing season does not shift older
    seasons to heavier weights. Weights are not renormalised when seasons
    are missing.

    Parameters
    ----------
    stats_df : pd.DataFrame
        Table with `ID`, `Season` and `School` columns plus one column per
        feature.
    team_id : int
        The unique identifier for the team (compared against `ID`).
    season : int
        Current season; only strictly earlier seasons are used.
    n : int
        Number of past seasons in the window.
    discount : float
        Exponential decay factor applied per season of lag.

    Returns
    -------
    features : pd.Series
        Weighted sum of each feature column (every column except `School`,
        `ID` and `Season`, in sorted column-name order). All zeros when the
        team has no rows in the window.
    '''
    in_window = ((stats_df.ID == team_id) &
                 (stats_df.Season >= season - n) &
                 (stats_df.Season < season))
    history = stats_df[in_window].sort_values('Season', ascending=False)
    feature_columns = history.columns.difference(['School', 'ID', 'Season'])
    # One weight per row, derived from how many seasons back that row is.
    lags = (season - 1) - history['Season'].to_numpy()
    weights = (1 - discount) * np.power(float(discount), lags)
    return history[feature_columns].multiply(weights, axis='rows').sum(axis=0)

## Stage 1 Data Preparation

In [None]:
def create_lower_id_won_feature(full_data):
    '''
    Adds the binary target column `LowerIDWon` to `full_data` in place.

    The value is 1 when the winning team's ID is smaller than the losing
    team's ID and 0 otherwise.

    Parameters
    ----------
    full_data : pd.DataFrame
        Game table with `WTeamID` and `LTeamID` columns; modified in place.
    '''
    lower_id_won = full_data.WTeamID < full_data.LTeamID
    # Assign by position (plain array) so duplicate index labels, e.g. from a
    # pd.concat of two game tables, cannot cause an alignment problem.
    full_data['LowerIDWon'] = lower_id_won.to_numpy().astype('int64')

RNG = 32
tournament_games = pd.read_csv(STAGE_1_DIR + 'MNCAATourneyCompactResults.csv')
# Take an explicit copy of the filtered rows so the Season shift below writes
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
tournament_games = tournament_games[tournament_games.Season > 2003].copy()
# Shift tournament seasons forward by one: get_features only looks at seasons
# strictly before the one it is given, so this makes a tournament game's
# features include the statistics of the season it belongs to.
tournament_games['Season'] = tournament_games.Season + 1

season_games = pd.read_csv(STAGE_1_DIR + 'MRegularSeasonCompactResults.csv')
season_games = season_games[season_games.Season > 2003]

# Hold out 2016 onwards; the "+ 1" on the tournament side accounts for the
# season shift applied above.
full_train_tournament = tournament_games[tournament_games.Season < 2016 + 1]
full_test_tournament = tournament_games[tournament_games.Season >= 2016 + 1]
full_train_season = season_games[season_games.Season < 2016]
full_test_season = season_games[season_games.Season >= 2016]
full_train = pd.concat([full_train_season, full_train_tournament])
full_test = pd.concat([full_test_season, full_test_tournament])
create_lower_id_won_feature(full_train)
create_lower_id_won_feature(full_test)

# Shuffle both splits reproducibly.
full_train_shuffled = full_train.sample(frac=1, random_state=RNG)
full_test_shuffled = full_test.sample(frac=1, random_state=RNG)

full_train_shuffled.head()

In [None]:
def transform_matchup_to_features(team_id1, team_id2, season, stats_df, **kwargs):
    '''
    Builds the feature vector of a single matchup.

    The two team IDs are ordered so that the smaller one comes first; the
    result is the lower-ID team's features minus the higher-ID team's
    features, both obtained from `get_features` for the given season.
    Extra keyword arguments are forwarded to `get_features`.
    '''
    if team_id1 < team_id2:
        lower, higher = team_id1, team_id2
    else:
        lower, higher = team_id2, team_id1
    features_of_lower = get_features(stats_df, lower, season, **kwargs)
    features_of_higher = get_features(stats_df, higher, season, **kwargs)
    return features_of_lower - features_of_higher

Now we can precompute all of the training and testing data.

In [None]:
from tqdm import tqdm

def precompute_matchup_to_features(id_table, stats_df, **kwargs):
    '''
    Builds the matchup feature table for every game in `id_table`.

    Each output row is the lower-ID team's features minus the higher-ID
    team's features, both taken from `get_features` for the game's season.

    Parameters
    ----------
    id_table : pd.DataFrame
        Games with `WTeamID`, `LTeamID` and `Season` columns.
    stats_df : pd.DataFrame
        Statistics table passed through to `get_features`.
    **kwargs
        Forwarded to `get_features` (e.g. `n`, `discount`).

    Returns
    -------
    pd.DataFrame
        One row per game, in the order of `id_table`, with a fresh 0..len-1
        index.
    '''
    # get_features depends only on (team, season), and many games share the
    # same pair, so compute each pair once and reuse it.
    feature_cache = {}

    def cached_features(team_id, season):
        key = (team_id, season)
        if key not in feature_cache:
            feature_cache[key] = get_features(stats_df, team_id, season, **kwargs)
        return feature_cache[key]

    new_rows = []
    games = zip(id_table.WTeamID, id_table.LTeamID, id_table.Season)
    for winner, loser, season in tqdm(games, total=len(id_table)):
        lower, higher = min(winner, loser), max(winner, loser)
        # Subtraction creates a new Series, so cached vectors are never mutated.
        new_rows.append(cached_features(lower, season) - cached_features(higher, season))
    return pd.DataFrame(new_rows)
        

# Window length (seasons) and decay factor passed to get_features.
N = 4
DISCOUNT = 0.5
# Split the shuffled game tables into matchup identifiers and the binary target.
X_train_ids = full_train_shuffled.loc[:, ['WTeamID', 'LTeamID', 'Season']]
y_train = full_train_shuffled.loc[:, 'LowerIDWon']
X_test_ids = full_test_shuffled.loc[:, ['WTeamID', 'LTeamID', 'Season']]
y_test = full_test_shuffled.loc[:, 'LowerIDWon']
# Turn each matchup into a feature-difference row (lower ID minus higher ID).
X_train = precompute_matchup_to_features(X_train_ids, team_season_stats, n=N,
                                    discount=DISCOUNT)
X_test = precompute_matchup_to_features(X_test_ids, team_season_stats, n=N,
                                   discount=DISCOUNT)
# Quick look at the first training rows and their labels.
print(X_train.head())
print(y_train.head())

## Stage 1 Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree random forest (entropy split criterion) on the matchup
# feature differences; random_state is fixed so the fit is repeatable.
classifier = RandomForestClassifier(n_estimators=500, random_state=RNG, criterion='entropy')
classifier.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out games (regular season and tournament from 2016 onwards).
classifier.score(X_test, y_test)

## Stage 1 Make and Submit Predictions

In [None]:
stage_1_submission = pd.read_csv(STAGE_1_DIR + 'MSampleSubmissionStage1.csv')

# Build one feature row per requested matchup. Submission IDs have the form
# "<season>_<lower team id>_<higher team id>".
submission_rows = []
for matchup_id in tqdm(stage_1_submission.ID, total=len(stage_1_submission)):
    season_str, low_id_str, high_id_str = matchup_id.split('_')
    # Training shifted tournament games to Season + 1 so their features
    # include the statistics of the tournament's own season; apply the same
    # shift here so prediction-time features match the training-time ones.
    submission_rows.append(
        transform_matchup_to_features(int(low_id_str), int(high_id_str),
                                      int(season_str) + 1, team_season_stats,
                                      n=N, discount=DISCOUNT))

# Predict every matchup in one call, on a frame that carries the same column
# names as the training data, instead of one predict_proba call per row.
X_submission = pd.DataFrame(submission_rows)
# Column 1 of predict_proba is the probability of class 1 (lower ID wins).
stage_1_submission.iloc[:, 1] = classifier.predict_proba(X_submission)[:, 1]

In [None]:
# Preview the first rows of the filled-in Stage 1 submission.
stage_1_submission.head()

In [None]:
# Write the Stage 1 predictions to submission.csv without the index column.
stage_1_submission.to_csv('submission.csv', index=False)

In [None]:
STAGE_2_DIR = '/kaggle/input/mens-march-mania-2022/MDataFiles_Stage2/'
RNG = 32
tournament_games = pd.read_csv(STAGE_2_DIR + 'MNCAATourneyCompactResults.csv')
# Take an explicit copy of the filtered rows so the Season shift below writes
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
tournament_games = tournament_games[tournament_games.Season > 2003].copy()
# Shift tournament seasons forward by one: get_features only looks at seasons
# strictly before the one it is given, so this makes a tournament game's
# features include the statistics of the season it belongs to.
tournament_games['Season'] = tournament_games.Season + 1

season_games = pd.read_csv(STAGE_2_DIR + 'MRegularSeasonCompactResults.csv')
season_games = season_games[season_games.Season > 2003]

# Stage 2 trains on every available game; nothing is held out.
full_train_tournament = tournament_games
full_train_season = season_games
full_train = pd.concat([full_train_season, full_train_tournament])
create_lower_id_won_feature(full_train)

# Shuffle reproducibly.
full_train_shuffled = full_train.sample(frac=1, random_state=RNG)

full_train_shuffled.head()

In [None]:
# Window length (seasons) and decay factor passed to get_features.
N = 4
DISCOUNT = 0.5
# Stage 2 has no held-out split, so only the training matrix is built here.
# The original cell also rebuilt X_test / y_test from the Stage 1
# `full_test_shuffled`; that result was identical to the Stage 1 values, was
# never used afterwards, and made this cell depend on the Stage 1 cells.
X_train_ids = full_train_shuffled.loc[:, ['WTeamID', 'LTeamID', 'Season']]
y_train = full_train_shuffled.loc[:, 'LowerIDWon']
# Turn each matchup into a feature-difference row (lower ID minus higher ID).
X_train = precompute_matchup_to_features(X_train_ids, team_season_stats, n=N,
                                         discount=DISCOUNT)
# Quick look at the first training rows and their labels.
print(X_train.head())
print(y_train.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Refit the 500-tree random forest (entropy split criterion), this time on the
# Stage 2 training matrix; this rebinds `classifier`.
classifier = RandomForestClassifier(n_estimators=500, random_state=RNG, criterion='entropy')
classifier.fit(X_train, y_train)

In [None]:
# Importance of each feature in the fitted forest, one value per training column.
classifier.feature_importances_

In [None]:
# Column names of the training matrix, i.e. the feature names.
X_train.columns

In [None]:
stage_2_submission = pd.read_csv(STAGE_2_DIR + 'MSampleSubmissionStage2.csv')

# Build one feature row per requested matchup. Submission IDs have the form
# "<season>_<lower team id>_<higher team id>".
submission_rows = []
for matchup_id in tqdm(stage_2_submission.ID, total=len(stage_2_submission)):
    season_str, low_id_str, high_id_str = matchup_id.split('_')
    # Training shifted tournament games to Season + 1 so their features
    # include the statistics of the tournament's own season; apply the same
    # shift here. Without it the current season's statistics are never used
    # for the tournament being predicted.
    submission_rows.append(
        transform_matchup_to_features(int(low_id_str), int(high_id_str),
                                      int(season_str) + 1, team_season_stats,
                                      n=N, discount=DISCOUNT))

# Predict every matchup in one call, on a frame that carries the same column
# names as the training data, instead of one predict_proba call per row.
X_submission = pd.DataFrame(submission_rows)
# Column 1 of predict_proba is the probability of class 1 (lower ID wins).
stage_2_submission.iloc[:, 1] = classifier.predict_proba(X_submission)[:, 1]

In [None]:
# Preview the first rows of the filled-in Stage 2 submission.
stage_2_submission.head()

In [None]:
# Write the Stage 2 predictions to submission2.csv without the index column.
stage_2_submission.to_csv('submission2.csv', index=False)

In [None]:
id_df = pd.read_csv(STAGE_2_DIR + 'MTeams.csv')
# Build the TeamID -> TeamName lookup once instead of filtering the team table
# twice per matchup. drop_duplicates keeps the first row per TeamID, which is
# the row the original `.iloc[0]` lookup selected.
team_names = id_df.drop_duplicates('TeamID').set_index('TeamID').TeamName.to_dict()
print('If these two teams are playing each other, the one on the left has this chance of winning the matchup...')
for matchup_id, pred in zip(stage_2_submission.ID, stage_2_submission.Pred):
    # IDs have the form "<season>_<lower team id>_<higher team id>"; the
    # season is not needed for printing.
    _, low_id_str, high_id_str = matchup_id.split('_')
    lower_team_name = team_names[int(low_id_str)]
    higher_team_name = team_names[int(high_id_str)]
    print(f'{lower_team_name} vs. {higher_team_name}: {pred}')