# **March Madness 2022** - Prediction using most recent games
Here we will explore some potentially useful features for our model.

Develop features based on $n$ previous games
- this is the idea that recent performance is most important in determining who wins the current matchup
- for each game, features are computed for both teams based on previous $n$ games (the first games for each team in each season are dropped)
- this might be bad for a tournament since you won't know how a team plays throughout the tournament, only the last games of the season are available
- game data is available in `MRegularSeasonDetailedResults.csv`
- For the game to be predicted, we will average the results of the previous $n$ games.
- We can double the amount of training data since there is symmetry to exploit. More on this later.
- Build a random forest model!

In [None]:
import numpy as np
import pandas as pd

# Directory holding the stage-1 competition CSV files (relative to the notebook).
STAGE_1_DIR = './MDataFiles_Stage1/'
# Load the teams table and preview its first rows.
teams_df = pd.read_csv(STAGE_1_DIR + 'MTeams.csv')
teams_df.head()

## **Stage 1** - Data Preparation

Let's get into cleaning and transforming the data! First we will define some functions that will allow us to transform the game data into something we can use to train a ML model. Notice that we can get double the amount of data by switching the lower ID team with the higher ID team and using the opposite label. That is, there is some degree of symmetry we can exploit.


In [None]:
from sklearn.base import TransformerMixin
from tqdm import tqdm

class Preprocessor(TransformerMixin):
    '''
    Turns matchups into model features built from each team's recent games.

    Parameters
    ----------
    games_df : pd.DataFrame
        Game results with `Season`, `DayNum`, `WTeamID` and `LTeamID` columns.
        Every other column whose name starts with 'W' / 'L' is treated as a
        statistic of the winning / losing team.
    n : int
        Maximum number of previous games summarised per team.
    weighting : str
        'exp' weights games with exponential decay (most recent game weighs
        most); any other value (e.g. 'avg' or 'mean') gives a plain average.
    verbose : bool
        Show a progress bar while transforming.
    decay : float
        Ratio between the weights of two consecutive games for 'exp' weighting.
    '''

    # Columns that `games_df` stores as "winner minus loser". They have no
    # 'W'/'L' prefix, so their sign must be flipped for games the team lost.
    WINNER_RELATIVE_COLUMNS = ('ScoreDiff',)

    def __init__(self, games_df, n=10, weighting='avg', verbose=False, decay=0.9):
        super().__init__()
        self.n = n
        self.games_df = games_df
        self.verbose = verbose
        self.weighting = weighting  # 'exp', otherwise a plain average is used
        self.decay = decay
        # (team_id, season) -> that team's game log, most recent game first
        self.cached = {}

    def _team_history(self, team_id, season):
        '''Returns (and caches) one team's season game log, newest game first.'''
        key = (team_id, season)
        if key in self.cached:
            return self.cached[key]
        games = self.games_df
        # column names for winning or losing
        won_columns = [c for c in games.columns if c[0] == 'W']
        lost_columns = [c for c in games.columns if c[0] == 'L']
        in_season = games.Season == season
        lost_games = games[(games.LTeamID == team_id) & in_season]
        won_games = games[(games.WTeamID == team_id) & in_season]
        # drop the opponent's statistics and strip the W/L prefix so that
        # both frames share the same column names
        lost_games = lost_games.drop(won_columns, axis=1)
        lost_games = lost_games.rename(columns={c: c[1:] for c in lost_columns})
        won_games = won_games.drop(lost_columns, axis=1)
        won_games = won_games.rename(columns={c: c[1:] for c in won_columns})
        # express winner-relative columns from this team's point of view
        flipped = {c: -lost_games[c] for c in self.WINNER_RELATIVE_COLUMNS
                   if c in lost_games.columns}
        if flipped:
            lost_games = lost_games.assign(**flipped)
        history = pd.concat([lost_games, won_games], axis=0)
        history = history.sort_values('DayNum', ascending=False)
        self.cached[key] = history
        return history

    def get_features(self, team_id, season, day_num):
        '''
        Computes the feature vector of the team over the past n games

        Parameters
        ----------
        team_id : int
            The unique identifier for the team
        season : int
            Current season
        day_num : int
            The day from the start of the season in which the game was played

        Returns
        -------
        features : pd.Series
            The team's features averaged over (at most) the previous n games
            played strictly before `day_num`. All values are NaN when the team
            has no earlier game in that season.
        '''
        history = self._team_history(team_id, season)
        previous_games = history[history.DayNum < day_num]
        last_n_games = previous_games.iloc[:self.n]
        # remove unused columns
        columns_to_remove = ['Season', 'DayNum', 'TeamID', 'NumOT']
        features = last_n_games[last_n_games.columns.difference(columns_to_remove)]
        if self.weighting == 'exp' and len(features) > 0:
            # row 0 is the most recent game and receives the largest weight
            weights = self.decay ** np.arange(len(features))
            weights = weights / weights.sum()
            # NaN-aware weighted mean: a missing value is skipped and the
            # remaining weights are renormalised, as `mean` does below
            weighted_sum = features.mul(weights, axis=0).sum(axis=0)
            weight_used = features.notna().mul(weights, axis=0).sum(axis=0)
            return weighted_sum / weight_used
        return features.mean(axis=0)

    def fit(self, x, y=None):
        '''Nothing is learned from the data; returns self.'''
        return self

    def transform(self, x, testing=False):
        '''
        Expands every matchup in `x` into difference-of-features rows.

        Parameters
        ----------
        x : pd.DataFrame
            Matchups with `Season`, `DayNum`, `WTeamID` and `LTeamID` columns
        testing : bool
            If False each matchup yields two rows (lower - higher and then
            higher - lower); if True only the lower - higher row is produced

        Returns
        -------
        pd.DataFrame
            One (testing) or two (training) rows per matchup, in input order
        '''
        new_rows = []
        matchups = zip(x.Season, x.DayNum, x.WTeamID, x.LTeamID)
        for season, day_num, team_a, team_b in tqdm(matchups, total=len(x),
                                                    disable=not self.verbose):
            lower, higher = min(team_a, team_b), max(team_a, team_b)
            lower_id_features = self.get_features(lower, season, day_num)
            higher_id_features = self.get_features(higher, season, day_num)
            new_rows.append(lower_id_features - higher_id_features)
            # Doubles the amount of training data!
            # Note that the target values are doubled during creation
            # See `create_targets()` below
            if not testing:
                new_rows.append(higher_id_features - lower_id_features)
        return pd.DataFrame(new_rows)

    def set_params(self, **parameters):
        '''Sets attributes by name; replacing `games_df` clears the cache.'''
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        if 'games_df' in parameters:
            # cached game logs were derived from the previous games table
            self.cached = {}
        return self
        
        
def create_targets(full_data, testing=False):
    '''
    Builds the labels that go with the rows produced by `Preprocessor.transform`.

    Parameters
    ----------
    full_data : pd.DataFrame
        Games with `WTeamID` and `LTeamID` columns
    testing : bool
        If True one label is produced per game, otherwise two

    Returns
    -------
    pd.Series
        1 when the lower ID team won, else 0. When not testing each label is
        immediately followed by its opposite (the label of the mirrored row).
    '''
    if len(full_data) == 0:
        return pd.Series([], dtype='int64')
    lower_won = (full_data.WTeamID.to_numpy() < full_data.LTeamID.to_numpy()).astype('int64')
    if testing:
        return pd.Series(lower_won)
    # interleave every label with its complement: [t0, 1 - t0, t1, 1 - t1, ...]
    return pd.Series(np.column_stack((lower_won, 1 - lower_won)).ravel())
    
    
def create_new_features(full_data):
    '''
    Derives the score difference and shooting ratios and drops the raw columns.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    full_data : pd.DataFrame
        Detailed game results containing the W*/L* score, free throw and
        field goal columns used below

    Returns
    -------
    pd.DataFrame
        The remaining columns plus `ScoreDiff` (winner minus loser) and the
        made / attempted ratios for free throws (`FTrR`), 3-pointers (`FGR3`)
        and 2-pointers (`FGR2`) of both teams. A ratio with zero attempts and
        zero makes is NaN. Columns come back in sorted order.
    '''
    to_remove = ['WScore', 'LScore', 'WFTM', 'WFTA', 'LFTM', 'LFTA',
                 'WFGM3', 'WFGA3', 'LFGM3', 'LFGA3', 'WFGM', 'WFGA',
                 'LFGM', 'LFGA', 'WLoc']
    # copy so that the caller's DataFrame does not silently gain columns
    data = full_data.copy()
    data['ScoreDiff'] = data.WScore - data.LScore
    for side in ('W', 'L'):
        data[side + 'FTrR'] = data[side + 'FTM'] / data[side + 'FTA']
        data[side + 'FGR3'] = data[side + 'FGM3'] / data[side + 'FGA3']
        # 2-pointers are all field goals minus the 3-pointers
        data[side + 'FGR2'] = ((data[side + 'FGM'] - data[side + 'FGM3'])
                               / (data[side + 'FGA'] - data[side + 'FGA3']))
    return data[data.columns.difference(to_remove)]

Now we need to read in the data and construct our new features! These are:
- `ScoreDiff`: Difference in score (WScore - LScore)
- `FTrR`: Free throw success rate (made / attempted)
- `FGR3`: 3-point field goal success rate (made / attempted)
- `FGR2`: 2-point field goal success rate (made / attempted)
- `OR`: Offensive rebounds
- `DR`: Defensive rebounds
- `Ast`: Assists
- `Blk`: Blocks
- `TO`: Turnovers
- `Stl`: Steals
- `PF`: Personal fouls


In [None]:
tournament_games = pd.read_csv(STAGE_1_DIR + 'MNCAATourneyDetailedResults.csv')
season_games = pd.read_csv(STAGE_1_DIR + 'MRegularSeasonDetailedResults.csv')

# Stage 1 hold-out: seasons before 2016 train the model, 2016 onwards test it
full_train_tournament = tournament_games.loc[tournament_games.Season.lt(2016)]
full_test_tournament = tournament_games.loc[tournament_games.Season.ge(2016)]

full_train_season = season_games.loc[season_games.Season.lt(2016)]
full_test_season = season_games.loc[season_games.Season.ge(2016)]

# Train on regular season and tournament games, test on tournament games only
full_train = pd.concat([full_train_season, full_train_tournament])
full_test = pd.DataFrame(full_test_tournament)

# Two labels per training game, one per test game
target_train = create_targets(full_train)
target_test = create_targets(full_test, testing=True)

features_train, features_test = full_train, full_test

# The regular season table (with derived ratios) is what the preprocessor
# reads team histories from
season_games = create_new_features(season_games)

full_train.head()

We can see now that each matchup has a winning team, a losing team, the day on which it took place, and the season. This is what we will use to construct the features for both teams.

Here the preprocessor expands a given matchup into the features to use for that matchup. Each matchup produces two training samples for the model. The first is the `lowerID - higherID` with a target corresponding to the lower ID team winning. The second is `higherID - lowerID` with the opposite target. Note that the preprocessor only transforms the features, the targets were determined in the previous section when they were created.

In [None]:
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier

def hyperparameter_search(n_list, estimator_list,
                          train_features, train_targets,
                          test_features, test_targets,
                          season_games):
    '''
    This performs a search over a parameter space specified by the different options available for
    both feature selection and model training.
    It will return all of the scores and the best parameter combination.

    Parameters
    ----------
    n_list : list of int
        Numbers of previous games to try
    estimator_list : list of int
        Numbers of trees to try for the random forest
    train_features, test_features : pd.DataFrame
        Matchups handed to `Preprocessor.transform`
    train_targets, test_targets : sequence of int
        Labels matching the transformed rows, in the same order
    season_games : pd.DataFrame
        Games the preprocessor builds team histories from

    Returns
    -------
    (accuracy_scores, log_loss_scores) : tuple of dict
        For each weighting method a list (one entry per n) of lists (one
        entry per n_estimators) of test scores
    best_params : tuple or None
        (method, n, n_estimators) with the lowest test log loss, or None if
        nothing was evaluated

    Raises
    ------
    ValueError
        If `train_targets` does not have one label per transformed training row
    '''
    print(f'Starting search over\nn: {n_list}\nn_estimators: {estimator_list}')
    accuracy_scores = {}
    log_loss_scores = {}
    best_score = float('inf')
    best_params = None
    preprocessor = Preprocessor(season_games, n=1, weighting='mean', verbose=False)
    for method in ['exp', 'mean']:
        accuracy_scores[method] = []
        log_loss_scores[method] = []
        # `weighting` is the attribute the preprocessor actually reads
        preprocessor.weighting = method
        print(f'Calculating scores for {method} method...')
        for i, n in tqdm(enumerate(n_list), total=len(n_list)):
            preprocessor.n = n
            accuracy_scores[method].append([])
            log_loss_scores[method].append([])
            X_train = preprocessor.transform(train_features)
            X_test = preprocessor.transform(test_features, testing=True)
            if len(train_targets) != len(X_train):
                raise ValueError(
                    f'expected {len(X_train)} training targets, got {len(train_targets)}')
            # use the targets passed in (not a notebook global), matched by position
            X_train['Target'] = np.asarray(train_targets)
            # rows without any previous game are all NaN and cannot be used
            X_train = X_train.dropna()
            X_train = X_train.sample(frac=1, random_state=1)
            y_train = X_train.loc[:, 'Target']
            X_train = X_train.loc[:, X_train.columns != 'Target']
            for n_estimators in estimator_list:
                model = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy', random_state=2)
                model.fit(X_train, y_train)
                accuracy_score = model.score(X_test, test_targets)
                log_loss_score = log_loss(test_targets, model.predict_proba(X_test))
                accuracy_scores[method][i].append(accuracy_score)
                log_loss_scores[method][i].append(log_loss_score)
                if log_loss_score < best_score:
                    best_score = log_loss_score
                    best_params = (method, n, n_estimators)
    return (accuracy_scores, log_loss_scores), best_params

In [None]:
# Switch to True to run the grid search below.
PERFORM_SEARCH = False
if PERFORM_SEARCH:
    # n = 1, 7, ..., 43 previous games; 50, 100, ..., 550 trees
    n_list = list(range(1, 45, 6))
    estimator_list = list(range(50, 600, 50))
    # Only the first 20000 training games are searched. Each game becomes two
    # training rows, hence the first 40000 targets.
    (accuracy_scores, log_loss_scores), (method, n, n_estimators) = hyperparameter_search(n_list, estimator_list,
                                                                                          features_train[:20000], target_train[:40000],
                                                                                          features_test, target_test,
                                                                                          season_games)

In [None]:
import matplotlib.pyplot as plt

if PERFORM_SEARCH:
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 20))
    # One heat map per (metric, weighting) pair: rows are n, columns are n_estimators
    panels = [
        (ax1, accuracy_scores, 'Accuracy', 'mean'),
        (ax2, accuracy_scores, 'Accuracy', 'exp'),
        (ax3, log_loss_scores, 'Log loss', 'mean'),
        (ax4, log_loss_scores, 'Log loss', 'exp'),
    ]
    for panel_ax, panel_scores, metric_name, weighting_name in panels:
        panel_ax.imshow(panel_scores[weighting_name])
        panel_ax.set_title(f'{metric_name} for the {weighting_name} method')
        panel_ax.set_ylabel('n')
        panel_ax.set_xlabel('n_estimators')
        panel_ax.set_yticks(range(len(n_list)), labels=n_list)
        panel_ax.set_xticks(range(len(estimator_list)), labels=estimator_list)
    plt.show()

I forgot to check how many games are in a regular season schedule. It turns out that there are only 35-40 games. This is why past $n =37$ there is no improvement. A lot of wasted time and computation :(

Either way, it looks like the best method was to use the entire season weighting by exponential decay.

In [None]:
# Parameters used for the final models (whole season, exponential decay)
n, method, n_estimators = 37, 'exp', 450
print(f'Best Parameters: method = {method}, n = {n}, n_estimators = {n_estimators}')

In [None]:
# Feature extractor using the chosen number of games and weighting
preprocessor = Preprocessor(season_games, n=n, weighting=method, verbose=True)

# Training matchups are expanded into two mirrored rows each, test matchups
# into a single row (testing=True)
X_train = preprocessor.fit_transform(features_train, target_train)
X_test = preprocessor.transform(features_test, testing=True)

In [None]:
# Carry the labels along so that dropping incomplete rows and shuffling keep
# features and targets together
X_train['Target'] = target_train
X_train = X_train.dropna().sample(frac=1, random_state=1)
y_train = X_train['Target']
X_train = X_train.drop(columns='Target')

print(X_train.shape)
print(y_train.shape)

## **Stage 1** - Model Training 
Here we build a simple random forest classifier. The number of estimators (`n_estimators`) was chosen using the hyperparameter search above.

In [None]:
# Random forest with entropy splits; the fixed random_state keeps runs repeatable
model = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy', random_state=2)

In [None]:
# Train on the shuffled training rows
model.fit(X_train, y_train)

## **Stage 1** - Model Testing
Here we can get the accuracy score for tournament games.

In [None]:
# Report accuracy and log loss on the held-out rows (X_test / target_test)
print(f'Performance of {n} previous games using a model with {n_estimators} estimators:')
print(f'Accuracy: {model.score(X_test, target_test)}')
print(f'Log Loss: {log_loss(target_test, model.predict_proba(X_test))}')

## **Stage 1** - Model Submission
This is the submission for Stage 1 of the Kaggle competition. It gives the probability that the lower ID team wins a given matchup. All games are tournament games from 2016 to 2021.

In [None]:
stage_1_submission = pd.read_csv(STAGE_1_DIR + 'MSampleSubmissionStage1.csv')

# Each ID is split on '_' into three integers: the season and two team IDs.
# Split once and reuse the pieces. The team columns are named WTeamID / LTeamID
# only because that is what the preprocessor expects; no winner is known here.
id_parts = stage_1_submission.ID.str.split('_', expand=True).astype('int')
stage_1_submission['Season'] = id_parts.iloc[:, 0]
stage_1_submission['WTeamID'] = id_parts.iloc[:, 1]
stage_1_submission['LTeamID'] = id_parts.iloc[:, 2]
# Every matchup gets the same day number, so team features are built from all
# of that season's games played before day 134
stage_1_submission['DayNum'] = 134

X = preprocessor.transform(stage_1_submission, testing=True)

# Column 1 of predict_proba is the probability of label 1 (lower ID team wins)
new_preds = model.predict_proba(X)[:, 1]
# Item assignment works whether or not a 'Pred' column already exists
stage_1_submission['Pred'] = new_preds
# Drop the helper columns again, keeping only what the submission file needs
stage_1_submission = stage_1_submission[
    stage_1_submission.columns.difference(['Season', 'WTeamID', 'LTeamID', 'DayNum'])]

In [None]:
# Preview the first rows of the submission
stage_1_submission.head()

In [None]:
# Write the submission; the file name records n and n_estimators
stage_1_submission.to_csv(f'submission_{n}_{n_estimators}.csv', index=False)

## **Stage 2** - Model Training, Testing, & Submission
Now we can train using all of the data! The procedure is exactly the same as before.

In [None]:
STAGE_2_DIR = './MDataFiles_Stage2/'
tournament_games = pd.read_csv(STAGE_2_DIR + 'MNCAATourneyDetailedResults.csv')
season_games = pd.read_csv(STAGE_2_DIR + 'MRegularSeasonDetailedResults.csv')

# No hold-out in stage 2: every tournament and regular season game is used
# for training
full_train_tournament, full_train_season = tournament_games, season_games
full_train = pd.concat([full_train_season, full_train_tournament])

features_train = full_train
target_train = create_targets(full_train)

# Team histories are read from the regular season table with derived ratios
season_games = create_new_features(season_games)

full_train.head()

In [None]:
# Feature extractor over the stage-2 games, same n and weighting as before
preprocessor = Preprocessor(season_games, n=n, weighting=method, verbose=True)

# Every training matchup is expanded into two mirrored rows
X_train = preprocessor.fit_transform(features_train, target_train)

In [None]:
# Carry the labels along so that dropping incomplete rows and shuffling keep
# features and targets together
X_train['Target'] = target_train
X_train = X_train.dropna().sample(frac=1, random_state=1)
y_train = X_train['Target']
X_train = X_train.drop(columns='Target')

print(X_train.shape)
print(y_train.shape)

In [None]:
# Fresh random forest with the same settings as the stage-1 model
model = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy', random_state=2)

In [None]:
# Train on the shuffled training rows
model.fit(X_train, y_train)

In [None]:
stage_2_submission = pd.read_csv(STAGE_2_DIR + 'MSampleSubmissionStage2.csv')

# Each ID is split on '_' into three integers: the season and two team IDs.
# Split once and reuse the pieces. The team columns are named WTeamID / LTeamID
# only because that is what the preprocessor expects; no winner is known here.
id_parts = stage_2_submission.ID.str.split('_', expand=True).astype('int')
stage_2_submission['Season'] = id_parts.iloc[:, 0]
stage_2_submission['WTeamID'] = id_parts.iloc[:, 1]
stage_2_submission['LTeamID'] = id_parts.iloc[:, 2]
# Every matchup gets the same day number, so team features are built from all
# of that season's games played before day 134
stage_2_submission['DayNum'] = 134

X = preprocessor.transform(stage_2_submission, testing=True)

# Column 1 of predict_proba is the probability of label 1 (lower ID team wins)
new_preds = model.predict_proba(X)[:, 1]
# Item assignment works whether or not a 'Pred' column already exists
stage_2_submission['Pred'] = new_preds
# `stage2` is the file to submit; `stage_2_submission` keeps the helper
# columns because they are used again when listing the predictions
stage2 = stage_2_submission[
    stage_2_submission.columns.difference(['Season', 'WTeamID', 'LTeamID', 'DayNum'])]

In [None]:
# Preview the first rows of the stage-2 submission
stage2.head()

In [None]:
# Write the stage-2 submission; the file name records n and n_estimators
stage2.to_csv(f'submission_{n}_{n_estimators}_stage2.csv', index=False)

## **Stage 2** - Model Predictions
Here we will show all of the predictions that our model makes for each possible matchup.

In [None]:
id_df = pd.read_csv(STAGE_2_DIR + 'MTeams.csv')
# Build the TeamID -> TeamName lookup once instead of scanning the teams table
# twice per printed row (the first entry wins if an ID were listed twice)
team_names = id_df.drop_duplicates('TeamID').set_index('TeamID')['TeamName'].to_dict()
print('If these two teams are playing each other, the one on the left has this chance of winning the matchup...')
matchups = zip(stage_2_submission.WTeamID, stage_2_submission.LTeamID, stage_2_submission.Pred)
for first_id, second_id, pred in matchups:
    # Pred is the probability for the lower ID team, which is printed on the left
    lower, higher = min(first_id, second_id), max(first_id, second_id)
    lower_team_name = team_names[lower]
    higher_team_name = team_names[higher]
    print(f'{lower_team_name} vs. {higher_team_name}: {pred}')