In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is meant to complement this video: https://www.youtube.com/watch?v=pPfw2fzwNiM

# Part 1 Simple Elo Rating System


The elo rating system originated in chess and is a great way to compare individual players based on past records. Essentially, all players start at the same number of elo rating points (usually 1500). As they play games, they earn points for beating other players and lose points when they lose games. They get more points from beating opponents with higher ratings and lose more points for losing to lower rated opponents. 


This same methodology can be applied to college basketball. We can create Elo ratings for teams. The nice thing about this system is that from the Elo ratings you can get a predicted win probability between two opponents.


Rather than create the Elo ratings ourselves, there are some great places online that have already constructed them. I got my data from: http://warrennolan.com/basketball/2021/elochess

Good articles 
* Elo math: https://www.cantorsparadise.com/the-mathematics-of-elo-ratings-b6bfc9ca1dba
* Elo math analysis: https://chance.amstat.org/2020/09/chess/



In [None]:
# Load the tournament seeds and the team lookup table from the competition data.
df_seed = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MDataFiles_Stage2/MNCAATourneySeeds.csv')
df_teams = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MDataFiles_Stage2/MTeams.csv')

# Build the key used to join against the Elo table by expanding a trailing
# "St" abbreviation to "State" (a name like "Michigan St" becomes
# "Michigan State"). The pattern is anchored to a whole word at the end of the
# name: a plain str.replace('St', 'State') rewrites every "St" substring, which
# corrupts names such as "Stanford" -> "Statenford" or "NC State" -> "NC Stateate".
df_teams['tm_join'] = df_teams['TeamName'].str.replace(r'\bSt$', 'State', regex=True)

# Keep only the 2021 seeds and attach the team name plus the join key.
df_s_2021 = df_seed[df_seed['Season'] == 2021]
seed_tms = pd.merge(df_s_2021, df_teams.loc[:,['TeamID','TeamName','tm_join']], on='TeamID')


In [None]:
# Scrape the Elo ratings from warrennolan.com - great resource (check it out!).
# read_html returns every table found on the page; the first one is used.
df_elo = pd.read_html('http://warrennolan.com/basketball/2021/elochess')[0]

# Only the team name and its rating are needed downstream.
df_elo_final = df_elo[['Team', 'ELO']]
df_elo_final.head()

In [None]:
# Left-join the Elo table onto the seeded teams so that every seeded team is
# kept even when its join key has no matching Elo row.
seed_elos = seed_tms.merge(df_elo_final, how='left', left_on='tm_join', right_on='Team')

# Patch ratings by hand at fixed (row position, column position 6) cells.
# NOTE(review): column 6 is presumably the ELO column and these rows presumably
# the teams the join missed - confirm, because positional patches silently hit
# the wrong cells if the inputs or the merge change.
for row_pos, patched_elo in (
    (8, 1597.16),
    (13, 1519.06),
    (14, 1481.20),
    (16, 1278.92),
    (17, 1395.14),
    (32, 1469.18),
):
    seed_elos.iloc[row_pos, 6] = patched_elo

# Per-column check for any remaining nulls.
seed_elos.isnull().any()

In [None]:
def win_prob_t1(team1_elo, team2_elo):
    """Return team 1's win probability given both teams' Elo ratings.

    Computes 1 / (1 + 10 ** ((team2_elo - team1_elo) / 400)), so equal
    ratings give 0.5 and the value grows as team 1's rating advantage grows.
    """
    rating_gap = team2_elo - team1_elo
    return 1 / (1 + 10 ** (rating_gap / 400))

# Part 2 Monte Carlo Simulation

Check out the streamlit app I built here that illustrates this: https://share.streamlit.io/playingnumbers/basketball_sim_dash/main

Another approach that produces a probability distribution is a Monte Carlo simulation. To do this, we create distributions of expected scores for each team and then randomly sample from these distributions. When the expected score of team 1 is higher than team 2 in a given simulation, we award team 1 a win and vice versa. If we do this enough times we can get the expected win probability when the teams play.

It should be noted that this approach will not guarantee stable results. By definition there is randomness, so each run should be different than the previous ones. To solve for this, you can either set a random seed or run enough simulations that the estimated win probability converges.

Our methodology here is very basic. We want to simulate how two teams will perform against each other (team1 & team2). To simulate the score that team1 would have, we do the following:
* sample from a distribution of past points scored by team 1
* sample from a distribution of past points allowed by team 2 
* take an average of points scored by team 1 & points allowed by team 2

To see who would win, we do the same for team 2 and compare the final result. I find this to be a decent baseline model for evaluating performance that can be built upon. There are plenty of things wrong with this approach, but it is a good way to think about this problem and get started!

In [None]:
class Team:
    """One team's game rows for a single season, used as simulation input.

    The constructor keeps a copy of the rows of ``data`` whose ``TeamID`` and
    ``Season`` match, and reads the team name from the first unique
    ``TeamName`` value among those rows.
    """

    def __init__(self, teamid, data, season):
        self.teamid = teamid
        is_team = data['TeamID'] == self.teamid
        in_season = data['Season'] == season
        # Copy so the team's frame is independent of the frame passed in.
        self.data = data[is_team & in_season].copy()
        self.team_name = self.data['TeamName'].unique()[0]

    def getPointsScored(self):
        """Return the ``PtScored`` column of the team's rows as an array."""
        return self.data['PtScored'].values

    def getPointsAllowed(self):
        """Return the ``PtAllowed`` column of the team's rows as an array."""
        return self.data['PtAllowed'].values

    def getAttributes(self):
        """Return a dict mapping each column name to its array of values.

        The dict is also stored on the instance as ``self.attributes``.
        """
        self.attributes = {col: self.data[col].values for col in self.data.columns}
        return self.attributes
    

In [None]:
# make simulation functions

import random as rd

def sim_once(team1, team2, max_redraws=1000):
    """Simulate a single game between two teams.

    Each team's final score is the average of one draw from its own
    points-scored distribution and one draw from the opponent's
    points-allowed distribution. Every draw is Gaussian, using the mean and
    standard deviation of the corresponding array.

    A tied game is simulated again. Previously the tie branch called
    sim_once recursively but discarded the result, so the tied scores were
    returned with a False win flag.

    Args:
        team1, team2: objects exposing getPointsScored() and
            getPointsAllowed(), each returning an array that supports
            .mean() and .std().
        max_redraws: how many times a tied game is re-simulated before
            giving up.

    Returns:
        (final_score_t1, final_score_t2, team1_won), where team1_won is True
        when team 1's final score is strictly higher.

    Raises:
        RuntimeError: if every attempt ends in a tie (for example when both
            teams have identical, zero-variance score histories).
    """
    # The distribution parameters do not change between redraws; compute them once.
    t1_scored, t1_allowed = team1.getPointsScored(), team1.getPointsAllowed()
    t2_scored, t2_allowed = team2.getPointsScored(), team2.getPointsAllowed()
    t1_scored_mu, t1_scored_sd = t1_scored.mean(), t1_scored.std()
    t2_scored_mu, t2_scored_sd = t2_scored.mean(), t2_scored.std()
    t1_allowed_mu, t1_allowed_sd = t1_allowed.mean(), t1_allowed.std()
    t2_allowed_mu, t2_allowed_sd = t2_allowed.mean(), t2_allowed.std()

    for _ in range(max_redraws + 1):
        score_team1 = rd.gauss(t1_scored_mu, t1_scored_sd)
        score_team2 = rd.gauss(t2_scored_mu, t2_scored_sd)
        score_against_team1 = rd.gauss(t1_allowed_mu, t1_allowed_sd)
        score_against_team2 = rd.gauss(t2_allowed_mu, t2_allowed_sd)
        final_score_t1 = (score_team1 + score_against_team2) / 2
        final_score_t2 = (score_team2 + score_against_team1) / 2
        if final_score_t1 != final_score_t2:
            return (final_score_t1, final_score_t2, final_score_t1 > final_score_t2)

    raise RuntimeError(
        "sim_once: every simulated game ended in a tie after "
        "%d redraws" % max_redraws
    )
    

def sim_multiple(team1, team2, n=100):
    """Simulate n games between team1 and team2.

    Returns a tuple of (team 1 win fraction, list of team 1 scores,
    list of team 2 scores, list of per-game team-1-won flags).
    """
    games = [sim_once(team1, team2) for _ in range(n)]
    t1_points = [game[0] for game in games]
    t2_points = [game[1] for game in games]
    w_l = [game[2] for game in games]
    return (sum(w_l) / n, t1_points, t2_points, w_l)

In [None]:
# Load the detailed regular-season game results from the competition data.
# The cells below use its Season, DayNum, WTeamID, LTeamID, WScore and LScore columns.
df = pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MDataFiles_Stage2/MRegularSeasonDetailedResults.csv')

In [None]:
# Reshape to one row per team per game: each game is taken once from the
# winner's point of view and once from the loser's, under shared column names.
df_winners = df[['Season','DayNum','WTeamID','WScore','LScore']].rename(
    columns={'WTeamID': 'TeamID', 'WScore': 'PtScored', 'LScore': 'PtAllowed'})
df_losers = df[['Season','DayNum','LTeamID','LScore','WScore']].rename(
    columns={'LTeamID': 'TeamID', 'LScore': 'PtScored', 'WScore': 'PtAllowed'})

# Stack both views; the original row labels are kept, so index values repeat.
df_tm = pd.concat([df_winners, df_losers])

In [None]:
# Attach the team name to every per-team game row.
df_tm_names = df_tm.merge(df_teams[['TeamID', 'TeamName']], on='TeamID')

# Spot-check the rows for a single team.
df_tm_names.loc[df_tm_names['TeamID'] == 1216]

In [None]:
# Example of how to build Team objects for the 2021 season.
# IDs 1104 and 1216 are presumably Alabama and Hartford, per the variable names.
alabama = Team(1104,df_tm_names,2021)
hartford = Team(1216,df_tm_names,2021)


In [None]:
# Run 1000 simulated games between the two teams.
# No random seed is set in the cells shown, so results vary from run to run.
sim_out = sim_multiple(alabama,hartford,1000)

In [None]:
# Results of the simulation: (team 1 win probability, team 1 scores, team 2 scores, win/loss flags)
sim_out