The "Four Factors" in basketball are designed to measure how good a team is at shooting, taking care of the ball, offensive rebounding, and getting to the foul line. Dean Oliver wrote about them in [Basketball on Paper](https://www.amazon.com/Basketball-Paper-Rules-Performance-Analysis/dp/1574886886), and [kenpom uses them extensively](https://kenpom.com/blog/four-factors/). This notebook calculates and compiles the four-factor ratings for each team over the regular season, then uses a difference-between-teams approach to generate a training file.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

def _three_decimals(x):
    """Format a number with exactly three digits after the decimal point."""
    return '%.3f' % x


# Show every float in pandas output with three decimals.
pd.set_option('display.float_format', _three_decimals)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in files):
        print(full_path)

# Any results you write to the current directory are saved as output.

First, let's randomize the order that the teams come into our training set and calculate the seeding differential.

In [None]:
# Load the tournament results and the sample submission, keeping only the
# tournament games from the 2003 season onwards.
tourney_results = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MNCAATourneyCompactResults.csv')
submission = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv')
tourney_results = tourney_results[tourney_results['Season'] >= 2003]

winner_ids = tourney_results['WTeamID'].values
loser_ids = tourney_results['LTeamID'].values

# One random 0/1 draw per game decides which slot the winner lands in:
# Result == 1 puts the winner in Team1, Result == 0 puts the winner in Team2.
team1_won = np.random.randint(0, 2, len(tourney_results.index))

training_set = pd.DataFrame({
    'Result': team1_won,
    'Season': tourney_results['Season'].values,
    'Team1': team1_won * winner_ids + (1 - team1_won) * loser_ids,
    'Team2': (1 - team1_won) * winner_ids + team1_won * loser_ids,
})


In [None]:
# Seed differential (Team1 seed minus Team2 seed) for every training row.
seeds = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MNCAATourneySeeds.csv')

# Characters 1-2 of the seed string are parsed as the numeric seed; anything
# that does not parse becomes NaN.
seed_digits = seeds['Seed'].str[1:3]
seeds['Seed'] = pd.to_numeric(seed_digits, downcast='integer', errors='coerce')

# Look the seed up once per team slot. The second merge collides with the
# columns added by the first, so pandas suffixes them _x (Team1) and _y (Team2).
for team_slot in ('Team1', 'Team2'):
    training_set = training_set.merge(seeds, how='left',
                                      left_on=['Season', team_slot],
                                      right_on=['Season', 'TeamID'])

training_set['deltaSeed'] = training_set['Seed_x'] - training_set['Seed_y']
training_set = training_set.drop(columns=['Seed_x', 'TeamID_x', 'Seed_y', 'TeamID_y'])

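To add another feature, I'm going to adjust wins and losses according to the NCAA's NET methodology, which weights a home win by 0.6, a neutral-site win by 1.0, and an away win by 1.4 (and, conversely, a home loss by 1.4, a neutral-site loss by 1.0, and an away loss by 0.6), and total the adjusted wins and losses for each team so that an adjusted winning percentage can be derived from them.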

In [None]:
# Calculate location-weighted win and loss totals for every team and season.
# A home win is weighted 0.6, a neutral-site win 1.0 and an away win 1.4;
# losses mirror that (home loss 1.4, neutral-site loss 1.0, away loss 0.6).
season_results = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')

# LLoc is the game location from the loser's side: H and A swap, every other
# value is carried over unchanged.
season_results['LLoc'] = season_results['WLoc']
season_results.loc[season_results['WLoc'] == 'H', 'LLoc'] = 'A'
season_results.loc[season_results['WLoc'] == 'A', 'LLoc'] = 'H'

# Build the weights with an explicit numeric mapping instead of overwriting a
# copy of the string location column, so both weight columns are plain floats
# rather than a mixed string/number object column.
# NOTE(review): a location code other than H/N/A maps to NaN here - this
# assumes the file only contains those three codes; confirm against the data.
win_weights = {'H': 0.6, 'N': 1.0, 'A': 1.4}
loss_weights = {'H': 1.4, 'N': 1.0, 'A': 0.6}
season_results['WLocWeight'] = season_results['WLoc'].map(win_weights).astype(float)
season_results['LLocWeight'] = season_results['LLoc'].map(loss_weights).astype(float)

# A team's wins are the rows where it is WTeamID, its losses the rows where it
# is LTeamID. The four Series are aligned on (Season, team id) when combined.
wins_by_team = season_results.groupby(['Season', 'WTeamID'])
losses_by_team = season_results.groupby(['Season', 'LTeamID'])
record = pd.DataFrame({'Adj. wins': wins_by_team['WLocWeight'].sum(),
                       'Adj. losses': losses_by_team['LLocWeight'].sum(),
                       'Wins': wins_by_team.size(),
                       'Losses': losses_by_team.size()}).reset_index()

# The two groupings name their team level differently, so reset_index may emit
# generic level_N column names; map them back to Season / TeamID.
record.rename(columns={'level_0': 'Season',
                       'level_1': 'TeamID'}, inplace=True)

# A team missing from one side (no wins, or no losses) gets 0 for that side.
record.fillna(value=0, inplace=True)
record['games'] = record['Wins'] + record['Losses']

# Attach the tournament seed; it stays NaN for team-seasons absent from seeds.
record = record.merge(seeds, how='left', on=['Season', 'TeamID'])

Calculate [RPI.](https://www.nbastuffer.com/analytics101/relative-percentage-index-rpi/)

Rating Percentage Index (RPI) Formula = .25 \* (Team’s Winning Percentage)+ .50 \* (Opponents’  Average Winning Percentage) + 0.25 \* (Opponents’ Opponents’  Average Winning Percentage)

We will use normal winning percentage, not adjusted.

In [None]:
def _attach_team_totals(games, totals, team_col, renames):
    """Left-join per-team totals onto the game log.

    games    -- game-level frame holding 'Season' and the column named by team_col
    totals   -- team-level frame keyed by 'Season' and 'TeamID'
    team_col -- game-log column identifying the team to look up
    renames  -- maps each totals column to pull in to its name in the game log

    Returns a new frame with the renamed columns appended after the existing ones.
    """
    lookup = totals[['Season', 'TeamID'] + list(renames)]
    lookup = lookup.rename(columns={'TeamID': team_col, **renames})
    return games.merge(lookup, how='left', on=['Season', team_col])


# Step 1: give every game row the season win/game totals of both teams.
season_results = _attach_team_totals(season_results, record, 'WTeamID',
                                     {'Wins': 'WWins', 'games': 'WGames'})
season_results = _attach_team_totals(season_results, record, 'LTeamID',
                                     {'Wins': 'LWins', 'games': 'LGames'})
season_results = season_results.fillna(value=0)

# Step 2: opponents' totals. In a team's wins the opponent is the loser, in
# its losses the opponent is the winner.
games_by_winner = season_results.groupby(['Season', 'WTeamID'])
games_by_loser = season_results.groupby(['Season', 'LTeamID'])
opprecord = pd.DataFrame({'WOppWins': games_by_winner['LWins'].sum(),
                          'WOppGames': games_by_winner['LGames'].sum(),
                          'LOppWins': games_by_loser['WWins'].sum(),
                          'LOppGames': games_by_loser['WGames'].sum()}).reset_index()

opprecord.rename(index=str, columns={'level_0': 'Season',
                                     'level_1': 'TeamID'}, inplace=True)
opprecord.fillna(value=0, inplace=True)
opprecord['OppWins'] = opprecord['WOppWins'] + opprecord['LOppWins']
opprecord['OppGames'] = opprecord['WOppGames'] + opprecord['LOppGames']
opprecord['OppWinPercent'] = opprecord['OppWins'] / opprecord['OppGames']

# Step 3: push the opponents' totals back onto the game log for both teams,
# and onto the per-team record.
season_results = _attach_team_totals(season_results, opprecord, 'WTeamID',
                                     {'OppWins': 'WOppWins', 'OppGames': 'WOppGames'})
season_results = _attach_team_totals(season_results, opprecord, 'LTeamID',
                                     {'OppWins': 'LOppWins', 'OppGames': 'LOppGames'})

record = record.merge(opprecord[['Season', 'TeamID', 'OppWins', 'OppGames']],
                      how='left', on=['Season', 'TeamID'])

# Step 4: opponents' opponents' totals, built the same way one level deeper.
games_by_winner = season_results.groupby(['Season', 'WTeamID'])
games_by_loser = season_results.groupby(['Season', 'LTeamID'])
oppopprecord = pd.DataFrame({'WOppOppWins': games_by_winner['LOppWins'].sum(),
                             'WOppOppGames': games_by_winner['LOppGames'].sum(),
                             'LOppOppWins': games_by_loser['WOppWins'].sum(),
                             'LOppOppGames': games_by_loser['WOppGames'].sum()}).reset_index()

oppopprecord.rename(index=str, columns={'level_0': 'Season',
                                        'level_1': 'TeamID'}, inplace=True)
oppopprecord.fillna(value=0, inplace=True)
oppopprecord['OppOppWins'] = oppopprecord['WOppOppWins'] + oppopprecord['LOppOppWins']
oppopprecord['OppOppGames'] = oppopprecord['WOppOppGames'] + oppopprecord['LOppOppGames']
oppopprecord['OppOppWinPercent'] = oppopprecord['OppOppWins'] / oppopprecord['OppOppGames']

record = record.merge(oppopprecord[['Season', 'TeamID', 'OppOppWins', 'OppOppGames']],
                      how='left', on=['Season', 'TeamID'])

# Step 5: RPI = 25% own win pct + 50% opponents' + 25% opponents' opponents'.
own_win_pct = record['Wins'] / record['games']
opp_win_pct = record['OppWins'] / record['OppGames']
oppopp_win_pct = record['OppOppWins'] / record['OppOppGames']
record['RPI'] = (.25 * own_win_pct) + (.50 * opp_win_pct) + (.25 * oppopp_win_pct)

In [None]:
# RPI differential (Team1 RPI minus Team2 RPI) for every training row.
rpi_by_team = record[['Season', 'TeamID', 'RPI']]
training_set = (
    training_set
    .merge(rpi_by_team.rename(columns={'TeamID': 'Team1', 'RPI': 'RPI_1'}),
           how='left', on=['Season', 'Team1'])
    .merge(rpi_by_team.rename(columns={'TeamID': 'Team2', 'RPI': 'RPI_2'}),
           how='left', on=['Season', 'Team2'])
)
# pop() hands back each helper column and removes it, so only deltaRPI is left.
training_set['deltaRPI'] = training_set.pop('RPI_1') - training_set.pop('RPI_2')

Create O and D efficiency ratings for each team and game  
ORtg is roughly points scored per possession and DRtg is points allowed per possession  
Possession formula comes from here: https://www.nbastuffer.com/analytics101/possession/  
eFG = effective fg %  
TOP = Turnover %  
OR% = offensive rebounding percentage  
FTR = ability to get to the line  

In [None]:
# Per-game possession, pace, efficiency and four-factor stats for both teams.
# Each stat is stored twice: under the team's own prefix, and under the other
# team's prefix as the matching "allowed" stat (DRtg / Opp*).
game_minutes = season_results['NumOT'] * 5 + 40

for side, foe in (('W', 'L'), ('L', 'W')):
    shots = season_results[side + 'FGA']
    turnovers = season_results[side + 'TO']
    off_boards = season_results[side + 'OR']

    possessions = 0.96 * (shots + turnovers - off_boards
                          + 0.475 * season_results[side + 'FTA'])

    # (own column suffix, other team's column suffix, per-game values)
    side_stats = (
        ('Poss', 'OppPoss', possessions),
        ('Pace', 'OppPace', possessions / game_minutes),
        ('ORtg', 'DRtg', season_results[side + 'Score'] / possessions),
        ('OeFG', 'OppOeFG',
         (season_results[side + 'FGM'] + season_results[side + 'FGM3'] * 0.5) / shots),
        ('TOP', 'OppTOP', turnovers / possessions),
        ('OR%', 'OppOR%', off_boards / (off_boards + season_results[foe + 'DR'])),
        ('FTR', 'OppFTR', season_results[side + 'FTM'] / shots),
    )
    for own_suffix, foe_suffix, stat in side_stats:
        season_results[side + own_suffix] = stat
        season_results[foe + foe_suffix] = season_results[side + own_suffix]

# Combined possessions of both teams in the game.
season_results['Poss'] = season_results['WPoss'] + season_results['LPoss']

Group Offensive and Defensive Efficiencies for each team and season

In [None]:
# Mean offensive / defensive rating per team and season, computed separately
# over the games the team won (W*) and the games it lost (L*).
games_by_winner = season_results.groupby(['Season', 'WTeamID'])
games_by_loser = season_results.groupby(['Season', 'LTeamID'])
ratings = pd.DataFrame({'WORtg': games_by_winner['WORtg'].mean(),
                        'WDRtg': games_by_winner['WDRtg'].mean(),
                        'LORtg': games_by_loser['LORtg'].mean(),
                        'LDRtg': games_by_loser['LDRtg'].mean()}).reset_index()

ratings.rename(columns={'level_0': 'Season',
                        'level_1': 'TeamID'}, inplace=True)

# A team with no losses has no L* means and a team with no wins has no W*
# means. Fill BOTH sides with 0: each side is multiplied by its own game count
# below, so a missing side must contribute 0 rather than NaN. Leaving the W*
# side unfilled turns a winless team's averages into NaN, and any later step
# that discards rows with missing values then loses every game that team played.
values = {'WORtg': 0,
          'WDRtg': 0,
          'LORtg': 0,
          'LDRtg': 0}
ratings = ratings.fillna(value=values)
ratings = ratings.merge(record, how='outer', on=['Season', 'TeamID'])
#create average ratings for the entire season since each team's games are split into wins and losses
#AORtg = average offensive rating
#ADRtg = average defensive rating
ratings['AORtg'] = (ratings['WORtg']*ratings['Wins'] + ratings['LORtg']*ratings['Losses'])/ratings['games']
ratings['ADRtg'] = (ratings['WDRtg']*ratings['Wins'] + ratings['LDRtg']*ratings['Losses'])/ratings['games']

Create average ratings for entire NCAA by season to compare relative strength of each team.

OStrength will give a multiplier > 1 if the team is an above average offensive team.

DStrength will give a multiplier > 1 if the team is an above average defensive team.

These multipliers will be used to adjust each team's opponent's o rating and d rating for each game. The idea is to adjust for the opponent's level.

Scoring 70 points against a team that sucks at defense should be less valuable than scoring 70 on Texas Tech for instance.

In [None]:
# Season-wide mean of the per-team average offensive and defensive ratings.
season_ratings = (
    ratings.groupby('Season')[['AORtg', 'ADRtg']]
    .mean()
    .rename(columns={'AORtg': 'NCAA AORtg', 'ADRtg': 'NCAA ADRtg'})
    .reset_index()
)

In [None]:
# Put the season-wide averages next to each team's own averages.
league_columns = ['Season', 'NCAA AORtg', 'NCAA ADRtg']
ratings = ratings.merge(season_ratings[league_columns], how='left', on=['Season'])

# OStrength > 1: the team's offensive rating is above the season average.
# DStrength > 1: the team's defensive rating is below the season average
# (the ratio is inverted, so a lower ADRtg gives a larger multiplier).
ratings['OStrength'] = ratings['AORtg'].div(ratings['NCAA AORtg'])
ratings['DStrength'] = ratings['NCAA ADRtg'].div(ratings['ADRtg'])

In [None]:
# Attach each team's season strength multipliers to every game row, first for
# the losing team (LOStrength / LDStrength), then for the winning team
# (WOStrength / WDStrength).
team_strength = ratings[['Season', 'TeamID', 'OStrength', 'DStrength']]
for side in ('L', 'W'):
    side_strength = team_strength.rename(columns={'TeamID': side + 'TeamID',
                                                  'OStrength': side + 'OStrength',
                                                  'DStrength': side + 'DStrength'})
    season_results = season_results.merge(side_strength, how='left',
                                          on=['Season', side + 'TeamID'])

# Relabel the row index as strings, then discard every game row that still has
# a missing value in any column.
season_results = season_results.rename(index=str).dropna()

In [None]:
# Adjust each game's offensive and defensive rating by the opponent's strength:
# a team's offensive rating is scaled by the opponent's defensive strength and
# its defensive rating by the opponent's offensive strength.
season_results['WAdjORtng'] = season_results['WORtg']*season_results['LDStrength']
season_results['WAdjDRtng'] = season_results['WDRtg']*season_results['LOStrength']
season_results['LAdjORtng'] = season_results['LORtg']*season_results['WDStrength']
# The loser's adjusted DEFENSIVE rating must start from its defensive rating
# (LDRtg), mirroring WAdjDRtng above; it was previously built from LORtg.
season_results['LAdjDRtng'] = season_results['LDRtg']*season_results['WOStrength']

Now we will group the stats from each team's wins and losses, sum them, and divide by games played to get per-game averages.

In [None]:
# Sum every game-level column per team and season, once over the games the
# team won (dfW) and once over the games it lost (dfL), then line both sets of
# sums up with the team's record.
cols = ['Pace','AdjORtng','AdjDRtng','OeFG','TOP','OR%','FTR','OppPace','OppOeFG','OppTOP','OppOR%','OppFTR']

# Key columns plus the W-/L-prefixed version of every stat in cols.
wcols = ['Season', 'WTeamID'] + ['W' + stat for stat in cols]
lcols = ['Season', 'LTeamID'] + ['L' + stat for stat in cols]

dfW = season_results.groupby(['Season', 'WTeamID']).sum().reset_index()
dfL = season_results.groupby(['Season', 'LTeamID']).sum().reset_index()

# Every missing value in the combined frame (for example the sums of a side
# the team has no games on) becomes 0.
df = (
    record
    .merge(dfW[wcols], how='left',
           left_on=['Season', 'TeamID'], right_on=['Season', 'WTeamID'])
    .merge(dfL[lcols], how='left',
           left_on=['Season', 'TeamID'], right_on=['Season', 'LTeamID'])
    .fillna(0)
)

In [None]:
# Per-game average of each stat: (sum over wins + sum over losses) / games.
games_played = df['games']
for stat in cols:
    won_total = df['W' + stat]
    lost_total = df['L' + stat]
    df[stat] = (won_total + lost_total) / games_played

# Drop the W-/L-prefixed helper columns; 'Season' is taken out of both lists
# first so the season column itself is kept.
wcols.remove('Season')
lcols.remove('Season')
df = df.drop(columns=wcols + lcols)

# Add the per-game averages to the team records.
record = record.merge(df[['Season', 'TeamID'] + cols], how='left', on=['Season', 'TeamID'])

In [None]:
# Attach every per-game stat to the training rows twice: suffixed _1 for Team1
# and _2 for Team2.
team_stats = record[['Season', 'TeamID'] + cols]
for team_slot, tag in (('Team1', '_1'), ('Team2', '_2')):
    slot_names = {stat: stat + tag for stat in cols}
    slot_names['TeamID'] = team_slot
    training_set = training_set.merge(team_stats.rename(columns=slot_names),
                                      how='left', on=['Season', team_slot])

In [None]:
# Replace each pair of per-team stat columns by their difference (Team1 - Team2).
for stat in cols:
    # pop() returns the column and removes it from the frame in one step.
    team1_values = training_set.pop(stat + '_1')
    team2_values = training_set.pop(stat + '_2')
    training_set['delta' + stat] = team1_values - team2_values

Write `record` and `training_set` out to CSV files so they can be used to develop a model, and display summary statistics for `training_set`.

In [None]:
# Save both tables, without the row index, to the working directory.
for frame, target in ((record, 'record.csv'), (training_set, 'training_set.csv')):
    frame.to_csv(target, index=False)

# Show floats with three decimals, then summarise the training set.
pd.options.display.float_format = lambda x: '%.3f' % x
training_set.describe()

By calculating each team's four factor stats and how effectively they limited their opponent's four factor stats, we should have a robust training set on which to develop a model.