# Men's Starters

Get ratings of each team's starters in a given year. This file needs to be run once for every relevant season (2012 to 2024; note that `cutoff_dates` below has no entry for 2020).

### Web Scraping

Fair warning: this notebook takes a very long time to run because of the web scraping limits set by Sports Reference (the code pauses for 2 seconds after every request).

In [1]:
import pandas as pd
import numpy as np
import time

season = 2024  # season to process; every later cell reads this value

# the first table on the Sports Reference school index lists every school
df_active_teams = pd.read_html('https://www.sports-reference.com/cbb/schools/index.html')[0]
time.sleep(2)  # pause after the request because of Sports Reference's scraping limits

# Drop rows without a school name and the header rows repeated inside the table body.
# notna() is used instead of isin([np.NaN, ...]) because the np.NaN alias was removed in NumPy 2.0.
df_active_teams = df_active_teams.loc[
    df_active_teams['School'].notna() & (df_active_teams['School'] != 'School'),
    :
].reset_index(drop=True)

# 'From' / 'To' hold the first and last season of each program; anything non-numeric becomes NaN
df_active_teams['From'] = pd.to_numeric(df_active_teams['From'], errors='coerce')
df_active_teams['To'] = pd.to_numeric(df_active_teams['To'], errors='coerce')

# keep only the schools whose From-To range covers the chosen season
df_active_teams = df_active_teams.loc[
    (df_active_teams['To'] >= season) & (df_active_teams['From'] <= season),
    :
].reset_index(drop=True)

if season == 2021:  # if 2021, remove ivy league because they cancelled their season in 2021
    df_active_teams = df_active_teams.loc[
        ~df_active_teams['School'].isin(['Princeton', 'Harvard', 'Yale', 'Pennsylvania', 'Brown', 'Columbia', 'Cornell', 'Dartmouth']),
        :
    ].reset_index(drop=True)

df_active_teams

Unnamed: 0,Rk,School,"City, State",From,To,Yrs,G,W,L,W-L%,SRS,SOS,AP,CREG,CTRN,NCAA,FF,NC
0,1,Abilene Christian,"Abilene, Texas",1971,2024,14,422,226,196,.536,-7.69,-5.57,0,0,2,2,0,0
1,2,Air Force,"USAF Academy, Colorado",1958,2024,67,1843,794,1049,.431,-2.79,1.34,0,1,0,4,0,0
2,3,Akron,"Akron, Ohio",1902,2024,73,1717,1027,690,.598,0.01,-1.56,0,9,6,6,0,0
3,4,Alabama,"Tuscaloosa, Alabama",1913,2024,111,2891,1790,1100,.619,7.95,4.92,17,12,10,25,0,0
4,5,Alabama A&M,"Normal, Alabama",2000,2024,25,721,276,445,.383,-16.62,-11.07,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,487,Wright State,"Dayton, Ohio",1988,2024,37,1128,631,497,.559,-1.32,-2.56,0,4,4,4,0,0
358,488,Wyoming,"Laramie, Wyoming",1905,2024,119,2876,1609,1267,.559,2.73,2.74,4,21,3,16,1,1
359,489,Xavier,"Cincinnati, Ohio",1920,2024,103,2697,1611,1086,.597,6.07,3.99,13,18,9,29,0,0
360,490,Yale,"New Haven, Connecticut",1896,2024,128,3084,1565,1516,.508,-5.99,-4.46,1,16,3,7,0,0


In [2]:
from datetime import datetime

# Cutoff date per season: later cells keep only the games played on or before it.
# Every cutoff falls in March, so only the day of the month is listed per year.
# There is no entry for 2020.
cutoff_dates = {
    year: datetime(year, 3, day)
    for year, day in (
        (2012, 14),
        (2013, 20),
        (2014, 19),
        (2015, 18),
        (2016, 16),
        (2017, 15),
        (2018, 14),
        (2019, 20),
        (2021, 18),
        (2022, 16),
        (2023, 15),
        (2024, 20),
    )
}

cutoff_dates[season]

datetime.datetime(2024, 3, 20, 0, 0)

Get school names used in Sports Reference URL

In [3]:
# a few schools are listed together with their nickname; reduce them to the plain school name
school_renames = {
    'Hartford Hawks': 'Hartford',
    'Savannah State Tigers': 'Savannah State',
    'St. Francis (NY) Terriers': 'St. Francis (NY)',
}
for listed_name, plain_name in school_renames.items():
    df_active_teams.loc[df_active_teams['School'] == listed_name, 'School'] = plain_name

# by default the URL name is the school name itself
df_active_teams['URL School'] = df_active_teams['School']

# hard code some schools that cause issues: school name -> name used to build the URL
url_school_overrides = {
    'Bowling Green': 'Bowling Green State',
    'BYU': 'Brigham Young',
    'Louisiana': 'Louisiana Lafayette',
    'LSU': 'Louisiana State',
    'Ole Miss': 'Mississippi',
    'Pitt': 'Pittsburgh',
    'SMU': 'Southern Methodist',
    'UAB': 'Alabama Birmingham',
    'UCF': 'Central Florida',
    'USC': 'Southern California',
    'UTEP': 'Texas El Paso',
    'UTSA': 'Texas San Antonio',
    'Sam Houston': 'Sam Houston State',
    'Houston Christian': 'Houston Baptist',
    'Kansas City': 'Missouri Kansas City',
    'Little Rock': 'Arkansas Little Rock',
    'NC State': 'North Carolina State',
    'Omaha': 'Nebraska Omaha',
    'Purdue Fort Wayne': 'IPFW',
    'TCU': 'Texas Christian',
    'Texas-Rio Grande Valley': 'Texas Pan American',
    'The Citadel': 'Citadel',
    'Utah Tech': 'Dixie State',
    'St. Thomas': 'St. Thomas MN',
    'UC Berkley': 'California',
    'UT Arlington': 'Texas Arlington',
    'FDU': 'Fairleigh Dickinson',
}
for school_name, url_name in url_school_overrides.items():
    df_active_teams.loc[df_active_teams['School'] == school_name, 'URL School'] = url_name

# expand the leading "UC " / "UNC " abbreviations of the remaining URL names
for prefix_pattern, expansion in ((r'^UC\s', 'California '), (r'^UNC\s', 'North Carolina ')):
    df_active_teams['URL School'] = df_active_teams['URL School'].str.replace(prefix_pattern, expansion, regex=True)

In [4]:
# turn each URL school name into a slug: spaces -> hyphens, drop ) ( & . ' characters,
# collapse the double hyphens this can leave behind, then lower-case
url_slugs = (
    df_active_teams['URL School']
    .str.replace(' ', '-', regex=False)
    .str.replace('[)(&.\']', '', regex=True)
    .str.replace('--', '-', regex=False)
    .str.lower()
)

# address of each team's starters page for the chosen season
urls = 'https://www.sports-reference.com/cbb/schools/' + url_slugs + f'/men/{season}-starters.html'

urls

0      https://www.sports-reference.com/cbb/schools/a...
1      https://www.sports-reference.com/cbb/schools/a...
2      https://www.sports-reference.com/cbb/schools/a...
3      https://www.sports-reference.com/cbb/schools/a...
4      https://www.sports-reference.com/cbb/schools/a...
                             ...                        
357    https://www.sports-reference.com/cbb/schools/w...
358    https://www.sports-reference.com/cbb/schools/w...
359    https://www.sports-reference.com/cbb/schools/x...
360    https://www.sports-reference.com/cbb/schools/y...
361    https://www.sports-reference.com/cbb/schools/y...
Name: URL School, Length: 362, dtype: object

In [5]:
from urllib.error import HTTPError
from tqdm.notebook import tqdm

problems = []  # (school, url, error) for every page that could not be downloaded

l = []  # one starters table per successfully scraped team
loop = tqdm(
    list(zip(df_active_teams['School'], urls)),  # get both school and url into loop iteration
    bar_format='{bar}{percentage:3.0f}%{r_bar}   {desc}'  # make the description not affect bar location
)

for school, url in loop:
    loop.set_description(f'{school}')
    try:
        df_team_season = pd.read_html(url)[0]  # starters should be initial table on site
    except HTTPError as e:
        print(f'{url} | {e}')
        problems.append((school, url, e))  # keep a record instead of only printing it
        if e.code != 404:
            # anything other than a missing page stops the scrape
            break
        time.sleep(2)  # a 404 still counts as a request, so pause before the next one
        continue

    time.sleep(2)  # pause between requests because of Sports Reference's scraping limits

    df_team_season.insert(4, 'Team', school)
    l.append(df_team_season)

# stack every team's table into one frame
df = pd.concat(l, ignore_index=True)

df

            0%| 0/362 [00:00<?, ?it/s]   

Unnamed: 0,G,Date,Unnamed: 2,Unnamed: 3,Team,Unnamed: 4,Opponent,Unnamed: 6,OT,Tm,Opp,W,L,Unnamed: 12
0,1,"Mon, Nov 6, 2023",,Box Score,Abilene Christian,@,Oklahoma State,W,,64,59,1,0,A. Dibba · H. Madden · K. McClain · A. Simmons...
1,2,"Fri, Nov 10, 2023",,Box Score,Abilene Christian,@,NC State,L,,64,84,1,1,A. Dibba · H. Madden · K. McClain · A. Simmons...
2,3,"Tue, Nov 14, 2023",,Box Score,Abilene Christian,,Prairie View,L,,74,79,1,2,A. Dibba · A. Igiehon · H. Madden · K. McClain...
3,4,"Fri, Nov 17, 2023",,Box Score,Abilene Christian,N,San Jose State,W,,77,71,2,2,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
4,5,"Sun, Nov 19, 2023",,Box Score,Abilene Christian,N,Fordham,W,,59,45,3,2,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11750,28,"Sat, Feb 17, 2024",,Box Score,Youngstown State,@,Cleveland State,L,,73,81,19,9,D. Burns · B. Langdon · Z. Reid · B. Rush · B....
11751,29,"Fri, Feb 23, 2024",,Box Score,Youngstown State,@,Milwaukee,W,OT,84,80,20,9,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11752,30,"Sun, Feb 25, 2024",,Box Score,Youngstown State,@,Green Bay,W,,71,59,21,9,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11753,31,"Wed, Feb 28, 2024",,Box Score,Youngstown State,,Detroit Mercy,W,,69,55,22,9,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...


In [6]:
# summarise the scraped table: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11755 entries, 0 to 11754
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   G            11755 non-null  int64  
 1   Date         11755 non-null  object 
 2   Unnamed: 2   0 non-null      float64
 3   Unnamed: 3   11755 non-null  object 
 4   Team         11755 non-null  object 
 5   Unnamed: 4   6270 non-null   object 
 6   Opponent     11755 non-null  object 
 7   Unnamed: 6   11755 non-null  object 
 8   OT           669 non-null    object 
 9   Tm           11755 non-null  int64  
 10  Opp          11755 non-null  int64  
 11  W            11755 non-null  int64  
 12  L            11755 non-null  int64  
 13  Unnamed: 12  11755 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 1.3+ MB


### Data Setup

In [7]:
# keep an untouched copy of the scraped data
# (presumably so the cleaning cells can be re-run without scraping again)
temp = df.copy()

In [8]:
# start the cleaning steps from a fresh copy of the saved scrape result
df = temp.copy()

Fix column names and drop useless columns

In [9]:
# columns that are not used in any later step
unused_columns = ['G', 'Unnamed: 2', 'Unnamed: 3', 'W', 'L']
df = df.drop(columns=unused_columns)

# the remaining columns are labelled by position, left to right
df.columns = [
    'Date',
    'Team',
    'Location',
    'Opponent',
    'Result',
    'Overtime',
    'Team Score',
    'Opponent Score',
    'Starters',
]

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,"Mon, Nov 6, 2023",Abilene Christian,@,Oklahoma State,W,,64,59,A. Dibba · H. Madden · K. McClain · A. Simmons...
1,"Fri, Nov 10, 2023",Abilene Christian,@,NC State,L,,64,84,A. Dibba · H. Madden · K. McClain · A. Simmons...
2,"Tue, Nov 14, 2023",Abilene Christian,,Prairie View,L,,74,79,A. Dibba · A. Igiehon · H. Madden · K. McClain...
3,"Fri, Nov 17, 2023",Abilene Christian,N,San Jose State,W,,77,71,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
4,"Sun, Nov 19, 2023",Abilene Christian,N,Fordham,W,,59,45,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
...,...,...,...,...,...,...,...,...,...
11750,"Sat, Feb 17, 2024",Youngstown State,@,Cleveland State,L,,73,81,D. Burns · B. Langdon · Z. Reid · B. Rush · B....
11751,"Fri, Feb 23, 2024",Youngstown State,@,Milwaukee,W,OT,84,80,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11752,"Sun, Feb 25, 2024",Youngstown State,@,Green Bay,W,,71,59,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11753,"Wed, Feb 28, 2024",Youngstown State,,Detroit Mercy,W,,69,55,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...


In [10]:
# remove games that have not been played: no result at all, or the literal 'W/L'
has_result = df['Result'].notna()
is_unplayed_marker = df['Result'] == 'W/L'
df = df[has_result & ~is_unplayed_marker].reset_index(drop=True)

# overtime flag: 1 when the overtime column holds any value, otherwise 0
df['Overtime'] = (~df['Overtime'].isna()).astype('int8')

# encode the result from its first character: win -> 1, loss -> -1
result_codes = {'W': 1, 'L': -1}
df['Result'] = df['Result'].str[:1].map(result_codes).astype('int8')

# remove games with non-D1 opponents, i.e. opponents that never appear as a scraped team
scraped_teams = df['Team']
df = df[df['Opponent'].isin(scraped_teams)].reset_index(drop=True)

# remove games where starters are not listed
df = df[~df['Starters'].isna()].reset_index(drop=True)

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,"Mon, Nov 6, 2023",Abilene Christian,@,Oklahoma State,1,0,64,59,A. Dibba · H. Madden · K. McClain · A. Simmons...
1,"Fri, Nov 10, 2023",Abilene Christian,@,NC State,-1,0,64,84,A. Dibba · H. Madden · K. McClain · A. Simmons...
2,"Tue, Nov 14, 2023",Abilene Christian,,Prairie View,-1,0,74,79,A. Dibba · A. Igiehon · H. Madden · K. McClain...
3,"Fri, Nov 17, 2023",Abilene Christian,N,San Jose State,1,0,77,71,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
4,"Sun, Nov 19, 2023",Abilene Christian,N,Fordham,1,0,59,45,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
...,...,...,...,...,...,...,...,...,...
11233,"Sat, Feb 17, 2024",Youngstown State,@,Cleveland State,-1,0,73,81,D. Burns · B. Langdon · Z. Reid · B. Rush · B....
11234,"Fri, Feb 23, 2024",Youngstown State,@,Milwaukee,1,1,84,80,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11235,"Sun, Feb 25, 2024",Youngstown State,@,Green Bay,1,0,71,59,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11236,"Wed, Feb 28, 2024",Youngstown State,,Detroit Mercy,1,0,69,55,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...


Reformat location column for home/neutral/away

In [11]:
# encode the location from the team's point of view:
# missing value (home) -> 1, 'N' (neutral) -> 0, '@' (away) -> -1
# np.nan is used as the key because the np.NaN alias was removed in NumPy 2.0
df['Location'] = df['Location'].map({np.nan: 1, 'N': 0, '@': -1}).astype('int8')

df['Location']

0       -1
1       -1
2        1
3        0
4        0
        ..
11233   -1
11234   -1
11235   -1
11236    1
11237    1
Name: Location, Length: 11238, dtype: int8

Other general fixes

In [12]:
# fix dates: parse the scraped date strings into datetimes
df['Date'] = pd.to_datetime(df['Date'])

# only include games before NCAA tournament (on or before the season's cutoff date)
df = df.loc[df['Date'] <= cutoff_dates[season], :].reset_index(drop=True)

# fix column types: every column whose name contains 'Team ' or 'Opponent ' becomes float64.
# The conversion goes through DataFrame.astype and rebinds df, because since pandas 2.0
# `df.loc[:, cols] = converted` writes into the existing columns in place and therefore
# leaves integer columns as integers.
is_score_column = (
    df.columns.str.contains('Team ', regex=False)
    | df.columns.str.contains('Opponent ', regex=False)
)
df = df.astype({column: 'float64' for column in df.columns[is_score_column]})

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,2023-11-06,Abilene Christian,-1,Oklahoma State,1,0,64.0,59.0,A. Dibba · H. Madden · K. McClain · A. Simmons...
1,2023-11-10,Abilene Christian,-1,NC State,-1,0,64.0,84.0,A. Dibba · H. Madden · K. McClain · A. Simmons...
2,2023-11-14,Abilene Christian,1,Prairie View,-1,0,74.0,79.0,A. Dibba · A. Igiehon · H. Madden · K. McClain...
3,2023-11-17,Abilene Christian,0,San Jose State,1,0,77.0,71.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
4,2023-11-19,Abilene Christian,0,Fordham,1,0,59.0,45.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...
...,...,...,...,...,...,...,...,...,...
11233,2024-02-17,Youngstown State,-1,Cleveland State,-1,0,73.0,81.0,D. Burns · B. Langdon · Z. Reid · B. Rush · B....
11234,2024-02-23,Youngstown State,-1,Milwaukee,1,1,84.0,80.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11235,2024-02-25,Youngstown State,-1,Green Bay,1,0,71.0,59.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...
11236,2024-02-28,Youngstown State,1,Detroit Mercy,1,0,69.0,55.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...


Get opponent starters

In [13]:
def _game_key(date, first_team, second_team, first_score, second_score):
    """Build a pseudo game ID per row: 'date first_team second_team first_score second_score'."""
    return (
        date.astype(str) + ' ' + first_team + ' ' + second_team
        + ' ' + first_score.astype(str) + ' ' + second_score.astype(str)
    )


# pseudo game ID seen from the row's own team ...
own_key = _game_key(df['Date'], df['Team'], df['Opponent'], df['Team Score'], df['Opponent Score'])
# ... and the same game as it is written on the opponent's row (teams and scores swapped)
mirrored_key = _game_key(df['Date'], df['Opponent'], df['Team'], df['Opponent Score'], df['Team Score'])

# look up each row's mirrored key to get the opposing starters
key_to_starters = dict(zip(own_key, df['Starters']))
df['Opponent Starters'] = mirrored_key.map(key_to_starters)

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters,Opponent Starters
0,2023-11-06,Abilene Christian,-1,Oklahoma State,1,0,64.0,59.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,J. Hicklen · M. Marsh · B. Thompson · Q. Willi...
1,2023-11-10,Abilene Christian,-1,NC State,-1,0,64.0,84.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,D. Burns · M. Diarra · D. Horne · C. Morsell ·...
2,2023-11-14,Abilene Christian,1,Prairie View,-1,0,74.0,79.0,A. Dibba · A. Igiehon · H. Madden · K. McClain...,"C. Felix, Jr. · N. Gazelas · B. Myles · A. Nun..."
3,2023-11-17,Abilene Christian,0,San Jose State,1,0,77.0,71.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,M. Amey · T. Anderson · A. Cardenas · A. Diong...
4,2023-11-19,Abilene Christian,0,Fordham,1,0,59.0,45.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,A. Charlton · J. Medor · J. Rivera · K. Rose ·...
...,...,...,...,...,...,...,...,...,...,...
11233,2024-02-17,Youngstown State,-1,Cleveland State,-1,0,73.0,81.0,D. Burns · B. Langdon · Z. Reid · B. Rush · B....,D. Arnett · T. Enaruna · D. Lowder · T. Willia...
11234,2024-02-23,Youngstown State,-1,Milwaukee,1,1,84.0,80.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,F. Fields · B. Freeman · D. Ham · E. Pratt · K...
11235,2024-02-25,Youngstown State,-1,Green Bay,1,0,71.0,59.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,R. Byhre · C. Cummings · D. Douglas Jr. · E. J...
11236,2024-02-28,Youngstown State,1,Detroit Mercy,1,0,69.0,55.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,E. Del Cadia · E. Kuac · M. Manciel · J. Stone...


In [14]:
# sanity check: show the games for which no opponent starters were found by the lookup
df.loc[df['Opponent Starters'].isna(), :]

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters,Opponent Starters


Split starters into separate columns

In [15]:
# rename starters column so both sides follow the '<side> Starters' pattern
df = df.rename(columns={'Starters': 'Team Starters'})

# pattern that separates the names inside a starters string
starter_separator = r'\xa0· | \·'

for side in ('Team', 'Opponent'):
    starter_columns = [f'{side} Starter {i}' for i in range(1, 6)]

    # one column per starter
    df[starter_columns] = df[f'{side} Starters'].str.split(starter_separator, regex=True, expand=True)

    # prefix each name with the school so the same name on different teams stays distinct
    for column in starter_columns:
        df[column] = df[side] + ' ' + df[column]

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Team Starters,Opponent Starters,Team Starter 1,Team Starter 2,Team Starter 3,Team Starter 4,Team Starter 5,Opponent Starter 1,Opponent Starter 2,Opponent Starter 3,Opponent Starter 4,Opponent Starter 5
0,2023-11-06,Abilene Christian,-1,Oklahoma State,1,0,64.0,59.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,J. Hicklen · M. Marsh · B. Thompson · Q. Willi...,Abilene Christian A. Dibba,Abilene Christian H. Madden,Abilene Christian K. McClain,Abilene Christian A. Simmons,Abilene Christian C. Steele,Oklahoma State J. Hicklen,Oklahoma State M. Marsh,Oklahoma State B. Thompson,Oklahoma State Q. Williams,Oklahoma State J. Wright
1,2023-11-10,Abilene Christian,-1,NC State,-1,0,64.0,84.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,D. Burns · M. Diarra · D. Horne · C. Morsell ·...,Abilene Christian A. Dibba,Abilene Christian H. Madden,Abilene Christian K. McClain,Abilene Christian A. Simmons,Abilene Christian C. Steele,NC State D. Burns,NC State M. Diarra,NC State D. Horne,NC State C. Morsell,NC State J. Taylor
2,2023-11-14,Abilene Christian,1,Prairie View,-1,0,74.0,79.0,A. Dibba · A. Igiehon · H. Madden · K. McClain...,"C. Felix, Jr. · N. Gazelas · B. Myles · A. Nun...",Abilene Christian A. Dibba,Abilene Christian A. Igiehon,Abilene Christian H. Madden,Abilene Christian K. McClain,Abilene Christian C. Steele,"Prairie View C. Felix, Jr.",Prairie View N. Gazelas,Prairie View B. Myles,Prairie View A. Nunley,Prairie View C. Smith
3,2023-11-17,Abilene Christian,0,San Jose State,1,0,77.0,71.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,M. Amey · T. Anderson · A. Cardenas · A. Diong...,Abilene Christian N. DeGruy,Abilene Christian A. Dibba,Abilene Christian H. Madden,Abilene Christian A. Simmons,Abilene Christian C. Steele,San Jose State M. Amey,San Jose State T. Anderson,San Jose State A. Cardenas,San Jose State A. Diongue,San Jose State T. Gorener
4,2023-11-19,Abilene Christian,0,Fordham,1,0,59.0,45.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,A. Charlton · J. Medor · J. Rivera · K. Rose ·...,Abilene Christian N. DeGruy,Abilene Christian A. Dibba,Abilene Christian H. Madden,Abilene Christian A. Simmons,Abilene Christian C. Steele,Fordham A. Charlton,Fordham J. Medor,Fordham J. Rivera,Fordham K. Rose,Fordham A. Tsimbila
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11233,2024-02-17,Youngstown State,-1,Cleveland State,-1,0,73.0,81.0,D. Burns · B. Langdon · Z. Reid · B. Rush · B....,D. Arnett · T. Enaruna · D. Lowder · T. Willia...,Youngstown State D. Burns,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Rush,Youngstown State B. Thompson,Cleveland State D. Arnett,Cleveland State T. Enaruna,Cleveland State D. Lowder,Cleveland State T. Williams,Cleveland State J. Woodrich
11234,2024-02-23,Youngstown State,-1,Milwaukee,1,1,84.0,80.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,F. Fields · B. Freeman · D. Ham · E. Pratt · K...,Youngstown State D. Burns,Youngstown State G. Dynes,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Milwaukee F. Fields,Milwaukee B. Freeman,Milwaukee D. Ham,Milwaukee E. Pratt,Milwaukee K. Pullian
11235,2024-02-25,Youngstown State,-1,Green Bay,1,0,71.0,59.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,R. Byhre · C. Cummings · D. Douglas Jr. · E. J...,Youngstown State D. Burns,Youngstown State G. Dynes,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Green Bay R. Byhre,Green Bay C. Cummings,Green Bay D. Douglas Jr.,Green Bay E. Jones,Green Bay P. Ruedinger
11236,2024-02-28,Youngstown State,1,Detroit Mercy,1,0,69.0,55.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,E. Del Cadia · E. Kuac · M. Manciel · J. Stone...,Youngstown State D. Burns,Youngstown State G. Dynes,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Detroit Mercy E. Del Cadia,Detroit Mercy E. Kuac,Detroit Mercy M. Manciel,Detroit Mercy J. Stone,Detroit Mercy M. Tankersley


Set up adjusted Score Differential

In [16]:
blowout_threshold = 18  # adjusted score differential will only account up to this score difference

# exponent chosen so that a margin equal to the threshold maps to exactly 1.5
margin_exponent = np.log(1.5) / np.log(blowout_threshold)

df['Score Differential'] = df['Team Score'] - df['Opponent Score']

# absolute margin, capped at the threshold, then compressed by the power transform
df['Adjusted Score Differential'] = (
    df['Score Differential'].abs().clip(upper=blowout_threshold) ** margin_exponent
)

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Team Starters,Opponent Starters,...,Team Starter 3,Team Starter 4,Team Starter 5,Opponent Starter 1,Opponent Starter 2,Opponent Starter 3,Opponent Starter 4,Opponent Starter 5,Score Differential,Adjusted Score Differential
0,2023-11-06,Abilene Christian,-1,Oklahoma State,1,0,64.0,59.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,J. Hicklen · M. Marsh · B. Thompson · Q. Willi...,...,Abilene Christian K. McClain,Abilene Christian A. Simmons,Abilene Christian C. Steele,Oklahoma State J. Hicklen,Oklahoma State M. Marsh,Oklahoma State B. Thompson,Oklahoma State Q. Williams,Oklahoma State J. Wright,5.0,1.253292
1,2023-11-10,Abilene Christian,-1,NC State,-1,0,64.0,84.0,A. Dibba · H. Madden · K. McClain · A. Simmons...,D. Burns · M. Diarra · D. Horne · C. Morsell ·...,...,Abilene Christian K. McClain,Abilene Christian A. Simmons,Abilene Christian C. Steele,NC State D. Burns,NC State M. Diarra,NC State D. Horne,NC State C. Morsell,NC State J. Taylor,-20.0,1.500000
2,2023-11-14,Abilene Christian,1,Prairie View,-1,0,74.0,79.0,A. Dibba · A. Igiehon · H. Madden · K. McClain...,"C. Felix, Jr. · N. Gazelas · B. Myles · A. Nun...",...,Abilene Christian H. Madden,Abilene Christian K. McClain,Abilene Christian C. Steele,"Prairie View C. Felix, Jr.",Prairie View N. Gazelas,Prairie View B. Myles,Prairie View A. Nunley,Prairie View C. Smith,-5.0,1.253292
3,2023-11-17,Abilene Christian,0,San Jose State,1,0,77.0,71.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,M. Amey · T. Anderson · A. Cardenas · A. Diong...,...,Abilene Christian H. Madden,Abilene Christian A. Simmons,Abilene Christian C. Steele,San Jose State M. Amey,San Jose State T. Anderson,San Jose State A. Cardenas,San Jose State A. Diongue,San Jose State T. Gorener,6.0,1.285760
4,2023-11-19,Abilene Christian,0,Fordham,1,0,59.0,45.0,N. DeGruy · A. Dibba · H. Madden · A. Simmons ...,A. Charlton · J. Medor · J. Rivera · K. Rose ·...,...,Abilene Christian H. Madden,Abilene Christian A. Simmons,Abilene Christian C. Steele,Fordham A. Charlton,Fordham J. Medor,Fordham J. Rivera,Fordham K. Rose,Fordham A. Tsimbila,14.0,1.448039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11233,2024-02-17,Youngstown State,-1,Cleveland State,-1,0,73.0,81.0,D. Burns · B. Langdon · Z. Reid · B. Rush · B....,D. Arnett · T. Enaruna · D. Lowder · T. Willia...,...,Youngstown State Z. Reid,Youngstown State B. Rush,Youngstown State B. Thompson,Cleveland State D. Arnett,Cleveland State T. Enaruna,Cleveland State D. Lowder,Cleveland State T. Williams,Cleveland State J. Woodrich,-8.0,1.338710
11234,2024-02-23,Youngstown State,-1,Milwaukee,1,1,84.0,80.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,F. Fields · B. Freeman · D. Ham · E. Pratt · K...,...,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Milwaukee F. Fields,Milwaukee B. Freeman,Milwaukee D. Ham,Milwaukee E. Pratt,Milwaukee K. Pullian,4.0,1.214668
11235,2024-02-25,Youngstown State,-1,Green Bay,1,0,71.0,59.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,R. Byhre · C. Cummings · D. Douglas Jr. · E. J...,...,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Green Bay R. Byhre,Green Bay C. Cummings,Green Bay D. Douglas Jr.,Green Bay E. Jones,Green Bay P. Ruedinger,12.0,1.417062
11236,2024-02-28,Youngstown State,1,Detroit Mercy,1,0,69.0,55.0,D. Burns · G. Dynes · B. Langdon · Z. Reid · B...,E. Del Cadia · E. Kuac · M. Manciel · J. Stone...,...,Youngstown State B. Langdon,Youngstown State Z. Reid,Youngstown State B. Thompson,Detroit Mercy E. Del Cadia,Detroit Mercy E. Kuac,Detroit Mercy M. Manciel,Detroit Mercy J. Stone,Detroit Mercy M. Tankersley,14.0,1.448039


### Model Building

##### Starters Ratings

Unfortunately, there is no way to differentiate between 2 starters with the same name on the same team. In such circumstances, we double count the entity. 

In [17]:
# Design matrix with one column per starter: +1 for each of the team's starters and
# -1 for each of the opponent's starters in that game. The indicator frames are added
# one after another; fill_value=0 covers players missing from one of the two operands.
X = pd.get_dummies(df['Team Starter 1']).astype('int8')

for slot in range(2, 6):
    X = X.add(pd.get_dummies(df[f'Team Starter {slot}']).astype('int8'), fill_value=0)

for slot in range(1, 6):
    X = X.add(-pd.get_dummies(df[f'Opponent Starter {slot}']).astype('int8'), fill_value=0)

# location code (1 home, 0 neutral, -1 away) as an additional feature
X['Home Field Advantage'] = df['Location'].copy()

X

  X['Home Field Advantage'] = df['Location'].copy()


Unnamed: 0,Abilene Christian A. Dibba,Abilene Christian A. Igiehon,Abilene Christian A. Simmons,Abilene Christian C. Steele,Abilene Christian H. Madden,Abilene Christian J. Seat,Abilene Christian K. McClain,Abilene Christian L. Bettiol,Abilene Christian N. DeGruy,Air Force B. Becker,...,Yale N. Townsend,Yale Y. Basa-Ama,Youngstown State B. Langdon,Youngstown State B. Rush,Youngstown State B. Thompson,Youngstown State D. Burns,Youngstown State E. Farmer,Youngstown State G. Dynes,Youngstown State Z. Reid,Home Field Advantage
0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
1,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
2,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,-1
11234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,-1
11235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,-1
11236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1


In [18]:
from sklearn.model_selection import GroupKFold

def get_gkf_data(X, y, w, groups, cv=3):
    """
    Split the training data into grouped cross-validation folds.

    X, y and w are arrays indexed by row position (features, targets, sample
    weights); groups gives the group label of every row, and rows sharing a
    label are kept in the same fold by GroupKFold. cv is the number of folds.

    Returns a list with one tuple per fold:
    (X_train, X_test, y_train, y_test, w_train).
    """
    np.random.seed(22)
    splitter = GroupKFold(n_splits=cv)

    # weights are only returned for the training rows
    return [
        (X[fit_rows], X[held_out_rows], y[fit_rows], y[held_out_rows], w[fit_rows])
        for fit_rows, held_out_rows in splitter.split(X, y, groups=groups)
    ]

# folds are grouped by date, so all games played on the same day land in the same fold
cv_data = get_gkf_data(
    X=X.to_numpy(),
    y=df['Result'].to_numpy(),
    w=df[['Adjusted Score Differential']].to_numpy(),  # kept 2-D, one column
    groups=df['Date'].to_numpy(),
)

len(cv_data)

3

In [19]:
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import warnings
import optuna

from sklearn.preprocessing import MinMaxScaler
# only show Optuna log messages of level WARNING and above
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial, cv_data=cv_data):
    """
    Optuna objective: pooled out-of-fold log loss of a weighted logistic regression.

    Tunes the regularisation parameter ``C`` and the ``(minimum, maximum)`` range
    into which each fold's training weights are min-max scaled.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyperparameters.
    cv_data : iterable of ``(X_train, X_test, y_train, y_test, w_train)`` tuples
        Defaults to the folds that were bound when this function was defined.

    Returns
    -------
    float
        Log loss computed over the test rows of all folds stacked together.
    """
    # draw the hyperparameters (suggest order: C, minimum, maximum)
    reg_c = trial.suggest_float('C', 0.1, 10, log=True)
    w_lo = trial.suggest_float('minimum', 0.1, 1.0, step=0.1)
    w_hi = trial.suggest_float('maximum', 1.0, 8.0, step=0.5)
    # both search ranges include 1.0; nudge the upper bound so the range is never empty
    if w_hi <= w_lo:
        w_hi = w_lo + 0.0001

    clf = LogisticRegression(penalty='l2', C=reg_c, fit_intercept=False)

    # cross validation: collect targets and predicted probabilities fold by fold
    truth_parts, prob_parts = [], []
    for X_train, X_test, y_train, y_test, w_train in cv_data:
        truth_parts.append(y_test)

        fold_scaler = MinMaxScaler(feature_range=(w_lo, w_hi))
        fold_weights = fold_scaler.fit_transform(w_train).reshape(-1)

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')  # prevent convergence warnings
            clf.fit(X_train, y_train, sample_weight=fold_weights)

        fold_probs = clf.predict_proba(X_test)[:, 1]
        prob_parts.append(fold_probs)

    pooled_truth = np.hstack(truth_parts)
    pooled_probs = np.hstack(prob_parts)
    return log_loss(pooled_truth, pooled_probs)

# TPE search with a fixed sampler seed, minimising the cross-validated log loss
tpe_sampler = optuna.samplers.TPESampler(seed=22)
study = optuna.create_study(sampler=tpe_sampler, direction='minimize')
study.optimize(objective, show_progress_bar=True, n_trials=100)

study.best_params

  0%|          | 0/100 [00:00<?, ?it/s]

{'C': 0.11724716973681776, 'minimum': 0.2, 'maximum': 1.5}

In [20]:
# Optuna's importance estimate for each tuned hyperparameter, based on the study's trials
optuna.importance.get_param_importances(study)

{'C': 0.8411629080975099,
 'minimum': 0.11592975755452764,
 'maximum': 0.04290733434796252}

In [21]:
# Refit on every game using the best hyperparameters found by the study.
tuned = study.best_params
minimum, maximum = tuned['minimum'], tuned['maximum']

# same guard as in the search: the weight range must not be empty
if maximum <= minimum:
    maximum = minimum + 0.0001

weight_scaler = MinMaxScaler(feature_range=(minimum, maximum))
weight = weight_scaler.fit_transform(df[['Adjusted Score Differential']]).reshape(-1)

mod = LogisticRegression(penalty='l2', C=tuned['C'], fit_intercept=False)
mod.fit(X, df['Result'], sample_weight=weight)

# one coefficient per column of X, largest first
ratings_by_column = {'Starter': X.columns, 'Rating': mod.coef_[0]}
df_ratings = pd.DataFrame(ratings_by_column).sort_values(by=['Rating'], ascending=False, ignore_index=True)

# leave the home-field term out of the displayed table and number rows from 1
is_starter = df_ratings['Starter'] != 'Home Field Advantage'
df_ratings_display = df_ratings.loc[is_starter, :].reset_index(drop=True)
df_ratings_display.index = df_ratings_display.index + 1

df_ratings_display.head(25)

Unnamed: 0,Starter,Rating
1,Kentucky D. Wagner,0.682051
2,Connecticut T. Newton,0.649121
3,Connecticut C. Spencer,0.649121
4,Auburn A. Holloway,0.614663
5,Auburn D. Jones,0.583905
6,Duke M. Mitchell,0.582029
7,Houston L. Cryer,0.579957
8,Houston J. Shead,0.579957
9,Houston J. Francis,0.579957
10,Connecticut S. Castle,0.571875


### Miscellaneous

##### Save Rankings

Get the last available starting lineup for each team, then average its five starters' ratings to get the team rating

In [22]:
# lookup table: starter name -> fitted rating (taken from the display table)
starter_to_rating = {
    starter: rating
    for starter, rating in zip(df_ratings_display['Starter'], df_ratings_display['Rating'])
}

len(starter_to_rating)

3201

In [23]:
# keep, for every team, the row with the latest date and only its five starter columns
starter_cols = [f'Team Starter {slot}' for slot in range(1, 6)]
df_last_starters = (
    df.sort_values(by=['Date'])
    .groupby(['Team'])
    .tail(1)
    .reset_index(drop=True)
    [['Team'] + starter_cols]
)

df_last_starters

Unnamed: 0,Team,Team Starter 1,Team Starter 2,Team Starter 3,Team Starter 4,Team Starter 5
0,Chicago State,Chicago State W. Cardet,Chicago State J. Corbett,Chicago State N. Crawford,Chicago State B. Davis,Chicago State C. Jernigan
1,Bellarmine,Bellarmine L. Hatton,Bellarmine B. Johnson,Bellarmine P. Suder,Bellarmine G. Tipton,Bellarmine B. Wieland
2,Central Arkansas,Central Arkansas T. Anderson,Central Arkansas E. Cato,Central Arkansas I. Klintman,Central Arkansas M. Olowokere,Central Arkansas D. Sofield
3,Tennessee Tech,Tennessee Tech J. Davis,Tennessee Tech D. Egbuniwe,Tennessee Tech J. Harvey,Tennessee Tech K. Layton,Tennessee Tech D. Wood
4,Lindenwood,Lindenwood D. Beane,Lindenwood K. Cole,Lindenwood K. Haymon,Lindenwood J. McDaniel,Lindenwood C. Ruffin
...,...,...,...,...,...,...
357,South Florida,South Florida K. Knox,South Florida J. Placer,South Florida K. Pryor,South Florida B. Stroud,South Florida C. Youngblood
358,Providence,Providence G. Dual,Providence C. Floyd,Providence D. Gaines,Providence J. Oduro,Providence J. Pierre
359,Virginia,Virginia R. Beekman,Virginia R. Dunn,Virginia I. McKneely,Virginia J. Minor,Virginia T. Murray
360,Wagner,Wagner T. Allen,Wagner J. Brown,Wagner M. Council Jr.,Wagner J. Ezquerra,Wagner K. Lewis


In [24]:
# look up each starter's rating, then average the five ratings per team
rating_cols = []
for slot in range(1, 6):
    starter_col = f'Team Starter {slot}'
    rating_col = f'{starter_col} Rating'
    df_last_starters[rating_col] = df_last_starters[starter_col].map(starter_to_rating)
    rating_cols.append(rating_col)

df_last_starters['Rating'] = df_last_starters[rating_cols].mean(axis=1)

df_last_starters

Unnamed: 0,Team,Team Starter 1,Team Starter 2,Team Starter 3,Team Starter 4,Team Starter 5,Team Starter 1 Rating,Team Starter 2 Rating,Team Starter 3 Rating,Team Starter 4 Rating,Team Starter 5 Rating,Rating
0,Chicago State,Chicago State W. Cardet,Chicago State J. Corbett,Chicago State N. Crawford,Chicago State B. Davis,Chicago State C. Jernigan,-0.259754,-0.230212,0.187788,-0.381526,-0.053285,-0.147398
1,Bellarmine,Bellarmine L. Hatton,Bellarmine B. Johnson,Bellarmine P. Suder,Bellarmine G. Tipton,Bellarmine B. Wieland,-0.351918,-0.148645,-0.351918,-0.351918,-0.150141,-0.270908
2,Central Arkansas,Central Arkansas T. Anderson,Central Arkansas E. Cato,Central Arkansas I. Klintman,Central Arkansas M. Olowokere,Central Arkansas D. Sofield,-0.216398,-0.667990,-0.141727,-0.437200,-0.120833,-0.316830
3,Tennessee Tech,Tennessee Tech J. Davis,Tennessee Tech D. Egbuniwe,Tennessee Tech J. Harvey,Tennessee Tech K. Layton,Tennessee Tech D. Wood,-0.306122,-0.233920,-0.256035,-0.135884,-0.238218,-0.234036
4,Lindenwood,Lindenwood D. Beane,Lindenwood K. Cole,Lindenwood K. Haymon,Lindenwood J. McDaniel,Lindenwood C. Ruffin,-0.473702,-0.578937,0.084340,-0.228978,-0.296894,-0.298834
...,...,...,...,...,...,...,...,...,...,...,...,...
357,South Florida,South Florida K. Knox,South Florida J. Placer,South Florida K. Pryor,South Florida B. Stroud,South Florida C. Youngblood,0.323854,0.029609,0.528797,0.245222,0.273304,0.280157
358,Providence,Providence G. Dual,Providence C. Floyd,Providence D. Gaines,Providence J. Oduro,Providence J. Pierre,-0.245961,0.002267,0.174983,0.214397,0.081703,0.045478
359,Virginia,Virginia R. Beekman,Virginia R. Dunn,Virginia I. McKneely,Virginia J. Minor,Virginia T. Murray,0.222578,0.222578,0.162981,0.106739,-0.152834,0.112408
360,Wagner,Wagner T. Allen,Wagner J. Brown,Wagner M. Council Jr.,Wagner J. Ezquerra,Wagner K. Lewis,0.084284,0.092731,-0.042122,-0.132374,-0.132374,-0.025971


In [25]:
# team table ordered from the highest to the lowest mean starter rating
team_ratings = df_last_starters[['Team', 'Rating']].copy()
df_sheet = team_ratings.sort_values(by=['Rating'], ascending=False, ignore_index=True)

df_sheet.head(25)

Unnamed: 0,Team,Rating
0,Connecticut,0.575063
1,Purdue,0.566599
2,Houston,0.521213
3,Auburn,0.516003
4,North Carolina,0.468077
5,McNeese State,0.437639
6,Illinois,0.43568
7,James Madison,0.432858
8,Tennessee,0.427364
9,Arizona,0.410191


In [26]:
# write this season's team ratings and the underlying `df` to their CSV files
ratings_path = f'../data/preprocessed/starters/starters_{season}.csv'
games_path = f'../data/unprocessed/sports_reference_starters/sports_reference_starters_{season}.csv'

df_sheet.to_csv(ratings_path, index=False)
df.to_csv(games_path, index=False)

'Done'

'Done'