# Women's Starters

Get ratings of each team's starters in a given year. This file needs to be ran for every relevant year. 

### Web Scraping

Fair warning that this program unfortunately takes a very long time to run due to web scraping limits set by Sports Reference

In [1]:
import pandas as pd
import numpy as np
import time

season = 2023

df_active_teams = pd.read_html('https://www.sports-reference.com/cbb/schools/index.html')[0]
time.sleep(2)

df_active_teams = df_active_teams.loc[~df_active_teams['School'].isin([np.NaN, 'School']), :].reset_index(drop=True)

df_active_teams['From'] = pd.to_numeric(df_active_teams['From'], errors='coerce')
df_active_teams['To'] = pd.to_numeric(df_active_teams['To'], errors='coerce')

df_active_teams = df_active_teams.loc[(df_active_teams['To'] >= season) & (df_active_teams['From'] <= season), :].reset_index(drop=True)

if season == 2021:  # if 2021, remove ivy league because they cancelled their season in 2021
    df_active_teams = df_active_teams.loc[
        ~df_active_teams['School'].isin(['Princeton', 'Harvard', 'Yale', 'Pennsylvania', 'Brown', 'Columbia', 'Cornell', 'Dartmouth']),
        :
    ].reset_index(drop=True)

df_active_teams

Unnamed: 0,Rk,School,"City, State",From,To,Yrs,G,W,L,W-L%,SRS,SOS,AP,CREG,CTRN,NCAA,FF,NC
0,1,Abilene Christian,"Abilene, Texas",1971,2024,14,420,225,195,.536,-7.71,-5.56,0,0,2,2,0,0
1,2,Air Force,"USAF Academy, Colorado",1958,2024,67,1842,794,1048,.431,-2.78,1.34,0,1,0,4,0,0
2,3,Akron,"Akron, Ohio",1902,2024,73,1714,1024,690,.597,0.01,-1.56,0,9,5,5,0,0
3,4,Alabama,"Tuscaloosa, Alabama",1913,2024,111,2890,1790,1099,.620,7.96,4.92,17,12,9,24,0,0
4,5,Alabama A&M,"Normal, Alabama",2000,2024,25,719,275,444,.382,-16.63,-11.06,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,487,Wright State,"Dayton, Ohio",1988,2024,37,1128,631,497,.559,-1.32,-2.55,0,4,4,4,0,0
359,488,Wyoming,"Laramie, Wyoming",1905,2024,119,2875,1609,1266,.560,2.74,2.75,4,21,3,16,1,1
360,489,Xavier,"Cincinnati, Ohio",1920,2024,103,2694,1610,1084,.598,6.08,3.99,13,18,9,29,0,0
361,490,Yale,"New Haven, Connecticut",1896,2024,128,3082,1563,1516,.508,-5.99,-4.45,1,16,2,6,0,0


Heads up that there is an issue with 2023. Some games from UAB and Florida international have either more or less than 5 starters listed. I manually removed those games from this analysis. 

In [2]:
from datetime import datetime

cutoff_dates = {
    2012: datetime(2012, 3, 16),
    2013: datetime(2013, 3, 22),
    2014: datetime(2014, 3, 21),
    2015: datetime(2015, 3, 19),
    2016: datetime(2016, 3, 17),
    2017: datetime(2017, 3, 16),
    2018: datetime(2018, 3, 15),
    2019: datetime(2019, 3, 21),
    2020: datetime(2020, 3, 13),  # cancelled
    2021: datetime(2021, 3, 24),
    2022: datetime(2022, 3, 17),
    2023: datetime(2023, 3, 16),
}

cutoff_dates[season]

datetime.datetime(2023, 3, 16, 0, 0)

Get school names used in Sports Reference URL

In [3]:
df_active_teams.loc[df_active_teams['School'] == 'Hartford Hawks', 'School'] = 'Hartford'
df_active_teams.loc[df_active_teams['School'] == 'Savannah State Tigers', 'School'] = 'Savannah State'
df_active_teams.loc[df_active_teams['School'] == 'St. Francis (NY) Terriers', 'School'] = 'St. Francis (NY)'

df_active_teams['URL School'] = df_active_teams['School']

# hard code some schools that cause issues
df_active_teams.loc[df_active_teams['School'] == 'Bowling Green', 'URL School'] = 'Bowling Green State'
df_active_teams.loc[df_active_teams['School'] == 'BYU', 'URL School'] = 'Brigham Young'
df_active_teams.loc[df_active_teams['School'] == 'Louisiana', 'URL School'] = 'Louisiana Lafayette'
df_active_teams.loc[df_active_teams['School'] == 'LSU', 'URL School'] = 'Louisiana State'
df_active_teams.loc[df_active_teams['School'] == 'Ole Miss', 'URL School'] = 'Mississippi'
df_active_teams.loc[df_active_teams['School'] == 'Pitt', 'URL School'] = 'Pittsburgh'
df_active_teams.loc[df_active_teams['School'] == 'SMU', 'URL School'] = 'Southern Methodist'
df_active_teams.loc[df_active_teams['School'] == 'UAB', 'URL School'] = 'Alabama Birmingham'
df_active_teams.loc[df_active_teams['School'] == 'UCF', 'URL School'] = 'Central Florida'
df_active_teams.loc[df_active_teams['School'] == 'USC', 'URL School'] = 'Southern California'
df_active_teams.loc[df_active_teams['School'] == 'UTEP', 'URL School'] = 'Texas El Paso'
df_active_teams.loc[df_active_teams['School'] == 'UTSA', 'URL School'] = 'Texas San Antonio'
df_active_teams.loc[df_active_teams['School'] == 'Sam Houston', 'URL School'] = 'Sam Houston State'
df_active_teams.loc[df_active_teams['School'] == 'Houston Christian', 'URL School'] = 'Houston Baptist'
df_active_teams.loc[df_active_teams['School'] == 'Kansas City', 'URL School'] = 'Missouri Kansas City'
df_active_teams.loc[df_active_teams['School'] == 'Little Rock', 'URL School'] = 'Arkansas Little Rock'
df_active_teams.loc[df_active_teams['School'] == 'NC State', 'URL School'] = 'North Carolina State'
df_active_teams.loc[df_active_teams['School'] == 'Omaha', 'URL School'] = 'Nebraska Omaha'
df_active_teams.loc[df_active_teams['School'] == 'Purdue Fort Wayne', 'URL School'] = 'IPFW'
df_active_teams.loc[df_active_teams['School'] == 'TCU', 'URL School'] = 'Texas Christian'
df_active_teams.loc[df_active_teams['School'] == 'Texas-Rio Grande Valley', 'URL School'] = 'Texas Pan American'
df_active_teams.loc[df_active_teams['School'] == 'The Citadel', 'URL School'] = 'Citadel'
df_active_teams.loc[df_active_teams['School'] == 'Utah Tech', 'URL School'] = 'Dixie State'
df_active_teams.loc[df_active_teams['School'] == 'St. Thomas', 'URL School'] = 'St. Thomas MN'
df_active_teams.loc[df_active_teams['School'] == 'UC Berkley', 'URL School'] = 'California'
df_active_teams.loc[df_active_teams['School'] == 'UT Arlington', 'URL School'] = 'Texas Arlington'
df_active_teams.loc[df_active_teams['School'] == 'FDU', 'URL School'] = 'Fairleigh Dickinson'

df_active_teams['URL School'] = df_active_teams['URL School'].str.replace(r'^UC\s', 'California ', regex=True)
df_active_teams['URL School'] = df_active_teams['URL School'].str.replace(r'^UNC\s', 'North Carolina ', regex=True)

In [4]:
urls = (
    'https://www.sports-reference.com/cbb/schools/' +
    df_active_teams['URL School'].str.replace(' ', '-', regex=False).str.replace('[)(&.\']', '', regex=True).str.replace('--', '-', regex=False).str.lower() +
    '/women/' + str(season) +
    '-starters.html'
)

urls

0      https://www.sports-reference.com/cbb/schools/a...
1      https://www.sports-reference.com/cbb/schools/a...
2      https://www.sports-reference.com/cbb/schools/a...
3      https://www.sports-reference.com/cbb/schools/a...
4      https://www.sports-reference.com/cbb/schools/a...
                             ...                        
358    https://www.sports-reference.com/cbb/schools/w...
359    https://www.sports-reference.com/cbb/schools/w...
360    https://www.sports-reference.com/cbb/schools/x...
361    https://www.sports-reference.com/cbb/schools/y...
362    https://www.sports-reference.com/cbb/schools/y...
Name: URL School, Length: 363, dtype: object

In [5]:
from urllib.error import HTTPError
from tqdm.notebook import tqdm

problems = []

l = []
loop = tqdm(
    np.array((df_active_teams['School'], urls)).transpose(),  # get both school and url into loop iteration
    bar_format='{bar}{percentage:3.0f}%{r_bar}   {desc}'  # make the description not affect bar location
)

for (school, url) in loop:
  try:
    loop.set_description(f'{school}')
    # print(f'{i:03d}: {url}')
    df_team_season = pd.read_html(url)[0]  # starters should be initial table on site
    time.sleep(2)
    
    df_team_season.insert(4, 'Team', school)
    l.append(df_team_season)
  except HTTPError as e:
    if e.code == 404:
      print(f'{url} | {e}')
      time.sleep(2)
    else:
      print(f'{url} | {e}')
      break

df = pd.concat(l, ignore_index=True)

df

            0%| 0/363 [00:00<?, ?it/s]   

https://www.sports-reference.com/cbb/schools/citadel/women/2023-starters.html | HTTP Error 404: Not Found
https://www.sports-reference.com/cbb/schools/virginia-military-institute/women/2023-starters.html | HTTP Error 404: Not Found


Unnamed: 0,G,Date,Unnamed: 2,Unnamed: 3,Team,Unnamed: 4,Opponent,Unnamed: 6,OT,Tm,Opp,W,L,Unnamed: 12
0,1,"Wed, Nov 9, 2022",,Box Score,Abilene Christian,,Howard Payne,W,,89,37,1,0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
1,2,"Sun, Nov 13, 2022",,Box Score,Abilene Christian,@,Rice,L,,69,85,1,1,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
2,3,"Wed, Nov 16, 2022",,Box Score,Abilene Christian,,University of the Southwest,W,,77,36,2,1,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
3,4,"Sun, Nov 20, 2022",,Box Score,Abilene Christian,@,UTSA,L,,70,76,2,2,B. Earle · M. Langstaff · A. Martin · M. Mille...
4,5,"Fri, Nov 25, 2022",,Box Score,Abilene Christian,N,Montana State,W,,66,61,3,2,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,26,"Fri, Feb 17, 2023",,Box Score,Youngstown State,,Milwaukee,L,,54,75,18,8,M. Callahan · D. Jarrells · M. Magestro · L. R...
11332,27,"Sun, Feb 19, 2023",,Box Score,Youngstown State,,Green Bay,L,,54,67,18,9,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...
11333,28,"Tue, Feb 21, 2023",,Box Score,Youngstown State,@,IUPUI,L,,80,87,18,10,M. Callahan · D. Jarrells · M. Magestro · L. R...
11334,29,"Sat, Feb 25, 2023",,Box Score,Youngstown State,@,Robert Morris,W,,65,51,19,10,M. Callahan · D. Jarrells · M. Magestro · L. R...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11336 entries, 0 to 11335
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   G            11336 non-null  int64  
 1   Date         11336 non-null  object 
 2   Unnamed: 2   0 non-null      float64
 3   Unnamed: 3   11336 non-null  object 
 4   Team         11336 non-null  object 
 5   Unnamed: 4   6070 non-null   object 
 6   Opponent     11336 non-null  object 
 7   Unnamed: 6   11336 non-null  object 
 8   OT           526 non-null    object 
 9   Tm           11336 non-null  int64  
 10  Opp          11336 non-null  int64  
 11  W            11336 non-null  int64  
 12  L            11336 non-null  int64  
 13  Unnamed: 12  11336 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 1.2+ MB


### Data Setup

In [7]:
temp = df.copy()

In [8]:
df = temp.copy()

Fix column names and drop useless columns

In [9]:
df.drop(columns=['G', 'Unnamed: 2', 'Unnamed: 3', 'W', 'L'], inplace=True)

df.columns = ['Date', 'Team', 'Location', 'Opponent', 'Result', 'Overtime', 'Team Score', 'Opponent Score', 'Starters']

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,"Wed, Nov 9, 2022",Abilene Christian,,Howard Payne,W,,89,37,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
1,"Sun, Nov 13, 2022",Abilene Christian,@,Rice,L,,69,85,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
2,"Wed, Nov 16, 2022",Abilene Christian,,University of the Southwest,W,,77,36,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
3,"Sun, Nov 20, 2022",Abilene Christian,@,UTSA,L,,70,76,B. Earle · M. Langstaff · A. Martin · M. Mille...
4,"Fri, Nov 25, 2022",Abilene Christian,N,Montana State,W,,66,61,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
...,...,...,...,...,...,...,...,...,...
11331,"Fri, Feb 17, 2023",Youngstown State,,Milwaukee,L,,54,75,M. Callahan · D. Jarrells · M. Magestro · L. R...
11332,"Sun, Feb 19, 2023",Youngstown State,,Green Bay,L,,54,67,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...
11333,"Tue, Feb 21, 2023",Youngstown State,@,IUPUI,L,,80,87,M. Callahan · D. Jarrells · M. Magestro · L. R...
11334,"Sat, Feb 25, 2023",Youngstown State,@,Robert Morris,W,,65,51,M. Callahan · D. Jarrells · M. Magestro · L. R...


In [10]:
# remove games that have not been played
df = df.loc[(df['Result'].notna()) & (df['Result'] != 'W/L'), :].reset_index(drop=True)

# get overtime info
df['Overtime'] = df['Overtime'].notna().astype('int8')

# reformat result column
df['Result'] = df['Result'].str[:1].map({'W': 1, 'L': -1}).astype('int8')

# remove games with non-D1 opponents
df = df.loc[df['Opponent'].isin(df['Team']), :].reset_index(drop=True)

# remove games where starters are not listed
df = df.loc[df['Starters'].notna(), :].reset_index(drop=True)

df

Unnamed: 0,Date,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,"Sun, Nov 13, 2022",Abilene Christian,@,Rice,-1,0,69,85,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
1,"Sun, Nov 20, 2022",Abilene Christian,@,UTSA,-1,0,70,76,B. Earle · M. Langstaff · A. Martin · M. Mille...
2,"Fri, Nov 25, 2022",Abilene Christian,N,Montana State,1,0,66,61,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
3,"Sun, Nov 27, 2022",Abilene Christian,N,Louisiana,-1,0,38,54,B. Earle · M. Langstaff · A. Martin · M. Mille...
4,"Fri, Dec 2, 2022",Abilene Christian,,Alabama State,1,0,78,67,B. Earle · M. Langstaff · A. Martin · M. Mille...
...,...,...,...,...,...,...,...,...,...
11027,"Fri, Feb 17, 2023",Youngstown State,,Milwaukee,-1,0,54,75,M. Callahan · D. Jarrells · M. Magestro · L. R...
11028,"Sun, Feb 19, 2023",Youngstown State,,Green Bay,-1,0,54,67,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...
11029,"Tue, Feb 21, 2023",Youngstown State,@,IUPUI,-1,0,80,87,M. Callahan · D. Jarrells · M. Magestro · L. R...
11030,"Sat, Feb 25, 2023",Youngstown State,@,Robert Morris,1,0,65,51,M. Callahan · D. Jarrells · M. Magestro · L. R...


Reformat location column for home/neutral/away

In [11]:
df['Location'] = df['Location'].map({np.NaN: 1, 'N': 0, '@': -1}).astype('int8')

df['Location']

0       -1
1       -1
2        0
3        0
4        1
        ..
11027    1
11028    1
11029   -1
11030   -1
11031    1
Name: Location, Length: 11032, dtype: int8

Other general fixes

In [12]:
# fix dates
df['Date'] = pd.to_datetime(df['Date'])

# specify before and after NCAA tournament
df.insert(df.columns.get_loc('Date') + 1, 'NCAA Tournament', (df['Date'] > cutoff_dates[season]).astype(int))

# for starters, we only look at pre-tournament games as we are not looking at past years
df = df.loc[df['NCAA Tournament'] == 0, :].reset_index(drop=True)

# fix column types
df.loc[:, df.columns.str.contains('Team ', regex=False)] = df.loc[:, df.columns.str.contains('Team ', regex=False)].astype('float64')
df.loc[:, df.columns.str.contains('Opponent ', regex=False)] = df.loc[:, df.columns.str.contains('Opponent ', regex=False)].astype('float64')

df

Unnamed: 0,Date,NCAA Tournament,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters
0,2022-11-13,0,Abilene Christian,-1,Rice,-1,0,69.0,85.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
1,2022-11-20,0,Abilene Christian,-1,UTSA,-1,0,70.0,76.0,B. Earle · M. Langstaff · A. Martin · M. Mille...
2,2022-11-25,0,Abilene Christian,0,Montana State,1,0,66.0,61.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...
3,2022-11-27,0,Abilene Christian,0,Louisiana,-1,0,38.0,54.0,B. Earle · M. Langstaff · A. Martin · M. Mille...
4,2022-12-02,0,Abilene Christian,1,Alabama State,1,0,78.0,67.0,B. Earle · M. Langstaff · A. Martin · M. Mille...
...,...,...,...,...,...,...,...,...,...,...
10797,2023-02-17,0,Youngstown State,1,Milwaukee,-1,0,54.0,75.0,M. Callahan · D. Jarrells · M. Magestro · L. R...
10798,2023-02-19,0,Youngstown State,1,Green Bay,-1,0,54.0,67.0,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...
10799,2023-02-21,0,Youngstown State,-1,IUPUI,-1,0,80.0,87.0,M. Callahan · D. Jarrells · M. Magestro · L. R...
10800,2023-02-25,0,Youngstown State,-1,Robert Morris,1,0,65.0,51.0,M. Callahan · D. Jarrells · M. Magestro · L. R...


Get opponent starters

In [13]:
# create a pseudo game ID that can be looked at from opponent's perspective to get opposing starters
key_to_starters = dict(zip(
    df['Date'].astype(str) + ' ' + df['Team'] + ' ' + df['Opponent'] + ' ' + df['Team Score'].astype(str) + ' ' + df['Opponent Score'].astype(str),
    df['Starters']
))

df['Opponent Starters'] = (df['Date'].astype(str) + ' ' + df['Opponent'] + ' ' + df['Team'] + ' ' + df['Opponent Score'].astype(str) + ' ' + df['Team Score'].astype(str)).map(key_to_starters)

df

Unnamed: 0,Date,NCAA Tournament,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters,Opponent Starters
0,2022-11-13,0,Abilene Christian,-1,Rice,-1,0,69.0,85.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...,A. Austin · M. Bokunewicz · K. Crosthwait · M....
1,2022-11-20,0,Abilene Christian,-1,UTSA,-1,0,70.0,76.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,E. Coleman · J. Jenkins · S. Love · D. Nwakamm...
2,2022-11-25,0,Abilene Christian,0,Montana State,1,0,66.0,61.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...,L. Beattie · L. Deden · M. Dykstra · K. Limard...
3,2022-11-27,0,Abilene Christian,0,Louisiana,-1,0,38.0,54.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,N. Benedith · T. Johnson · D. Rice · L. Wheato...
4,2022-12-02,0,Abilene Christian,1,Alabama State,1,0,78.0,67.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,J. Crawford · A. Emmanuel · C. Harris · S. Ste...
...,...,...,...,...,...,...,...,...,...,...,...
10797,2023-02-17,0,Youngstown State,1,Milwaukee,-1,0,54.0,75.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,A. Cera · G. Crowley · J. Donaldson · K. Nead ...
10798,2023-02-19,0,Youngstown State,1,Green Bay,-1,0,54.0,67.0,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...,B. Butler · C. Genke · J. Kondrakiewicz · S. L...
10799,2023-02-21,0,Youngstown State,-1,IUPUI,-1,0,80.0,87.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,N. Andersen · A. Berg · R. Kent · D. Perkins ·...
10800,2023-02-25,0,Youngstown State,-1,Robert Morris,1,0,65.0,51.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,M. Amalia · R. Dwomoh · A. Mastral · S. Morris...


In [14]:
df.loc[df['Opponent Starters'].isna(), :]

Unnamed: 0,Date,NCAA Tournament,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters,Opponent Starters


In [15]:
df = df.loc[df['Opponent Starters'].notna(), :].reset_index(drop=True)

df

Unnamed: 0,Date,NCAA Tournament,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Starters,Opponent Starters
0,2022-11-13,0,Abilene Christian,-1,Rice,-1,0,69.0,85.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...,A. Austin · M. Bokunewicz · K. Crosthwait · M....
1,2022-11-20,0,Abilene Christian,-1,UTSA,-1,0,70.0,76.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,E. Coleman · J. Jenkins · S. Love · D. Nwakamm...
2,2022-11-25,0,Abilene Christian,0,Montana State,1,0,66.0,61.0,B. Earle · Z. Jackson · M. Langstaff · A. Mart...,L. Beattie · L. Deden · M. Dykstra · K. Limard...
3,2022-11-27,0,Abilene Christian,0,Louisiana,-1,0,38.0,54.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,N. Benedith · T. Johnson · D. Rice · L. Wheato...
4,2022-12-02,0,Abilene Christian,1,Alabama State,1,0,78.0,67.0,B. Earle · M. Langstaff · A. Martin · M. Mille...,J. Crawford · A. Emmanuel · C. Harris · S. Ste...
...,...,...,...,...,...,...,...,...,...,...,...
10797,2023-02-17,0,Youngstown State,1,Milwaukee,-1,0,54.0,75.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,A. Cera · G. Crowley · J. Donaldson · K. Nead ...
10798,2023-02-19,0,Youngstown State,1,Green Bay,-1,0,54.0,67.0,M. Callahan · D. Jarrells · L. Mack · L. Ritz ...,B. Butler · C. Genke · J. Kondrakiewicz · S. L...
10799,2023-02-21,0,Youngstown State,-1,IUPUI,-1,0,80.0,87.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,N. Andersen · A. Berg · R. Kent · D. Perkins ·...
10800,2023-02-25,0,Youngstown State,-1,Robert Morris,1,0,65.0,51.0,M. Callahan · D. Jarrells · M. Magestro · L. R...,M. Amalia · R. Dwomoh · A. Mastral · S. Morris...


Split starters into separate columns

In [24]:
df['Opponent Starters'].str.split(r'\xa0· | \·', regex=True, expand=True)

Unnamed: 0,0,1,2,3,4
0,A. Austin,M. Bokunewicz,K. Crosthwait,M. Fisher,D. Jackson
1,E. Coleman,J. Jenkins,S. Love,D. Nwakamma,K. White
2,L. Beattie,L. Deden,M. Dykstra,K. Limardo,D. White
3,N. Benedith,T. Johnson,D. Rice,L. Wheaton,C. Wren
4,J. Crawford,A. Emmanuel,C. Harris,S. Steele,R. Tillis
...,...,...,...,...,...
10797,A. Cera,G. Crowley,J. Donaldson,K. Nead,M. Walstad
10798,B. Butler,C. Genke,J. Kondrakiewicz,S. Levy,C. Schiltz
10799,N. Andersen,A. Berg,R. Kent,D. Perkins,J. Turner
10800,M. Amalia,R. Dwomoh,A. Mastral,S. Morris,D. Vuletich


In [25]:
_.loc[_[4].isna(), :]

Unnamed: 0,0,1,2,3,4
9478,E. Arike,N. Boyd,J. Jackson,A. Tac,
9947,D. DeShields,M. Walsh,D. Ware,,


In [27]:
df['Opponent Starters'].iloc[9947]

'D. DeShields\xa0· M. Walsh\xa0· D. Ware'

In [21]:
df['Opponent Starters'].iloc[9467] = 'H. Butera\xa0· T. Hayes\xa0· S. Joyeuse\xa0· M. Torres\xa0· O. Trice'

df['Opponent Starters'].iloc[9467]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Opponent Starters'].iloc[9467] = 'H. Butera\xa0· T. Hayes\xa0· S. Joyeuse\xa0· M. Torres\xa0· O. Trice'


'H. Butera\xa0· T. Hayes\xa0· S. Joyeuse\xa0· M. Torres\xa0· O. Trice'

In [22]:
# rename starters column
df.rename(columns={'Starters': 'Team Starters'}, inplace=True)

df[[f'Team Starter {i}' for i in range(1, 6)]] = df['Team Starters'].str.split(r'\xa0· | \·', regex=True, expand=True)

for i in range(1, 6):
    df[f'Team Starter {i}'] = df['Team'] + ' ' + df[f'Team Starter {i}']

df[[f'Opponent Starter {i}' for i in range(1, 6)]] = df['Opponent Starters'].str.split(r'\xa0· | \·', regex=True, expand=True)

for i in range(1, 6):
    df[f'Opponent Starter {i}'] = df['Opponent'] + ' ' + df[f'Opponent Starter {i}']

df

ValueError: Columns must be same length as key

Set up adjusted Score Differential

In [None]:
blowout_threshold = 18  # adjusted score differential will only account up to this score difference

df['Score Differential'] = df['Team Score'] - df['Opponent Score']

df['Adjusted Score Differential'] = df['Score Differential'].abs()

df.loc[df['Adjusted Score Differential'] > blowout_threshold, 'Adjusted Score Differential'] = blowout_threshold

df['Adjusted Score Differential'] = df['Adjusted Score Differential']**(np.log(1.5)/np.log(blowout_threshold))

df

Unnamed: 0,Date,NCAA Tournament,Team,Location,Opponent,Result,Overtime,Team Score,Opponent Score,Team Starters,...,Team Starter 3,Team Starter 4,Team Starter 5,Opponent Starter 1,Opponent Starter 2,Opponent Starter 3,Opponent Starter 4,Opponent Starter 5,Score Differential,Adjusted Score Differential
0,2011-11-11,0,Air Force,-1,Lipscomb,1,0,68.0,59.0,K. Hilbig · A. Leipprandt · C. Medina · J. Sat...,...,Air Force C. Medina,Air Force J. Satterfield,Air Force C. Thompson,Lipscomb A. Bowers,Lipscomb M. Faulkner,Lipscomb H. Phillips,Lipscomb O. Smith,Lipscomb E. Thornsberry,9.0,1.361013
1,2011-11-13,0,Air Force,-1,Tennessee-Martin,-1,0,60.0,84.0,K. Hilbig · A. Leipprandt · C. Medina · J. Sat...,...,Air Force C. Medina,Air Force J. Satterfield,Air Force C. Thompson,Tennessee-Martin H. Butler,Tennessee-Martin S. Crawford,Tennessee-Martin J. Haislip,Tennessee-Martin T. Hall,Tennessee-Martin J. Newsome,-24.0,1.500000
2,2011-11-17,0,Air Force,1,Denver,-1,0,52.0,54.0,K. Hilbig · A. Leipprandt · C. Medina · J. Sat...,...,Air Force C. Medina,Air Force J. Satterfield,Air Force C. Thompson,Denver M. Michel,Denver K. Murdoch,Denver Q. Noonan,Denver M. Shell,Denver E. Smith,-2.0,1.102120
3,2011-11-19,0,Air Force,-1,Texas Southern,1,0,64.0,54.0,D. James · A. Leipprandt · C. Medina · M. O'Ne...,...,Air Force C. Medina,Air Force M. O'Neil,Air Force C. Thompson,Texas Southern C. Anyiam,Texas Southern G. Fleming,Texas Southern A. Hall,Texas Southern J. McQueen,Texas Southern K. West,10.0,1.381279
4,2011-11-21,0,Air Force,-1,Texas State,1,0,77.0,69.0,D. James · A. Leipprandt · M. O'Neil · D. Sorr...,...,Air Force M. O'Neil,Air Force D. Sorrera,Air Force C. Thompson,Texas State D. Clinch,Texas State A. Ezeh,Texas State D. Ford,Texas State I. Johnson,Texas State V. Kalu,8.0,1.338710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10219,2012-02-18,0,Youngstown State,1,Green Bay,-1,1,72.0,77.0,B. Brown · K. Fickiesen · T. Jones · K. Middle...,...,Youngstown State T. Jones,Youngstown State K. Middlebrooks,Youngstown State M. Nortey,Green Bay L. Bauer,Green Bay M. Lukan,Green Bay H. Quilling,Green Bay A. Ritchie,Green Bay J. Wojta,-5.0,1.253292
10220,2012-02-23,0,Youngstown State,-1,Wright State,-1,0,62.0,84.0,B. Brown · K. Fickiesen · T. Jones · K. Middle...,...,Youngstown State T. Jones,Youngstown State K. Middlebrooks,Youngstown State M. Nortey,Wright State C. Boyd,Wright State K. Demmings,Wright State M. Fox,Wright State K. Lamotte,Wright State S. Sandifer,-22.0,1.500000
10221,2012-02-25,0,Youngstown State,-1,Detroit Mercy,-1,0,64.0,67.0,B. Brown · K. Fickiesen · L. Hornberger · K. M...,...,Youngstown State L. Hornberger,Youngstown State K. Middlebrooks,Youngstown State H. Schlegel,Detroit Mercy L. Allen,Detroit Mercy S. Brown,Detroit Mercy M. Hatter,Detroit Mercy J. Jones,Detroit Mercy S. Shearer,-3.0,1.166625
10222,2012-03-03,0,Youngstown State,-1,Cleveland State,-1,0,69.0,79.0,B. Brown · K. Fickiesen · K. Middlebrooks · H....,...,Youngstown State K. Middlebrooks,Youngstown State H. Schlegel,Youngstown State M. Touvelle,Cleveland State J. Adams,Cleveland State C. Coleman,Cleveland State T. Keane,Cleveland State H. King,Cleveland State S. Winton,-10.0,1.381279


### Model Building

##### Starters Ratings

Unfortunately, there is no way to differentiate between 2 starters with the same name on the same team. In such circumstances, we double count the entity. 

In [None]:
X = (
    pd.get_dummies(df['Team Starter 1']).astype('int8')
).add(
    pd.get_dummies(df['Team Starter 2']).astype('int8'), fill_value=0
).add(
    pd.get_dummies(df['Team Starter 3']).astype('int8'), fill_value=0 
).add(
    pd.get_dummies(df['Team Starter 4']).astype('int8'), fill_value=0 
).add(
    pd.get_dummies(df['Team Starter 5']).astype('int8'), fill_value=0 
).add(
    -pd.get_dummies(df['Opponent Starter 1']).astype('int8'), fill_value=0 
).add(
    -pd.get_dummies(df['Opponent Starter 2']).astype('int8'), fill_value=0 
).add(
    -pd.get_dummies(df['Opponent Starter 3']).astype('int8'), fill_value=0 
).add(
    -pd.get_dummies(df['Opponent Starter 4']).astype('int8'), fill_value=0 
).add(
    -pd.get_dummies(df['Opponent Starter 5']).astype('int8'), fill_value=0 
)

X['Home Field Advantage'] = df['Location'].copy()

X

  X['Home Field Advantage'] = df['Location'].copy()


Unnamed: 0,Air Force A. Leipprandt,Air Force A. Wilson,Air Force C. Medina,Air Force C. Thompson,Air Force D. James,Air Force D. Sorrera,Air Force J. Blagowsky,Air Force J. Satterfield,Air Force K. Bohannon,Air Force K. Hilbig,...,Youngstown State B. Brown,Youngstown State H. Schlegel,Youngstown State K. Fickiesen,Youngstown State K. Middlebrooks,Youngstown State L. Hornberger,Youngstown State M. Nortey,Youngstown State M. Thompson,Youngstown State M. Touvelle,Youngstown State T. Jones,Home Field Advantage
0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
4,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1
10220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-1
10221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-1
10222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-1


In [None]:
from sklearn.model_selection import GroupKFold

def get_gkf_data(X, y, w, groups, cv=3):
    """
    Converts training data to list of folds
    """
    np.random.seed(22)
    gkf = GroupKFold(n_splits=cv)

    data = []
    for train_index, test_index in gkf.split(X, y, groups=groups):
        X_train = X[train_index]
        X_test = X[test_index]

        y_train = y[train_index]
        y_test = y[test_index]

        # sample weights
        w_train = w[train_index]

        data.append((X_train, X_test, y_train, y_test, w_train))

    return data

cv_data = get_gkf_data(X.to_numpy(), df['Result'].to_numpy(), df[['Adjusted Score Differential']].to_numpy(), df['Date'].to_numpy())

len(cv_data)

3

In [None]:
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import warnings
import optuna

from sklearn.preprocessing import MinMaxScaler
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial, cv_data=cv_data):
    # model tuning
    C = trial.suggest_float('C', 0.1, 10, log=True)
    mod = LogisticRegression(penalty='l2', C=C, fit_intercept=False)
    minimum = trial.suggest_float('minimum', 0.1, 1.0, step=0.1)
    maximum = trial.suggest_float('maximum', 1.0, 8.0, step=0.5)
    if minimum >= maximum:
        maximum = minimum + 0.0001

    # cross validation
    y_actuals = []
    y_preds = []
    for X_train, X_test, y_train, y_test, w_train in cv_data:
        y_actuals.append(y_test)

        weights = MinMaxScaler(feature_range=(minimum, maximum)).fit_transform(w_train).reshape(-1)

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')  # prevent convergence warnings
            mod.fit(X_train, y_train, sample_weight=weights)

        y_preds.append(mod.predict_proba(X_test)[:, 1])

    return log_loss(np.hstack(y_actuals), np.hstack(y_preds))

study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=22))
study.optimize(objective, n_trials=100, show_progress_bar=True)

study.best_params

  0%|          | 0/100 [00:00<?, ?it/s]

{'C': 0.1040692802336742, 'minimum': 0.2, 'maximum': 3.0}

In [None]:
optuna.importance.get_param_importances(study)

{'C': 0.8614898438672839,
 'minimum': 0.12911488044753716,
 'maximum': 0.009395275685178932}

In [None]:
minimum = study.best_params['minimum']
maximum = study.best_params['maximum']

if minimum >= maximum:
        maximum = minimum + 0.0001

weight = MinMaxScaler(
    feature_range=(minimum, maximum)
).fit_transform(df[['Adjusted Score Differential']]).reshape(-1)

mod = LogisticRegression(penalty='l2', C=study.best_params['C'], fit_intercept=False)

mod.fit(X, df['Result'], sample_weight=weight)

df_ratings = pd.DataFrame(
    {
        'Starter': X.columns,
        'Rating': mod.coef_[0]
    }
).sort_values(by=['Rating'], ascending=False, ignore_index=True)

df_ratings_display = df_ratings.loc[df_ratings['Starter'] != 'Home Field Advantage', :].reset_index(drop=True)
df_ratings_display.index += 1

df_ratings_display.head(25)

Unnamed: 0,Starter,Rating
1,Baylor B. Griner,1.231085
2,Stanford C. Ogwumike,1.149198
3,Baylor K. Hayden,1.082374
4,Maryland A. Thomas,1.051012
5,Stanford T. Kokenis,1.028163
6,Green Bay S. Eichler,1.019723
7,Baylor O. Sims,0.991273
8,Stanford N. Ogwumike,0.984382
9,Tennessee G. Johnson,0.981856
10,Notre Dame N. Novosel,0.955713


### Miscellaneous

##### Save Rankings

Get last available lineup for each team

In [None]:
starter_to_rating = dict(zip(df_ratings_display['Starter'], df_ratings_display['Rating']))

len(starter_to_rating)

3066

In [None]:
df_last_starters = df.sort_values(by=['Date']).groupby(['Team']).tail(1).reset_index(drop=True)[['Team'] + [f'Team Starter {i}' for i in range(1, 6)]]

df_last_starters

Unnamed: 0,Team,Team Starter 1,Team Starter 2,Team Starter 3,Team Starter 4,Team Starter 5
0,New Orleans,New Orleans K. Chaney,New Orleans S. Drake,New Orleans M. Sipos,New Orleans K. Thomas,New Orleans T. Wallace
1,Mercer,Mercer T. Bradshaw,Mercer P. Bridges,Mercer R. Buchanan,Mercer S. Smith,Mercer B. Williams
2,Rhode Island,Rhode Island E. Cloutier,Rhode Island C. Lehnertz,Rhode Island S. Session,Rhode Island S. Turpin,Rhode Island T. Whittaker
3,Southern Utah,Southern Utah T. Anderson,Southern Utah K. Hawkins,Southern Utah A. Jones,Southern Utah T. Maynes,Southern Utah C. Moreland
4,Jacksonville State,Jacksonville State D. Lane,Jacksonville State B. Manning,Jacksonville State A. McCarthy,Jacksonville State B. Morrow,Jacksonville State D. Vaughn
...,...,...,...,...,...,...
337,Cincinnati,Cincinnati C. Chisholm,Cincinnati K. Cook,Cincinnati D. Hollins,Cincinnati J. Randolph,Cincinnati B. Reaves
338,San Diego,San Diego I. Chilcot,San Diego D. Conners,San Diego A. Kame,San Diego F. Wijenberg,San Diego M. Woodrow
339,Syracuse,Syracuse K. Alexander,Syracuse P. Bullard,Syracuse R. Coffey,Syracuse E. Hall,Syracuse I. Hemingway
340,Central Arkansas,Central Arkansas C. Duever,Central Arkansas N. Guiden,Central Arkansas M. Herbert,Central Arkansas M. Rice,Central Arkansas D. Rogers


In [None]:
for i in range(1, 6):
    df_last_starters[f'Team Starter {i} Rating'] = df_last_starters[f'Team Starter {i}'].map(starter_to_rating)

df_last_starters['Rating'] = df_last_starters[[f'Team Starter {i} Rating' for i in range(1, 6)]].mean(axis=1)

df_last_starters

Unnamed: 0,Team,Team Starter 1,Team Starter 2,Team Starter 3,Team Starter 4,Team Starter 5,Team Starter 1 Rating,Team Starter 2 Rating,Team Starter 3 Rating,Team Starter 4 Rating,Team Starter 5 Rating,Rating
0,New Orleans,New Orleans K. Chaney,New Orleans S. Drake,New Orleans M. Sipos,New Orleans K. Thomas,New Orleans T. Wallace,-0.303165,-0.303165,-0.303165,-0.303165,-0.303165,-0.303165
1,Mercer,Mercer T. Bradshaw,Mercer P. Bridges,Mercer R. Buchanan,Mercer S. Smith,Mercer B. Williams,-0.433922,-0.621138,-0.162203,-0.164056,-0.518536,-0.379971
2,Rhode Island,Rhode Island E. Cloutier,Rhode Island C. Lehnertz,Rhode Island S. Session,Rhode Island S. Turpin,Rhode Island T. Whittaker,-0.640069,-0.639889,-0.640069,-0.005205,-0.292038,-0.443454
3,Southern Utah,Southern Utah T. Anderson,Southern Utah K. Hawkins,Southern Utah A. Jones,Southern Utah T. Maynes,Southern Utah C. Moreland,-0.477913,-0.469926,-0.384337,-0.159712,-0.458970,-0.390172
4,Jacksonville State,Jacksonville State D. Lane,Jacksonville State B. Manning,Jacksonville State A. McCarthy,Jacksonville State B. Morrow,Jacksonville State D. Vaughn,-0.689396,-0.741995,-0.187838,-0.591973,-0.741995,-0.590639
...,...,...,...,...,...,...,...,...,...,...,...,...
337,Cincinnati,Cincinnati C. Chisholm,Cincinnati K. Cook,Cincinnati D. Hollins,Cincinnati J. Randolph,Cincinnati B. Reaves,0.174442,0.174442,0.271684,0.019467,0.228470,0.173701
338,San Diego,San Diego I. Chilcot,San Diego D. Conners,San Diego A. Kame,San Diego F. Wijenberg,San Diego M. Woodrow,0.372871,0.397179,0.331101,0.510566,0.397179,0.401779
339,Syracuse,Syracuse K. Alexander,Syracuse P. Bullard,Syracuse R. Coffey,Syracuse E. Hall,Syracuse I. Hemingway,0.620130,0.565841,0.157740,0.394914,0.256423,0.399010
340,Central Arkansas,Central Arkansas C. Duever,Central Arkansas N. Guiden,Central Arkansas M. Herbert,Central Arkansas M. Rice,Central Arkansas D. Rogers,-0.283609,0.045050,0.045050,0.171242,0.373708,0.070288


In [None]:
df_sheet = df_last_starters[['Team', 'Rating']].copy().sort_values(by=['Rating'], ascending=False, ignore_index=True)

df_sheet.head(25)

Unnamed: 0,Team,Rating
0,Baylor,1.022474
1,Notre Dame,0.911295
2,Connecticut,0.898498
3,Stanford,0.872734
4,Delaware,0.787463
5,Tennessee,0.723221
6,Maryland,0.711899
7,Green Bay,0.656885
8,St. Bonaventure,0.629787
9,Duke,0.613086


In [None]:
df_sheet.to_csv(f'../data/preprocessed/womens_starters/womens_starters_{season}.csv', index=False)
df.to_csv(f'../data/unprocessed/womens_sports_reference_starters/womens_sports_reference_starters_{season}.csv', index=False)

'Done'

'Done'