In [82]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Open the target URL
def get_html_from_torvik_players(year, max_clicks=40):
    """Return the HTML of the barttorvik.com player-stat page for one season.

    The page is opened in a Chrome WebDriver session and the "Show 100 more"
    link (the <a> inside the element with id "expand") is clicked repeatedly
    so that additional rows are loaded before the page source is captured.

    Parameters:
        year (int): Season year. It is used for the ``year`` and ``end``
            query parameters; ``year - 1`` is used for ``start``.
        max_clicks (int): Upper bound on the number of "Show 100 more"
            clicks (defaults to 40, the limit that was previously hard-coded).

    Returns:
        str: ``driver.page_source`` after the expansion loop has finished.

    Raises:
        Whatever Selenium raises while starting the browser or loading the
        page. Errors raised while looking for / clicking the expand link are
        printed and simply end the expansion loop.
    """
    last_year = year - 1

    url = f"https://barttorvik.com/playerstat.php?link=y&sIndex=53&year={year}&minmin=5&start={last_year}1101&end={year}0501"

    driver = webdriver.Chrome()  # or webdriver.Firefox() if you prefer

    # try/finally guarantees the browser process is shut down even when
    # loading the page or reading its source fails.
    try:
        driver.get(url)

        for _ in range(max_clicks):
            try:
                # Wait up to 10 seconds for the "Show 100 more" element to be clickable
                expand_element = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "expand"))
                )
                # Locate the <a> tag within the element with id "expand" and click it
                expand_element.find_element(By.TAG_NAME, "a").click()

                # Give the new rows time to load before the next click.
                time.sleep(2)
            except Exception as e:
                # Best effort: if the button is no longer found or clickable,
                # stop expanding and return whatever has been loaded so far.
                print("No more 'Show 100 more' button found or an error occurred:", e)
                break

        return driver.page_source
    finally:
        driver.quit()


In [70]:
from bs4 import BeautifulSoup
import pandas as pd


In [75]:
def get_data_from_html(html_source):
    """Parse the Torvik player table out of a page's HTML into a DataFrame.

    Parameters:
        html_source (str): HTML text to parse.

    Returns:
        pandas.DataFrame: One row per non-empty table row, restricted to the
        cell positions listed in ``wanted_positions`` and labelled with
        ``column_names``. All values are the stripped cell text (strings).

    Raises:
        ValueError: If no <table> with the expected ``style`` attribute exists.
    """
    # Positions, within a row's list of cells, of the cells that are kept.
    wanted_positions = [0, 2, 3, 4, 6, 7, 10, 11, 13, 16,
                        18, 19, 20, 21, 22, 23, 24, 26, 27, 28]
    column_names = ["Rk", "Class", "Height", "Player", "Team",
            "Conf", "Min%", 'PRPG!', 'BPM', 'ORTG',
            "USG", "EFG", "TS", "OR", "DR", "AST", "TO",
            "BLK", "STL", "FTR"]

    soup = BeautifulSoup(html_source, "html.parser")

    # The table is identified by its inline style attribute.
    table = soup.find("table", {"style": "white-space:nowrap;margin:auto;table-layout:fixed"})
    if not table:
        raise ValueError("Table not found!")

    # Prefer the rows inside <tbody>; fall back to every <tr> in the table.
    body = table.find("tbody")
    if body:
        table_rows = body.find_all("tr")
    else:
        table_rows = table.find_all("tr")

    records = []
    for tr in table_rows:
        # Both <td> and <th> are read in case some rows use header cells for data.
        texts = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
        # Rows with fewer cells than expected simply yield a shorter record.
        kept = [texts[pos] for pos in wanted_positions if pos < len(texts)]
        if kept:
            records.append(kept)

    return pd.DataFrame(records, columns=column_names)


In [85]:
# Seasons scraped by the loop below: 2008 through 2023 inclusive.
years = list(range(2008, 2024))

In [88]:
# NOTE(review): df_2024 is not created in any cell visible in this notebook, so this
# cell fails on a fresh kernel. Presumably it came from
# get_data_from_html(get_html_from_torvik_players(2024)) in a removed cell - confirm.
df_2024["Season"] = 2024

In [89]:
# Collect one player DataFrame per season, starting with the 2024 frame.
all_dfs = [df_2024]

import tqdm

# Each iteration opens a browser session, scrapes one season and tags the
# resulting rows with that season before storing them.
for year in tqdm.tqdm(years):
    html = get_html_from_torvik_players(year)
    tmp_df = get_data_from_html(html)
    tmp_df["Season"] = year
    all_dfs.append(tmp_df)


100%|██████████| 16/16 [37:21<00:00, 140.08s/it]


In [91]:
# Stack all per-season frames row-wise. The original per-frame index labels are kept
# (no ignore_index), so index values can repeat across seasons.
final_df = pd.concat(all_dfs, axis=0)

In [98]:
# Keep only rows that have a Min% value.
# NOTE(review): presumably the dropped rows are short non-player rows from the scraped
# table - confirm. This rebinds final_df, so later cells depend on it having run.
final_df = final_df[final_df["Min%"].notnull()]

In [99]:
final_df.to_csv("torvik_player_data.csv")

In [128]:
tmp = final_df[(final_df.Team == "Houston")
         & (final_df.Season == 2023)].sort_values(by="PRPG!", ascending=False)

In [None]:
tmp["Min%"] = tmp["Min%"].astype(float)
tmp["PRPG!"] = tmp["PRPG!"].astype(float)
tmp["total_value"] = tmp["Min%"] * tmp["PRPG!"]
tmp["total_value_perc"] = tmp["total_value"] / tmp["total_value"].sum() 

In [106]:
final_df[(final_df.Team == "Connecticut")
         & (final_df.Season == 2009)].sort_values(by="PRPG!", ascending=False)

Unnamed: 0,Rk,Class,Height,Player,Team,Conf,Min%,PRPG!,BPM,ORTG,...,EFG,TS,OR,DR,AST,TO,BLK,STL,FTR,Season
11,12,Jr,7-3,Hasheem Thabeet,Connecticut,BE,75.1,4.0,11.2,118.7,...,64.4,64.8,13.4,21.7,3.0,17.5,11.9,1.2,78.3,2009
216,217,Sr,6-2,A.J. Price,Connecticut,BE,73.0,3.7,5.7,108.5,...,50.7,53.9,2.1,9.2,28.5,20.2,0.0,1.2,30.3,2009
219,220,Sr,6-7,Jeff Adrien,Connecticut,BE,82.0,3.6,5.7,110.0,...,51.1,53.4,10.5,19.4,9.3,14.5,3.0,0.9,50.8,2009
133,134,Jr,6-3,Jerome Dyson,Connecticut,BE,47.7,3.1,6.7,106.8,...,45.5,51.0,4.9,9.5,20.5,16.7,1.0,3.7,45.5,2009
244,245,Fr,6-1,Kemba Walker,Connecticut,BE,59.3,2.3,5.4,107.6,...,50.0,55.4,4.0,10.3,21.0,21.7,0.6,2.5,54.7,2009
336,337,Sr,6-3,Craig Austrie,Connecticut,BE,60.7,2.1,4.6,113.3,...,43.7,50.5,2.8,4.7,15.2,13.6,1.1,1.5,41.4,2009
181,182,Jr,6-9,Stanley Robinson,Connecticut,BE,44.9,2.0,6.1,104.7,...,51.9,54.2,9.2,14.9,8.8,20.5,4.1,1.5,30.6,2009
123,124,Jr,6-9,Gavin Edwards,Connecticut,BE,28.1,1.4,6.8,123.9,...,64.6,67.6,12.5,14.6,5.2,19.3,5.9,1.7,59.5,2009
85,86,So,6-4,Donnell Beverly,Connecticut,BE,5.5,0.7,7.7,147.5,...,64.3,68.3,5.7,10.9,25.7,11.0,0.0,2.9,42.9,2009


### Scrape ESPN

In [260]:
# Two YYYYMMDD dates per year from 2008 to 2024; each one is passed to get_game_ids()
# as the ESPN scoreboard `dates` value, and its first four characters become the
# box score's Season.
# NOTE(review): presumably these are the NCAA tournament first-round days - confirm,
# and check whether the 2020 dates return any games.
all_dates = [
    "20080321", "20080320", "20090320", "20090319", "20100319", "20100318",
    "20110318", "20110317", "20120322", "20120321", "20130321", "20130320",
    "20140321", "20140320", "20150320", "20150319", "20160318", "20160317",
    "20170317", "20170316", "20180316", "20180315", "20190322", "20190321",
    "20200320", "20200319", "20210319", "20210318", "20220318", "20220317",
    "20230317", "20230316", "20240322", "20240321"
]


In [None]:
import requests

def get_game_ids(date: str, timeout: float = 30):
    """Return the ESPN event ids listed on the men's college basketball scoreboard for a date.

    Parameters:
        date (str): Value for the scoreboard's ``dates`` query parameter
            (the notebook passes YYYYMMDD strings).
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        list: The ``id`` of every entry in the response's ``events`` list.
        An empty list is returned (after printing a message) when the request
        fails or the response status is not 200.
    """
    url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/scoreboard?dates={date}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # Without a timeout a stalled connection would block the scraping loop forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a non-200 answer: report and return no games.
        print(f"Failed to fetch data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    data = response.json()
    return [game.get("id") for game in data.get("events", [])]


['401638599', '401638601', '401638600', '401638607', '401638602', '401638612', '401638609', '401638604', '401638613', '401638610', '401638605', '401638603', '401638608', '401638614', '401638611', '401638606']


In [None]:
def get_box_score(game_id, timeout=30):
    """Download one game's ESPN summary and return its player box score as a DataFrame.

    The summary JSON is read at ``data['boxscore']['players']``: one entry per
    team, each with ``team.location`` and ``statistics[0]`` holding the stat
    ``names`` and the ``athletes`` list.

    Parameters:
        game_id: ESPN event id, inserted into the ``event`` query parameter.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        pandas.DataFrame: Columns ``team``, ``player_name`` followed by the stat
        names of the first team entry; one row per athlete that has a
        non-empty ``stats`` list. If the summary contains no player box score,
        an empty frame with only ``team`` and ``player_name`` is returned.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: On connection problems or timeout.
    """
    url = f'https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/summary?event={game_id}'

    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error payload.
    response.raise_for_status()

    # response.json() avoids depending on the `json` module, which this
    # notebook never imports.
    data = response.json()

    team_entries = data.get('boxscore', {}).get('players', [])
    if not team_entries:
        # No box score for this game: return an empty, concat-friendly frame.
        return pd.DataFrame(columns=['team', 'player_name'])

    columns = ['team', 'player_name'] + team_entries[0]['statistics'][0]["names"]

    all_data = []
    for entry in team_entries:
        team_name = entry['team']['location']
        for player in entry['statistics'][0]["athletes"]:
            stats = player['stats']
            # Athletes with an empty stats list are skipped.
            if len(stats) > 0:
                all_data.append([team_name, player["athlete"]["displayName"]] + stats)

    return pd.DataFrame(all_data, columns=columns)


In [263]:
# One box-score DataFrame per game, across every date in all_dates.
bs = []

for date in tqdm.tqdm(all_dates):
    game_ids = get_game_ids(date)
    for game in game_ids:
        b = get_box_score(game)
        # Season is the first four characters of the date, stored as a *string*
        # (e.g. "2024"); final_df stores Season as an int, so comparisons between
        # the two need a conversion.
        b["Season"] = date[0:4]
        bs.append(b)
    

100%|██████████| 34/34 [04:00<00:00,  7.07s/it]


In [265]:
ncaam_box_scores = pd.concat(bs, axis = 0)

In [267]:
ncaam_box_scores.to_csv("ncaam_box_scores.csv")

In [277]:
tmp_box = ncaam_box_scores[(ncaam_box_scores.Season == "2024") & 
                 (ncaam_box_scores.team == "Kansas")]

In [274]:
tmp = final_df[(final_df.Team == "Kansas")
         & (final_df.Season == 2024)].copy()

In [307]:
ncaam_box_scores[ncaam_box_scores["team"].str.lower().str.contains("st")].head()

Unnamed: 0,team,player_name,MIN,FG,3PT,FT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,Season
0,Morehead St.,Leon Buchanan,35,6-16,0-0,5-6,4,0,4,2,1,0,4,4,17,2009
1,Morehead St.,Maze Stallworth,38,5-11,4-8,0-0,1,2,3,1,0,0,2,1,14,2009
2,Morehead St.,Kenneth Faried,33,6-8,0-0,2-5,5,6,11,0,2,0,3,1,14,2009
3,Morehead St.,Brandon Shingles,33,1-5,0-0,3-3,0,1,1,5,1,0,7,2,5,2009
4,Morehead St.,Demonte Harper,37,1-9,0-3,0-0,1,2,3,3,0,0,2,1,2,2009


In [None]:
# Rewrite the whole word "State" as "St." in box-score team names, the abbreviation
# used in final_df.Team (e.g. "Iowa St."). This overwrites the column in place.
ncaam_box_scores["team"] = ncaam_box_scores["team"].str.replace(r'\bState\b', 'St.', regex=True)

In [303]:
final_df[final_df["Team"].str.lower().str.contains("st")].head()

Unnamed: 0,Rk,Class,Height,Player,Team,Conf,Min%,PRPG!,BPM,ORTG,...,EFG,TS,OR,DR,AST,TO,BLK,STL,FTR,Season
5,6,Sr,6-1,Jamal Shead,Houston,B12,76.9,4.3,11.6,113.1,...,45.8,49.5,3.3,10.5,40.4,15.4,2.0,4.3,24.0,2024
11,12,Sr,6-7,J'Wan Roberts,Houston,B12,64.1,3.5,10.6,121.4,...,59.6,58.8,11.0,18.4,14.2,13.6,4.5,2.9,35.2,2024
14,15,Sr,6-0,Shahada Wells,McNeese St.,Slnd,83.8,4.4,10.4,117.9,...,51.2,56.2,3.3,13.1,30.0,12.8,1.3,5.6,37.4,2024
16,17,Sr,6-9,Hason Ward,Iowa St.,B12,30.1,2.2,10.1,124.8,...,62.8,62.5,11.1,17.8,10.0,11.3,10.5,4.1,39.8,2024
18,19,Sr,6-9,Jaedon LeDee,San Diego St.,MWC,79.3,5.9,10.1,119.7,...,57.6,62.3,11.7,16.8,10.2,13.9,1.9,2.2,66.7,2024


### Match Torvik and Box Score Team Names

In [309]:
box_score_teams = ncaam_box_scores.team.unique().tolist()

In [310]:
torvik_teams = final_df.Team.unique().tolist()

In [312]:
def match_lists(list1, list2):
    """Pair each item of list1 with itself if it also occurs in list2, else with "".

    Returns a DataFrame with columns 'Box_Score_Team' (list1 as given) and
    'Torvik_Team' (the exact match from list2, or an empty string).
    """
    torvik_column = []
    for name in list1:
        if name in list2:
            torvik_column.append(name)
        else:
            torvik_column.append("")
    return pd.DataFrame({'Box_Score_Team': list1, 'Torvik_Team': torvik_column})


In [314]:
team_lookup = match_lists(box_score_teams, torvik_teams)

In [318]:
def update_mapping(mapping_df, col1_name, col2_name, col1_value, col2_value):
    """Set col2_name to col2_value on every row whose col1_name equals col1_value.

    The DataFrame is modified in place and the same object is returned.
    """
    rows_to_update = mapping_df[col1_name] == col1_value
    mapping_df.loc[rows_to_update, col2_name] = col2_value
    return mapping_df

In [394]:
team_lookup = update_mapping(team_lookup, "Box_Score_Team", "Torvik_Team","Grambling", "Grambling St.")

In [395]:
team_lookup

Unnamed: 0,Box_Score_Team,Torvik_Team
0,Morehead St.,Morehead St.
1,Louisville,Louisville
2,East Tennessee St.,East Tennessee St.
3,Pittsburgh,Pittsburgh
4,Robert Morris,Robert Morris
...,...,...
228,Long Beach St.,Long Beach St.
229,Samford,Samford
230,McNeese,McNeese St.
231,Duquesne,Duquesne


In [398]:
final_df["Torvik_Team"] = final_df["Team"].copy()

In [400]:
# Attach the box-score team name to every Torvik row via the lookup table.
# NOTE(review): if team_lookup holds several rows with the same Torvik_Team (e.g. both
# "McNeese" and another spelling mapped to "McNeese St."), the left merge repeats each
# player row once per Box_Score_Team - confirm that is intended.
# NOTE(review): re-running this cell merges again onto the already-merged frame.
final_df = final_df.merge(team_lookup, on="Torvik_Team", how="left")

### Find Missing Players

In [575]:
# Distinct (team, Season) pairs present in the box scores ...
team_seasons = ncaam_box_scores[["team", "Season"]].drop_duplicates()
# ... converted to a plain list of (team, season) tuples for the loop further down.
# Season is still the string taken from the date (e.g. "2022").
team_seasons = list(team_seasons.itertuples(index=False, name=None))


In [576]:
from itertools import product
import Levenshtein  

def find_players_with_high_distance(list1, list2, min_threshold=3):
    """Return the names in list1 whose closest name in list2 is far away.

    For every name in list1 the smallest Levenshtein distance to any name in
    list2 is computed; the name is reported when that distance is greater
    than or equal to min_threshold.

    NOTE(review): a later cell defines a function with the same name; once
    that cell has run, this version is no longer the one being called.

    Parameters:
        list1: Names to check.
        list2: Names to compare against.
        min_threshold (int): Smallest distance that counts as "no match".

    Returns:
        list: Names from list1, in their original order, without a close match.
    """
    # With nothing to compare against there is no closest match (and min()
    # would raise ValueError), so every name counts as unmatched.
    if len(list2) == 0:
        return list(list1)

    high_distance_players = []

    for str1 in list1:
        min_dist = min(Levenshtein.distance(str1, str2) for str2 in list2)
        if min_dist >= min_threshold:
            high_distance_players.append(str1)

    return high_distance_players



In [577]:
import Levenshtein
import pandas as pd

def get_first_initial_and_last(name: str) -> tuple[str, str]:
    """
    Reduce a full name to (first initial, last name), ignoring trailing suffixes.

    The name is split on whitespace; trailing tokens such as "Jr.", "Sr", "II"
    or "IV" (compared case-insensitively) are discarded first.

    Parameters:
        name (str): The player's full name.

    Returns:
        tuple: (upper-cased first character of the first token,
        last token passed through ``str.capitalize``), or ("", "") when no
        token is left after removing suffixes.
    """
    ignorable = {"jr", "jr.", "sr", "sr.", "i", "ii", "iii", "iv"}
    tokens = name.split()

    # Walk backwards past every trailing suffix token.
    end = len(tokens)
    while end > 0 and tokens[end - 1].lower() in ignorable:
        end -= 1
    tokens = tokens[:end]

    if not tokens:
        return ("", "")

    # capitalize() lower-cases everything after the first character, which
    # keeps the comparison between two sources case-insensitive.
    return (tokens[0][0].upper(), tokens[-1].capitalize())

def has_initial_last_match(name: str, names_list: list[str]) -> bool:
    """
    Tell whether any name in names_list shares first initial and last name with `name`.

    Both sides are normalised with get_first_initial_and_last before comparing.

    Parameters:
        name (str): The player's full name to look for.
        names_list (list[str]): Candidate names to compare against.

    Returns:
        bool: True as soon as one candidate has the same (initial, last name)
        pair; False if none does (including when names_list is empty).
    """
    wanted = get_first_initial_and_last(name)
    return any(
        get_first_initial_and_last(candidate) == wanted
        for candidate in names_list
    )

def find_players_with_high_distance(list1: list[str], list2: list[str], min_threshold: int = 3) -> list[str]:
    """
    Identify players from list1 that have no plausible counterpart in list2.

    For each name in list1 the smallest Levenshtein distance to any name in
    list2 is computed. A name is reported as missing only when that distance
    is greater than or equal to min_threshold AND no name in list2 shares its
    first initial and last name (see has_initial_last_match).

    Parameters:
        list1 (list[str]): List of player names to check.
        list2 (list[str]): List of player names to compare against.
        min_threshold (int): Smallest Levenshtein distance at which a name is
            no longer considered a spelling variant.

    Returns:
        list[str]: Names from list1, in their original order, that are
        considered missing. If list2 is empty, every name in list1 is returned.
    """
    # With nothing to compare against there is no closest match (and min()
    # would raise ValueError), so every name counts as missing.
    if len(list2) == 0:
        return list(list1)

    missing_players = []

    for name1 in list1:
        # Distance to the closest name in list2.
        min_dist = min(Levenshtein.distance(name1, name2) for name2 in list2)

        # Only fall back to the initial + last-name comparison when the
        # spelling is too different to be the same player.
        if min_dist >= min_threshold and not has_initial_last_match(name1, list2):
            missing_players.append(name1)

    return missing_players



In [578]:
# Parallel lists, one entry per (team, season) in team_seasons, in the same order.
injured_players = []
injured_players_value = []

for team, season in team_seasons:
    
    # Torvik rows for this team/season. Season is an int in final_df but a string in
    # team_seasons, hence the int() conversion.
    torvik_data = final_df[(final_df["Box_Score_Team"] == team)
                           & (final_df["Season"] == int(season))
                           ].copy()
    
    # The scraped values are text; convert before doing arithmetic.
    torvik_data["Min%"] = torvik_data["Min%"].astype(float)
    torvik_data["PRPG!"] = torvik_data["PRPG!"].astype(float)
    # Player value = Min% * PRPG!, then expressed as a share of the team total.
    # A negative PRPG! yields a negative share.
    torvik_data["total_value"] = torvik_data["Min%"] * torvik_data["PRPG!"]
    torvik_data["total_value_perc"] = torvik_data["total_value"] / torvik_data["total_value"].sum() 
    
    # Box-score rows for the same team and season (string comparison on Season).
    bs_data = ncaam_box_scores[(ncaam_box_scores["team"] == team)
                           & (ncaam_box_scores["Season"] == season)]
    
    torvik_players = torvik_data["Player"].unique().tolist()

    bs_players = bs_data["player_name"].unique().tolist()

    # Torvik players with no close name in the box scores are treated as missing.
    high_distance_players = find_players_with_high_distance(torvik_players, bs_players, min_threshold=6)

    team_missing_player_value = 0

    for player in high_distance_players:
        # .iloc[0]: if several rows share the player name, only the first is used.
        player_value = torvik_data[torvik_data.Player == player].total_value_perc.iloc[0]
        team_missing_player_value += player_value

    injured_players.append(high_distance_players)
    injured_players_value.append(team_missing_player_value)



In [579]:
team_seasons = pd.DataFrame(team_seasons, columns=["team", "Season"])

In [580]:
team_seasons["injured_players"] = injured_players

In [581]:
team_seasons["injured_players_value"] = injured_players_value

In [582]:
team_seasons["n_injured_players"] = team_seasons["injured_players"].apply(lambda x: len(x))

In [583]:
team_seasons.sort_values(by="injured_players_value", ascending=False)

Unnamed: 0,team,Season,injured_players,injured_players_value,n_injured_players
371,California,2016,"[Tyrone Wallace, Jabari Bird]",0.345017,2
308,Indiana,2015,[Kevin Yogi Ferrell],0.337055,1
508,Kansas St.,2018,[Dean Wade],0.331828,1
413,Miami,2016,"[Sheldon McClellan, Ebuka Izundu]",0.324392,2
478,Mount St. Mary's,2017,"[Elijah Long, Ryan Gomes]",0.316054,2
...,...,...,...,...,...
366,UNC Asheville,2016,[David Robertson],-0.034040,1
456,Northern Kentucky,2017,"[Blake Spellman, Jalen Tate, Brandon Maxwell]",-0.034342,3
657,Norfolk St.,2021,"[Efstratios Kalogerias, Yoro Sidibe]",-0.046601,2
286,Wofford,2014,"[Aerris Smith, Indiana Faithful]",-0.051129,2


In [584]:
team_seasons[(team_seasons.Season == "2022")].sort_values(by="injured_players_value", ascending=False).head(10)

Unnamed: 0,team,Season,injured_players,injured_players_value,n_injured_players
720,San Francisco,2022,"[Yauhen Massalski, Dzmitry Ryuny]",0.17173,2
699,Baylor,2022,"[LJ Cryer, Jonathan Tchamwa Tchatchoua]",0.17113,2
692,Michigan,2022,[DeVante' Jones],0.166891,1
689,Seton Hall,2022,"[Bryce Aiken, Jahari Long]",0.140642,2
681,Arizona,2022,"[Kim Aiken Jr., Kerr Kriisa]",0.118198,2
687,Houston,2022,"[Marcus Sasser, Tramon Mark]",0.088671,2
714,Creighton,2022,"[Ryan Nembhard, Shereef Mitchell]",0.086816,2
718,Akron,2022,"[K.J. Walton, Bryan Trimble Jr.]",0.066078,2
675,Texas,2022,[Tre Mitchell],0.06558,1
701,Tennessee,2022,[Olivier Nkamhoua],0.06109,1


In [585]:
team_seasons.sort_values(by="injured_players_value", ascending=False).iloc[0]["injured_players"]

['Tyrone Wallace', 'Jabari Bird']

In [586]:
ncaam_box_scores[(ncaam_box_scores.team == "North Carolina")
                 & (ncaam_box_scores.Season == "2022")]

Unnamed: 0,team,player_name,MIN,FG,3PT,FT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,Season
14,North Carolina,Armando Bacot,28,6-13,0-0,5-5,4,6,10,3,1,1,1,0,17,2022
15,North Carolina,Brady Manek,33,10-15,5-10,3-3,3,8,11,3,0,2,2,2,28,2022
16,North Carolina,Leaky Black,34,2-4,0-1,0-0,2,5,7,8,2,0,2,1,4,2022
17,North Carolina,RJ Davis,30,1-10,1-5,1-2,1,3,4,12,0,0,1,0,4,2022
18,North Carolina,Caleb Love,34,6-15,6-13,5-7,0,3,3,1,0,0,2,1,23,2022
19,North Carolina,Duwe Farris,1,0-0,0-0,0-0,0,0,0,0,0,0,0,0,0,2022
20,North Carolina,Justin McKoy,4,0-2,0-1,0-0,0,0,0,0,0,0,0,0,0,2022
21,North Carolina,Ryan McAdoo,1,0-1,0-1,0-0,0,1,1,0,0,0,0,0,0,2022
22,North Carolina,Creighton Lebo,1,0-1,0-1,0-0,0,0,0,0,1,0,0,0,0,2022
23,North Carolina,Puff Johnson,13,5-7,1-2,0-0,0,3,3,0,0,1,0,3,11,2022


In [587]:
final_df[(final_df.Team == "North Carolina")
         & (final_df.Season == 2022)]

Unnamed: 0,Rk,Class,Height,Player,Team,Conf,Min%,PRPG!,BPM,ORTG,...,OR,DR,AST,TO,BLK,STL,FTR,Season,Torvik_Team,Box_Score_Team
56886,32,Sr,6-9,Brady Manek,North Carolina,ACC,75.4,4.4,8.4,119.8,...,5.5,16.1,11.8,13.5,2.3,1.1,17.1,2022,North Carolina,North Carolina
56896,42,Jr,6-10,Armando Bacot,North Carolina,ACC,78.3,5.0,8.2,117.9,...,14.9,29.9,9.9,14.3,5.2,1.5,50.8,2022,North Carolina,North Carolina
57084,230,Sr,6-8,Leaky Black,North Carolina,ACC,71.7,2.3,5.3,120.5,...,4.9,10.7,14.8,18.8,2.3,1.7,26.0,2022,North Carolina,North Carolina
57226,372,So,6-0,RJ Davis,North Carolina,ACC,84.1,3.7,4.3,111.7,...,1.9,11.7,19.5,15.7,0.6,1.7,30.1,2022,North Carolina,North Carolina
57240,386,So,6-8,Puff Johnson,North Carolina,ACC,15.8,1.2,4.2,124.0,...,8.1,12.4,7.7,4.8,1.6,1.6,29.5,2022,North Carolina,North Carolina
57556,702,So,6-4,Caleb Love,North Carolina,ACC,84.6,3.3,2.6,101.5,...,0.7,10.0,19.5,17.9,0.7,1.6,28.7,2022,North Carolina,North Carolina
57761,907,So,6-11,Dawson Garcia,North Carolina,ACC,20.9,2.1,1.9,105.8,...,11.3,17.8,6.8,15.4,1.5,1.2,41.4,2022,North Carolina,North Carolina
58750,1896,So,6-5,Kerwin Walton,North Carolina,ACC,26.1,0.3,-0.8,98.2,...,1.9,8.0,5.4,17.5,1.0,1.0,6.9,2022,North Carolina,North Carolina
59188,2334,Fr,6-6,Dontrez Styles,North Carolina,ACC,11.1,0.2,-1.9,90.5,...,7.1,19.5,2.1,18.3,1.1,1.6,27.3,2022,North Carolina,North Carolina
59484,2630,So,6-4,Anthony Harris,North Carolina,ACC,10.4,0.2,-2.8,97.6,...,1.4,1.9,9.1,29.8,0.0,0.7,33.3,2022,North Carolina,North Carolina


### Match ESPN and Kaggle Team Names

In [549]:
kaggle_team_names = pd.read_csv("../data/MTeams.csv")

In [None]:
all_kaggle_teams = kaggle_team_names.TeamName.unique().tolist()

In [589]:
injury_data = team_seasons.copy()

In [600]:
# Drop the period from "St." (when not directly followed by a word character) so names
# look like the Kaggle spelling, e.g. "Morehead St." -> "Morehead St".
injury_data["team"] = injury_data["team"].str.replace(r'St\.(?!\w)', 'St', regex=True)


In [602]:
injury_data_teams = injury_data["team"].unique().tolist()

In [603]:
def match_lists(list1, list2):
    """Pair each item of list1 with itself if it also occurs in list2, else with "".

    Returns a DataFrame with columns 'Injury_Data_Team' (list1 as given) and
    'Kaggle_Team' (the exact match from list2, or an empty string).

    NOTE(review): this replaces the earlier match_lists defined in this notebook,
    which used different column labels.
    """
    kaggle_column = []
    for name in list1:
        if name in list2:
            kaggle_column.append(name)
        else:
            kaggle_column.append("")
    return pd.DataFrame({'Injury_Data_Team': list1, 'Kaggle_Team': kaggle_column})

In [604]:
team_lookup2 = match_lists(injury_data_teams, all_kaggle_teams)

In [None]:
missing = team_lookup2[team_lookup2.Kaggle_Team == ""].copy()


In [629]:
team_lookup2


Unnamed: 0,Injury_Data_Team,Kaggle_Team
0,Morehead St,Morehead St
1,Louisville,Louisville
2,East Tennessee St,
3,Pittsburgh,Pittsburgh
4,Robert Morris,Robert Morris
...,...,...
228,Long Beach St,Long Beach St
229,Samford,Samford
230,McNeese,
231,Duquesne,Duquesne


In [637]:
# Manual overrides for team names that match_lists() could not pair exactly:
# (name used in injury_data, name used in the Kaggle MTeams.csv TeamName column).
# NOTE(review): the pairs were generated, not derived from the data; verify each
# Kaggle_Team value against MTeams.csv. Two pairs that pointed at a different school
# were corrected: 'Western Kentucky' (was 'Kentucky') and 'Southern' (was 'Ga Southern').
chatgpt_matches = pd.DataFrame([
    ('East Tennessee St', 'ETSU'),
    ('Stephen F. Austin', 'SF Austin'),
    ('North Dakota St', 'N Dakota St'),
    ('Cal St Northridge', 'CS Northridge'),
    ('UConn', 'Connecticut'),
    ('American University', 'American Univ'),
    ('Western Kentucky', 'WKU'),
    ('Arkansas-Pine Bluff', 'Ark Pine Bluff'),
    ('Sam Houston', 'Sam Houston St'),
    ("Saint Mary's", "St Mary's CA"),
    ('UTSA', 'UT San Antonio'),
    ('Boston University', 'Boston Univ'),
    ('Long Island University', 'LIU Brooklyn'),
    ("Saint Peter's", 'St Peter\'s'),
    ('Northern Colorado', 'N Colorado'),
    ('Southern', 'Southern Univ'),
    ('North Carolina A&T', 'NC A&T'),
    ('South Dakota St', 'S Dakota St'),
    ('Saint Louis', 'St Louis'),
    ('Eastern Kentucky', 'E Kentucky'),
    ('George Washington', 'G Washington'),
    ('Coastal Carolina', 'Coastal Car'),
    ('North Carolina Central', 'NC Central'),
    ('Western Michigan', 'W Michigan'),
    ('UAlbany', 'SUNY Albany'),
    ("Saint Joseph's", 'St Joseph\'s PA'),
    ('NC St', 'NC State'),
    ('Milwaukee', 'WI Milwaukee'),
    ('Texas Southern', 'TX Southern'),
    ('Ole Miss', 'Mississippi'),
    ('Eastern Washington', 'E Washington'),
    ("Hawai'i", 'Hawaii'),
    ('Middle Tennessee', 'MTSU'),
    ('Cal St Bakersfield', 'CS Bakersfield'),
    ('Green Bay', 'WI Green Bay'),
    ('Little Rock', 'Ark Little Rock'),
    ('Miami', 'Miami FL'),
    ('Florida Gulf Coast', 'FGCU'),
    ('Northern Kentucky', 'N Kentucky'),
    ('Kent St', 'Kent'),
    ("Mount St Mary's", 'Mt St Mary\'s'),
    ('Cal St Fullerton', 'CS Fullerton'),
    ('Charleston', 'Col Charleston'),
    ('Pennsylvania', 'Penn'),
    ('Loyola Chicago', 'Loyola-Chicago'),
    ('Gardner-Webb', 'Gardner Webb'),
    ('Abilene Christian', 'Abilene Chr'),
    ('Fairleigh Dickinson', 'F Dickinson'),
    ('App St', 'Appalachian St'),
    ('Kennesaw St', 'Kennesaw'),
    ('Florida Atlantic', 'FL Atlantic'),
    ('Texas A&M-Corpus Christi', 'TAM C. Christi'),
    ('McNeese', 'McNeese St')
], columns = ["Injury_Data_Team", "Kaggle_Team"]
)

In [638]:
team_lookup2 = team_lookup2.merge(chatgpt_matches, how="left", on="Injury_Data_Team")

In [640]:
import numpy as np
# Final Kaggle name: the exact match (Kaggle_Team_x) when there was one, otherwise the
# manual override (Kaggle_Team_y). Teams with neither end up with a missing TeamName,
# because the preceding left merge leaves Kaggle_Team_y empty for them.
team_lookup2["TeamName"] = np.where(team_lookup2.Kaggle_Team_x == "", team_lookup2.Kaggle_Team_y, team_lookup2.Kaggle_Team_x)

In [645]:
injury_data["Injury_Data_Team"] = injury_data["team"].copy()

In [648]:
# Attach the Kaggle TeamName to each injury row via the normalised team name.
# NOTE(review): this rebinds injury_data; re-running the cell merges TeamName onto a
# frame that already has it.
injury_data = injury_data.merge(team_lookup2[["Injury_Data_Team", "TeamName"]], how="left", on="Injury_Data_Team")

In [650]:
injury_data_final = injury_data[["TeamName", "Season", "injured_players", "injured_players_value", "n_injured_players"]].copy()

In [655]:
injury_data_final.sort_values(by="injured_players_value", ascending=False).head(10)

Unnamed: 0,TeamName,Season,injured_players,injured_players_value,n_injured_players
371,California,2016,"[Tyrone Wallace, Jabari Bird]",0.345017,2
308,Indiana,2015,[Kevin Yogi Ferrell],0.337055,1
508,Kansas St,2018,[Dean Wade],0.331828,1
413,Miami FL,2016,"[Sheldon McClellan, Ebuka Izundu]",0.324392,2
478,Mt St Mary's,2017,"[Elijah Long, Ryan Gomes]",0.316054,2
523,Missouri,2018,"[Jordan Barnett, Cullen VanLeer, Blake Harris,...",0.296667,4
632,Georgia Tech,2021,[Moses Wright],0.284408,1
535,Miami FL,2018,"[Bruce Brown Jr., Dewan Huell]",0.280188,2
73,Purdue,2010,"[Robbie Hummel, Mark Wohlford]",0.276398,2
504,Murray St,2018,[Temetrius Morant],0.269536,1
