# Scrape Injury Data from https://www.prosportstransactions.com/
### Code adapted from:
- https://github.com/gboogy/nba-injury-data-scraper
- https://github.com/elap733/NBA-Injuries-Analysis/blob/master/src/d01_scrapes/scrape_missedgames.py

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import datetime as dt
from tqdm import tqdm
import json
from thefuzz import fuzz, process
from tenacity import retry, stop_after_attempt, wait_fixed, Retrying

# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Folder for the saved injury files and folder for the exported CSV copy.
data_DIR = "../../data/injuries/"
export_DIR = "../../../repos/csv/"
# Pretending to be a browser
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# NBA.json: keys are cast to int IDs below, values are used as player names.
# The encoding is given explicitly so non-ASCII names are decoded the same way
# on every platform instead of with the locale default.
with open("../../data/NBA.json", encoding="utf-8") as f:
    data = json.load(f)
# name -> ID (exact lookups) and ID -> name (choices for fuzzy matching)
pID_dict = {v: int(k) for k, v in data.items()}
player_dict = {int(k): v for k, v in data.items()}

In [3]:
# Default first date (YYYY-MM-DD) requested from the site; it is replaced
# further down when a previously saved file can be read.
start_date = "2023-06-01"

In [4]:
# Resume from a previously saved file when one can be read; otherwise keep the
# default start_date and begin with an empty history.
# NOTE(review): the ".parquet1" suffix does not match the ".parquet" file this
# notebook writes, so this read presumably always fails and the full range is
# scraped again - confirm whether that is intentional.
try:
    df0 = pd.read_parquet(data_DIR + "NBA_prosptran_injuries_2023.parquet1")
    # Step back one day from the last saved row so that day is fetched again.
    start_date = (df0["Date"].iloc[-1] + dt.timedelta(days=-1)).strftime("%Y-%m-%d")
except Exception:
    # Missing, unreadable or empty file. `Exception` (rather than a bare
    # except) lets KeyboardInterrupt/SystemExit propagate.
    df0 = pd.DataFrame()
print(start_date)

2023-06-01


## Including the following conditions:
- Movement to/from injured/inactive list (IL)
- Missed games due to injury
- Missed games due to personal reasons
- Missed games due to suspensions

In [5]:
# URL to scrape from: search results starting at start_date with an open end
# date (EndDate is left empty). The IL, Injuries, Personal and Disciplinary
# check boxes are switched on; Player and Team are left blank.
url = f"https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={start_date}&EndDate=&ILChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&DisciplinaryChkBx=yes&Submit=Search"

In [6]:
#-------------Scrape web page--------------------------------------

# Get URL HTML. The browser-like headers used for the table downloads are sent
# here too, and a timeout is set because requests otherwise waits indefinitely
# on an unresponsive server.
response = requests.get(url, headers=header, timeout=30)
print(response) # Response [200] means it went through
# Stop on an HTTP error instead of harvesting links from an error page.
response.raise_for_status()

#Parse HTML with BeautifulSoup (used later to find the links to the other pages)
soup = BeautifulSoup(response.text, "html.parser")

#-------------Scrape data from the first web page----------------
#Read in html as pandas data frames and select the table of interest (the first table)
df_first_page = pd.read_html(url, storage_options=header)[0]

#Drop first row (column names)
df_first_page.drop([0], inplace=True)

#Remove bullet in front of player names (first two characters of the cell)
df_first_page[2] = df_first_page[2].str[2:] # "Acquired" column
df_first_page[3] = df_first_page[3].str[2:] # "Relinquished" column

#Modify column titles
df_first_page.columns = ['Date','Team','Acquired','Relinquished','Notes']

#data frame list to hold data for concating later
dfa = [df_first_page]

<Response [200]>


In [7]:
#------------Scrape data from other pages linked at the bottom of the first page------------
# Collect the links once; looking them up inside the loop would re-scan the
# whole parsed document on every iteration.
page_links = soup.find_all('a') #'a' tags are for links

# Loop over links (skipping the first 4 (not data) and last 4 ("Next" and other webpage links))
for one_a_tag in tqdm(page_links[4:-4]):
    link = one_a_tag['href']

    #Add in the rest of the url
    download_url = 'https://www.prosportstransactions.com/basketball/Search/' + link

    #Read html as pandas data frame and select table of interest (the first table)
    df = pd.read_html(download_url, storage_options=header)[0]

    #Drop first row (column names)
    df.drop([0], inplace=True)

    #Remove bullet in front of names (first two characters of the cell)
    df[2] = df[2].str[2:] # "Acquired" column
    df[3] = df[3].str[2:] # "Relinquished" column

    #Modify column titles
    df.columns = ['Date','Team','Acquired','Relinquished','Notes']
    #Add a pause to keep web server happy
    time.sleep(0.2)
    dfa.append(df)

100%|██████████| 58/58 [00:51<00:00,  1.13it/s]


In [8]:
def get_missing_pId(player,player_dict):
    """Fuzzy-match ``player`` against ``player_dict`` and return the best hit's key.

    Args:
        player: Name to look up.
        player_dict: Choices handed to ``process.extract``; callers in this
            file pass the ID -> name dictionary.

    Returns:
        Element 2 of the single best match, scored with ``fuzz.partial_ratio``.
        For dict choices thefuzz documents this element as the key of the
        matched entry, i.e. the player ID here.
    """
    matches = process.extract(
        player,
        player_dict,
        scorer=fuzz.partial_ratio,
        limit=1,
    )
    top_match = matches[0]
    return top_match[2]

In [29]:
# Stack all scraped pages into one frame. Each page keeps its own row labels
# after its header row was dropped, so the labels repeat from page to page;
# renumber them so later label-aligned assignments see unique labels.
df1 = pd.concat(dfa, ignore_index=True)
# Work on a copy so the raw concatenation stays available in df1.
df = df1.copy()

In [30]:
acq = df['Acquired']
rel = df['Relinquished']
# Cells containing "/" hold two names separated by "/ "; keep the second one.
df['Acquired'] = np.where(
    acq.str.contains('/'), acq.str.split('/ ').str[1], acq)
df['Relinquished'] = np.where(
    rel.str.contains('/'), rel.str.split('/ ').str[1], rel)

# Remove instances where value is like "(some text)" or "[some text]".
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text untouched.
bracketed = r"[\(\[].*?[\)\]]"
df['Acquired'] = df['Acquired'].str.replace(bracketed, "", regex=True)
df['Relinquished'] = df['Relinquished'].str.replace(bracketed, "", regex=True)

# A name under "Acquired" marks a player coming back in, a name under
# "Relinquished" a player going out.
df["In"] = df["Acquired"].notna()
df["Out"] = df["Relinquished"].notna()
# Merge the two name columns into one; a missing side contributes "".
df["Player"] = df["Acquired"].fillna("") + df["Relinquished"].fillna("")
df = df[["Date","Team","Player","In","Out","Notes"]]

In [19]:
# df = df[df["Player"].str.istitle()].reset_index(drop=True)

In [31]:
# Exact name -> ID lookup first; names not found in pID_dict come back as NaN.
df["playerID"] = df["Player"].map(pID_dict)
df1 = df.copy()

# Fill the gaps with the closest fuzzy match. A single .loc assignment replaces
# the chained form (which writes to a temporary under copy-on-write), and
# .to_numpy() assigns by position so repeated row labels cannot break alignment.
missing = df1["playerID"].isna()
if missing.any():
    df1.loc[missing, "playerID"] = (
        df1.loc[missing, "Player"]
        .apply(lambda name: get_missing_pId(name, player_dict))
        .to_numpy()
    )
df1["playerID"] = df1["playerID"].astype(int)
df1["Date"] = pd.to_datetime(df1["Date"], format="%Y-%m-%d")
# Move playerID to the third column.
df1.insert(2, "playerID", df1.pop("playerID"))

# Append to the previously saved rows and drop exact duplicates, keeping the
# newest copy of each row.
df2 = pd.concat([df0, df1]).reset_index(drop=True)
df3 = df2.drop_duplicates(keep='last').reset_index(drop=True)
# Drop rows whose note mentions "fine" (case-insensitive). na=False keeps rows
# with a missing note instead of failing on the negation.
df3 = df3[~df3["Notes"].str.contains("fine", case=False, na=False)]

In [32]:
# Spot check: show the rows for a single team.
df3.query("Team == 'Timberwolves'")

Unnamed: 0,Date,Team,playerID,Player,In,Out,Notes
48,2023-10-24,Timberwolves,1630183,Jaden McDaniels,False,True,placed on IL with calf injury
66,2023-10-25,Timberwolves,1641740,Jaylen Clark,False,True,right Achilles injury (DTD)
111,2023-10-30,Timberwolves,1630183,Jaden McDaniels,True,False,activated from IL
209,2023-11-07,Timberwolves,1629162,Jordan McLaughlin,False,True,placed on IL with sprained MCL in right knee
385,2023-11-21,Timberwolves,1630183,Jaden McDaniels,False,True,placed on IL with sprained right ankle
504,2023-11-30,Timberwolves,1630162,Anthony Edwards,False,True,placed on IL with right hip pointer
594,2023-12-06,Timberwolves,1630162,Anthony Edwards,True,False,activated from IL
625,2023-12-08,Timberwolves,1629162,Jordan McLaughlin,True,False,activated from IL
654,2023-12-11,Timberwolves,1630162,Anthony Edwards,False,True,placed on IL with right hip pointer
655,2023-12-11,Timberwolves,1630183,Jaden McDaniels,True,False,activated from IL


In [None]:
# Save the cleaned table: CSV and parquet in the data folder, plus a CSV copy
# in the export folder.
csv_name = 'NBA_prosptran_injuries_2023.csv'
df3.to_csv(data_DIR + csv_name, index=False)
df3.to_parquet(data_DIR + 'NBA_prosptran_injuries_2023.parquet')
df3.to_csv(export_DIR + csv_name, index=False)

In [None]:
# All rows for one player, renumbered from 0 so positions and labels coincide.
dfp = df3.query("Player == 'Tyler Herro'").reset_index(drop=True)

In [None]:
# Display the player's rows.
dfp

Did Herro miss the game on 1 November 2023?

In [None]:
# Date of the game in question.
game_date = pd.to_datetime(dt.date(2023, 11, 1))
# Flag every row dated on or before the game.
dfp["Comp"] = dfp["Date"].le(game_date)
idxi = dfp.index[dfp["Comp"].to_numpy()]
if len(idxi) == 0:
    # No row on or before the game date.
    missed_game = False
else:
    # The last flagged row decides: its "Out" value is the answer.
    idx = idxi[-1]
    missed_game = dfp.loc[idx, "Out"]
missed_game

# Older Seasons Data

In [None]:
# NOTE(review): undefined name - presumably a deliberate NameError that stops
# "Run All" before the older-seasons cells below; confirm before removing.
sdfdsfs

In [None]:
def get_missing_pId(player,player_dict):
    """Return the key of the ``player_dict`` entry that best matches ``player``.

    The single best candidate is picked by ``process.extract`` using
    ``fuzz.partial_ratio`` as the scorer; element 2 of that match is returned.
    For dict choices thefuzz documents this element as the matched entry's key
    (the player ID for the ID -> name dictionary used in this file).
    """
    candidates = process.extract(
        player, player_dict, scorer=fuzz.partial_ratio, limit=1
    )
    best = candidates[0]
    return best[2]

In [None]:
# @retry(stop=stop_after_attempt(5), wait=wait_fixed(0.6))
def update_injury_data(year, *, max_attempts=5, retry_wait=5):
    """Scrape one season of transactions and merge it into the saved files.

    The requested range runs from July 1 of ``year`` to June 30 of the next
    year. When a parquet file for the season can be read, scraping resumes one
    day before its last saved date and the overlap is removed as duplicates.
    The merged table is written to ``data_DIR`` as CSV and parquet.

    Args:
        year: Calendar year (int) in which the season starts.
        max_attempts: Number of tries for each follow-up page before giving up.
        retry_wait: Seconds to pause between failed tries.

    Returns:
        The merged DataFrame (previously saved rows followed by newly scraped
        ones, exact duplicates removed).

    Raises:
        requests.RequestException: the first results page could not be fetched
            or returned an error status.
        RuntimeError: a follow-up page still failed after ``max_attempts`` tries.
    """
    header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
    }
    start_date = f"{year}-07-01"
    end_date = f"{year+1}-06-30"
    try:
        df0 = pd.read_parquet(data_DIR + f'NBA_prosptran_injuries_{year}.parquet')
        # Step back one day from the last saved row so that day is fetched again.
        start_date = (df0["Date"].iloc[-1] + dt.timedelta(days=-1)).strftime("%Y-%m-%d")
    except Exception:
        # Missing, unreadable or empty file: start the season from scratch.
        df0 = pd.DataFrame()

    print(start_date)
    url = f"https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={start_date}&EndDate={end_date}&ILChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&Submit=Search"

    # Timeout: requests otherwise waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    dfa = [_read_transactions_page(url, header)]

    # Links are collected once; the first 4 and last 4 are not result pages.
    page_links = soup.find_all('a')
    for one_a_tag in tqdm(page_links[4:-4]):
        download_url = 'https://www.prosportstransactions.com/basketball/Search/' + one_a_tag['href']
        # Bounded retry with a real pause. The earlier tenacity loop was
        # iterated without entering its attempt context, so its wait policy
        # never applied and a failing page was retried endlessly.
        for attempt in range(1, max_attempts + 1):
            try:
                tic = time.perf_counter()
                page = _read_transactions_page(download_url, header)
                # A download slower than 10 s is treated as a failure and retried.
                if (time.perf_counter() - tic) > 10:
                    raise Exception("Website Timeout")
            except Exception as error:
                print(download_url)
                print(error)
                if attempt < max_attempts:
                    time.sleep(retry_wait)
            else:
                # Short pause between pages to go easy on the web server.
                time.sleep(0.2)
                dfa.append(page)
                break
        else:
            raise RuntimeError(
                f"Giving up on {download_url} after {max_attempts} attempts")

    df = pd.concat(dfa)
    acq = df['Acquired']
    rel = df['Relinquished']
    # Cells containing "/" hold two names separated by "/ "; keep the second one.
    df['Acquired'] = np.where(
        acq.str.contains('/'), acq.str.split('/ ').str[1], acq)
    df['Relinquished'] = np.where(
        rel.str.contains('/'), rel.str.split('/ ').str[1], rel)

    # Remove instances where value is like "(some text)" or "[some text]".
    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default.
    bracketed = r"[\(\[].*?[\)\]]"
    df['Acquired'] = df['Acquired'].str.replace(bracketed, "", regex=True)
    df['Relinquished'] = df['Relinquished'].str.replace(bracketed, "", regex=True)
    df["In"] = df["Acquired"].notna()
    df["Out"] = df["Relinquished"].notna()
    # Merge the two name columns into one; a missing side contributes "".
    df["Player"] = df["Acquired"].fillna("") + df["Relinquished"].fillna("")
    df = df[["Date","Team","Player","In","Out","Notes"]]
    # Keep only title-cased names (this also drops empty names).
    # NOTE(review): str.istitle() is False for names such as "LeBron James" or
    # "Jordan McLaughlin", so those rows are dropped too - confirm this is intended.
    df = df[df["Player"].str.istitle()].reset_index(drop=True)
    df.loc[df["Player"].str.contains("Enes"), "Player"] = "Enes Kanter"

    # Exact name -> ID lookup first, then fuzzy matching for the leftovers.
    df["playerID"] = df["Player"].map(pID_dict)
    df1 = df.copy()
    missing = df1["playerID"].isna()
    if missing.any():
        df1.loc[missing, "playerID"] = (
            df1.loc[missing, "Player"]
            .apply(lambda name: get_missing_pId(name, player_dict))
            .to_numpy()
        )
    df1["playerID"] = df1["playerID"].astype(int)
    df1["Date"] = pd.to_datetime(df1["Date"], format="%Y-%m-%d")
    # Move playerID to the third column.
    df1.insert(2, "playerID", df1.pop("playerID"))

    # Append to the saved rows and drop exact duplicates, keeping the newest copy.
    df2 = pd.concat([df0, df1]).reset_index(drop=True)
    df3 = df2.drop_duplicates(keep='last').reset_index(drop=True)
    df3.to_csv(data_DIR + f'NBA_prosptran_injuries_{year}.csv', index=False)
    df3.to_parquet(data_DIR + f'NBA_prosptran_injuries_{year}.parquet')

    return df3


def _read_transactions_page(page_url, header):
    """Download one results page and return its first table with named columns."""
    table = pd.read_html(page_url, storage_options=header)[0]
    table.drop([0], inplace=True)  # first row repeats the column names
    table[2] = table[2].str[2:]  # "Acquired" column: strip the leading bullet
    table[3] = table[3].str[2:]  # "Relinquished" column: strip the leading bullet
    table.columns = ['Date','Team','Acquired','Relinquished','Notes']
    return table

In [None]:
# Refresh the saved files for every season starting in 2000 through 2023;
# dfy holds the result of the most recent call.
for year in range(2000,2024):
    dfy = update_injury_data(year)