# Scrape Injury Data from https://www.prosportstransactions.com/
### Code adapted from:
- https://github.com/gboogy/nba-injury-data-scraper
- https://github.com/elap733/NBA-Injuries-Analysis/blob/master/src/d01_scrapes/scrape_missedgames.py

In [52]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import time
import datetime
from tqdm import tqdm
import json
from thefuzz import fuzz

# Turn off pandas' chained-assignment warning for this notebook
pd.set_option("mode.chained_assignment", None)

# Request headers that make the scraper look like a regular browser
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

In [53]:
# Load the player lookup. The "players" entry maps an id (stored as a string
# key) to a player name. The context manager closes the file once it has been
# read, and the explicit UTF-8 encoding keeps accented names intact regardless
# of the platform's default encoding.
with open("../data/NBA.json", encoding="utf-8") as f:
    data = json.load(f)["players"]

# name -> integer id (used to attach ids to scraped player names)
pID_dict = {v: int(k) for k, v in data.items()}
# integer id -> name (reverse lookup)
player_dict = {int(k): v for k, v in data.items()}
# pID = pID_dict.get(player, np.nan)

In [54]:
# Start Date
# Resume one day before the last saved record; rows scraped twice because of
# this overlap are removed again by the de-duplication step before saving.
# df0 defaults to None so that a first run (no saved history yet) still leaves
# the name defined for the later merge: pd.concat silently drops None entries.
df0 = None
try:
    df0 = pd.read_parquet('../fdata/NBA_prosptran_injuries_2023.parquet')
    start_date = (df0["Date"].iloc[-1] - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
except Exception:
    # Best-effort fallback (missing/unreadable/empty history): scrape from the
    # default start. `Exception` rather than a bare `except` so Ctrl-C and
    # interpreter shutdown are not swallowed.
    start_date = "2023-06-01"
print(start_date)

2023-12-15


In [55]:
# URL to scrape from: the basketball search-results page, queried from
# `start_date` onwards (EndDate, Player and Team are left empty) with the
# ILChkBx, InjuriesChkBx and PersonalChkBx options set to "yes".
# NOTE(review): presumably these select injured-list moves, injuries and
# personal-reason absences - confirm against the site's search form.
url = f"https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={start_date}&EndDate=&ILChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&Submit=Search"

In [56]:
#-------------Scrape web page--------------------------------------

# Get URL HTML. The browser-like headers are sent here as well (pd.read_html
# below already uses them), and the timeout stops the cell from hanging
# forever if the server never answers.
response = requests.get(url, headers=header, timeout=30)
print(response) # Response [200] means it went through

# Parse HTML with BeautifulSoup; the pagination cell reads the <a> tags from it
soup = BeautifulSoup(response.text, "html.parser")

#-------------Scrape data from the first web page----------------
# Read the page's HTML tables as pandas data frames and keep the table of
# interest (the first table)
df_first_page = pd.read_html(url, storage_options=header)[0]

# Drop first row (column names)
df_first_page.drop([0], inplace=True)

# Remove bullet in front of player names (the first two characters)
df_first_page[2] = df_first_page[2].str[2:] # "Acquired" column
df_first_page[3] = df_first_page[3].str[2:] # "Relinquished" column

# Modify column titles
df_first_page.columns = ['Date', 'Team', 'Acquired', 'Relinquished', 'Notes']

# Data frame list to hold data for concatenating later
dfa = []
dfa.append(df_first_page)

<Response [200]>


In [57]:
#------------Scrape data from other pages linked at the bottom of the first page------------
# Find all links on the first page once: the result does not change between
# iterations, so there is no need to search the parsed document again per page.
page_links = soup.findAll('a') #'a' tags are for links

# Loop over links (skipping the first 4 (not data) and last 4 ("Next" and other webpage links))
for one_a_tag in tqdm(page_links[4:-4]):
    link = one_a_tag['href']

    # Add in the rest of the url
    download_url = 'https://www.prosportstransactions.com/basketball/Search/' + link
    # print(download_url)

    # Read html as pandas data frame
    dfs = pd.read_html(download_url, storage_options=header)

    # Select table of interest (the first table)
    df = dfs[0]

    # Drop first row (column names)
    df.drop([0], inplace=True)

    # Remove bullet in front of names (the first two characters)
    df[2] = df[2].str[2:] # "Acquired" column
    df[3] = df[3].str[2:] # "Relinquished" column

    # Modify column titles
    df.columns = ['Date', 'Team', 'Acquired', 'Relinquished', 'Notes']
    # Add a pause to keep web server happy
    time.sleep(1)
    dfa.append(df)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.48s/it]


In [58]:
def get_missing_pId(player):
    """Return the id of the known player whose name best matches ``player``.

    Every name in the module-level ``pID_dict`` is scored against ``player``
    with ``fuzz.ratio``; the id paired with the highest score is returned.
    When several names tie, the one that comes first in ``pID_dict`` wins.
    """
    known_names = list(pID_dict)
    ids = np.array([pID_dict[name] for name in known_names])
    scores = np.array([fuzz.ratio(player, name) for name in known_names])
    return ids[scores.argmax()]

In [59]:
# Combine every scraped page into one frame
df1 = pd.concat(dfa)
df = df1.copy()
acq = df['Acquired']
rel = df['Relinquished']
# For entries written as "<text> / <name>", keep the part after the slash
df['Acquired'] = np.where(
    acq.str.contains('/', na=False), acq.str.split('/ ').str[1], acq)
df['Relinquished'] = np.where(
    rel.str.contains('/', na=False), rel.str.split('/ ').str[1], rel)

# Remove instances where value is like "(some text)" or "[some text]".
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the bracketed text in place.
# The strip removes the whitespace left behind next to the removed text.
bracketed_text = r"[\(\[].*?[\)\]]"
df['Acquired'] = df['Acquired'].str.replace(
    bracketed_text, "", regex=True).str.strip()
df['Relinquished'] = df['Relinquished'].str.replace(
    bracketed_text, "", regex=True).str.strip()

# Act: the row has an "Acquired" name; DAct: it has a "Relinquished" name
df["Act"] = ~df["Acquired"].isna()
df["DAct"] = ~df["Relinquished"].isna()
# Single player column: whichever of the two name columns is filled
df["Player"] = df["Acquired"].fillna("") + df["Relinquished"].fillna("")
df = df[["Date","Team","Player","Act","DAct","Notes"]]
# Keep only rows whose player value is title-cased (drops empty/non-name rows)
df = df[df["Player"].str.istitle()].reset_index(drop=True)

# Exact name lookup first, then a fuzzy match for names not in pID_dict
df["playerID"] = df["Player"].map(pID_dict)
df1 = df.copy()
missing_id = df1["playerID"].isna()
# .loc writes into df1 itself; chained indexing may write to a temporary copy
df1.loc[missing_id, "playerID"] = df1.loc[missing_id, "Player"].apply(get_missing_pId)
df1["playerID"] = df1["playerID"].astype(int)
df1["Date"] = pd.to_datetime(df1["Date"], format="%Y-%m-%d")
df1.insert(2,"playerID",df1.pop("playerID"))

# Append to the previously saved history (df0) and drop exact duplicate rows,
# keeping the most recently scraped copy
df2 = pd.concat([df0,df1]).reset_index(drop=True)
df3 = df2[~df2.duplicated(keep='last')].reset_index(drop=True)

In [60]:
# Persist the merged, de-duplicated table as CSV (without the index column)
# and as Parquet; the Parquet file is the one read back by the start-date cell.
df3.to_csv('../fdata/NBA_prosptran_injuries_2023.csv', index=False)
df3.to_parquet('../fdata/NBA_prosptran_injuries_2023.parquet')

In [61]:
# Rows for a single player, re-indexed from zero
dfp = df3[df3["Player"] == 'Tyler Herro'].reset_index(drop=True)

In [62]:
# Display the selected player's rows as the cell output
dfp

Unnamed: 0,Date,Team,playerID,Player,Act,DAct,Notes
0,2023-06-12,Heat,1629639,Tyler Herro,True,False,activated from IL
1,2023-11-09,Heat,1629639,Tyler Herro,False,True,sprained right ankle (DTD)


Did Herro miss the game on 1 November 2023?

In [63]:
# Date of the game being checked
game_date = pd.Timestamp(year=2023, month=11, day=1)
# Flag the player's records dated on or before the game
dfp["Comp"] = dfp["Date"].le(game_date)
idxi = dfp.index[dfp["Comp"].to_numpy()]
# Default when there is no record up to the game date
missed_game = False
if len(idxi) > 0:
    # The last flagged row decides: DAct is True for a "Relinquished" entry
    idx = idxi[-1]
    missed_game = dfp.at[idx, "DAct"]
missed_game

False