In [2]:
# import libraries for scraping and writing data to a CSV
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [3]:
# Root of the league-wide injury report pages; a season and a week code are
# appended to this prefix to address one weekly report.
base_url = "https://www.nfl.com/injuries/league/"

# Week codes to visit, in order: regular-season weeks REG1..REG17 followed by
# the four postseason rounds POST1..POST4.
# NOTE(review): the list stops at REG17, but the NFL regular season has 18
# weeks from 2021 onward -- confirm whether REG18 should be scraped for 2021.
weeks = [f"REG{n}" for n in range(1, 18)] + [f"POST{n}" for n in range(1, 5)]


# Write function to scrape data
def scrape_data(url, year):
    """Scrape every weekly injury table of one season into a CSV file.

    For each code in the module-level ``weeks`` list, the page at
    ``url + year + '/' + week`` is fetched. Every ``<table>`` on the page is
    walked row by row and each row is written to
    ``../data/injuries_scrape_{year}.csv`` with the week code as first field.
    Rows made of ``<th>`` cells are written as well, so the output contains a
    repeated table-header line for every table that was scraped.

    A week whose request fails, times out, or returns a non-200 status is
    reported on stdout and skipped; the remaining weeks are still scraped.

    Parameters
    ----------
    url : str
        Base URL, ending with a slash.
    year : str
        Season; must be a string because it is concatenated into the URL and
        interpolated into the output file name.

    Returns
    -------
    None
        The result is the CSV file written to disk.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30  # seconds

    # UTF-8 is set explicitly so the file does not depend on the platform
    # default encoding and can be read back by pandas' default (UTF-8) reader.
    with open(f'../data/injuries_scrape_{year}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the column names (hand-written, not taken from the page)
        header = ["Week", "Player", "Position", "Injury", "Game Status", "Game Type"]
        csvwriter.writerow(header)

        # Visit each week of the season in turn
        for week in weeks:
            current_url = url+year+'/'+week

            try:
                response = requests.get(current_url, timeout=request_timeout)
            except requests.RequestException as exc:
                # Same best-effort policy as a bad status code: report, move on.
                print(f"Error! Request for {week} failed: {exc}")
                continue

            if response.status_code != 200:
                # Name the week so a gap in the data can be traced afterwards.
                print(f"Error! Status code: {response.status_code} for {week}")
                continue

            # NOTE(review): 'html' lets BeautifulSoup choose among the HTML
            # parsers that happen to be installed; consider pinning one
            # (e.g. 'html.parser') so results do not vary by environment.
            soup = BeautifulSoup(response.text, 'html')

            # One CSV line per table row, header (<th>) rows included
            for table in soup.find_all('table'):
                for row in table.find_all('tr'):
                    cells = row.find_all(['th', 'td'])
                    csvwriter.writerow([week] + [cell.text.strip() for cell in cells])

            print(f"Scraping done for {week}")

In [4]:
# Run the scraper for the 2020 season (the season is passed as a string)
scrape_data(url=base_url, year='2020')

Scraping done for REG1
Scraping done for REG2
Scraping done for REG3
Scraping done for REG4
Scraping done for REG5
Scraping done for REG6
Scraping done for REG7
Scraping done for REG8
Scraping done for REG9
Scraping done for REG10
Scraping done for REG11
Scraping done for REG12
Scraping done for REG13
Scraping done for REG14
Scraping done for REG15
Scraping done for REG16
Scraping done for REG17
Scraping done for POST1
Scraping done for POST2
Scraping done for POST3
Scraping done for POST4


In [40]:
# Run the scraper for the 2021 season (the season is passed as a string)
scrape_data(url=base_url, year='2021')

Scraping done for REG1
Scraping done for REG2
Scraping done for REG3
Scraping done for REG4
Scraping done for REG5
Scraping done for REG6
Scraping done for REG7
Scraping done for REG8
Scraping done for REG9
Scraping done for REG10
Scraping done for REG11
Scraping done for REG12
Scraping done for REG13
Scraping done for REG14
Scraping done for REG15
Scraping done for REG16
Scraping done for REG17
Scraping done for POST1
Scraping done for POST2
Scraping done for POST3
Scraping done for POST4


---

In [5]:
# now that the data is scraped - let's clean and combine both data frames

# writing a function to clean both dfs

def clean_scraped_csvs(year):
    """Load one season's scraped injury CSV and tidy it.

    Reads ``../data/injuries_scrape_{year}.csv`` and then:

    * derives a ``game_type`` column from ``Week``: any code containing
      ``'REG'`` becomes ``'REG'``; ``POST1``..``POST4`` become ``'WC'``,
      ``'DIV'``, ``'CON'`` and ``'SB'``; anything else is left as NaN;
    * drops the ``Week`` column;
    * removes the repeated table-header rows (rows whose ``Player`` value is
      the literal ``'Player'``);
    * adds a ``season`` column holding ``year``.

    Parameters
    ----------
    year : str
        Season used both in the file name and as the ``season`` value.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Row labels are those of the raw file; they are not
        reset after the header rows are removed.
    """
    # Postseason week codes and the game-type label each one stands for
    postseason_labels = {'POST1': 'WC', 'POST2': 'DIV', 'POST3': 'CON', 'POST4': 'SB'}

    def _game_type(week):
        # Non-string values (e.g. a missing Week) get no label instead of
        # raising on the substring test.
        if not isinstance(week, str):
            return float('nan')
        if 'REG' in week:
            return 'REG'
        return postseason_labels.get(week, float('nan'))

    # read csv
    df = pd.read_csv(f"../data/injuries_scrape_{year}.csv")

    # Label every row in a single pass; the column is created even when the
    # file holds no data rows.
    df = df.assign(game_type=df['Week'].map(_game_type))

    # 'Week' is no longer needed once game_type exists
    df = df.drop(columns='Week')

    # The scrape wrote a header row for every table; drop them. The copy makes
    # the result an independent frame, so adding a column below is safe.
    df = df[df['Player'] != 'Player'].copy()

    # add a season column to ensure the data aligns w/ the proper year
    df['season'] = year

    return df

In [6]:
# Build one cleaned frame per scraped season (2020 first, then 2021)
df_2020, df_2021 = map(clean_scraped_csvs, ('2020', '2021'))

In [7]:
# Stack the two seasons and save them as a single CSV.
# pd.concat takes a list (any sequence) of frames, not the frames as
# separate positional arguments.
# NOTE(review): the row index is written to the file and its labels repeat
# across the two seasons -- confirm downstream readers expect that column.
pd.concat(
    objs=[df_2020, df_2021],
).to_csv(
    path_or_buf='../data/cleaned_scraped_data.csv',
)