In [70]:
# IMPORT STATEMENTS
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from urllib.parse import urljoin
import requests as r
import pandas as pd
import datetime as dt

# Notebook-wide configuration for the scraping loops below.
EARLIEST_YEAR = 2003  # checked sources to see how far the data goes back
current_year = dt.datetime.today().year  # calendar year at the time the cell runs

In [21]:
def read_data(year, game_type):
    """
    Reads one season of NFL win/loss records from teamrankings.com by parsing
    the first table on the win-trends page.

    Param:
    year - year of data to be collected (appended to the URL's "range=yearly_" value)
    game_type - either playoff ('playoff') or regular season ('regular_season');
                inserted into the URL's "sc=is_" value

    Return: dataframe built from the table (first row used as column names)
    with an extra "Year" column set to `year`

    Raises:
    requests.HTTPError - if the site answers with an error status
    ValueError - if the page has no <table> element or the table has no rows
    """
    url = f"https://www.teamrankings.com/nfl/trends/win_trends/?sc=is_{game_type}&range=yearly_{year}"
    # A timeout keeps one stalled request from hanging the whole multi-year scrape.
    response = r.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')  # finding the table with the record of each team
    if table is None:
        raise ValueError(f"No table found at {url}")

    # 'td' for regular cells, 'th' for header cells; keep only the stripped text of each cell
    data = [
        [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    if not data:
        raise ValueError(f"Table at {url} contains no rows")

    # The first row holds the headers, the remaining rows hold one team each
    df = pd.DataFrame(data[1:], columns=data[0])
    df["Year"] = year  # helps identify the data points based on year
    return df

In [42]:
def capture_all_data(game_type):
    """
    Collects the win/loss table of every season from EARLIEST_YEAR up to (but
    not including) the current calendar year and stacks them into one dataframe.

    Param:
    game_type - passed straight to read_data ('playoff' or 'regular_season')

    Return: dataframe holding the rows of every season, with numeric "Wins",
    "Losses" and "Ties" columns parsed from the "Win-Loss Record" column

    Raises:
    ValueError - if the records do not split into exactly three '-' separated parts
    """
    this_year = dt.date.today().year
    yearly_frames = [read_data(year, game_type) for year in range(EARLIEST_YEAR, this_year)]

    # Every yearly frame is indexed from 0, so renumber the rows to keep labels unique.
    data = pd.concat(yearly_frames, ignore_index=True)

    # Split all records in one vectorised pass instead of building a Series per row.
    record_parts = data['Win-Loss Record'].str.split('-', expand=True)
    if record_parts.shape[1] != 3:
        raise ValueError(
            f"Expected 'Win-Loss Record' values with 3 parts, got {record_parts.shape[1]}"
        )
    data[["Wins", "Losses", "Ties"]] = record_parts.apply(pd.to_numeric)
    return data

In [61]:
# Scrape every season for both game types (playoffs first, then regular season).
playoff_win_loss = capture_all_data('playoff')
regular_seasonn_win_loss = capture_all_data('regular_season')

# Only the join keys and the playoff win/loss counts are needed from the playoff data.
playoff_win_loss = playoff_win_loss.loc[:, ["Team", "Year", "Wins", "Losses"]]

# Left join keeps every regular-season row; team-years without a playoff row
# get NaN in the playoff columns, which is then replaced by 0.
team_records = pd.merge(
    regular_seasonn_win_loss,
    playoff_win_loss,
    how='left',
    on=["Team", "Year"],
).fillna(0)

# "Wins_y"/"Losses_y" are the playoff columns (pandas' default suffix for the
# right-hand frame); at least one playoff game played marks a playoff season.
team_records["Playoffs?"] = (
    pd.to_numeric(team_records["Wins_y"]) + pd.to_numeric(team_records["Losses_y"])
).gt(0)


Save the team records to a single .csv file.

In [66]:
# NOTE(review): absolute, machine-specific output path - this cell only works
# where this directory exists; consider a path relative to the notebook.
TEAM_RECORDS_CSV = "C:/Users/rchap/Git/NFL_TEAM_DATA/NFL_Team_Records.csv"
team_records.to_csv(TEAM_RECORDS_CSV)

### Saving information about team statistics into a .csv file

In [71]:
# Stat categories to scrape, keyed by the category name that collect_team_stats()
# interpolates into the nfl.com URL. Each value starts as an empty list and is
# replaced by a DataFrame once collect_team_stats() has processed the dict.
# collect_team_stats() treats a dict with more than two keys as defense.
defense_data_dict = {'Passing': [], 'Rushing': [], 'Downs': [], 'Fumbles': [], 'Interceptions': []}
offense_data_dict = {'Passing': [], 'Rushing': []}
def collect_team_stats(key, side=None):
    """
    Scrapes nfl.com regular-season team stats for every category in `key` and
    every year from EARLIEST_YEAR up to (but not including) current_year.

    Param:
    key - dict whose keys are stat category names used in the URL; each value is
          replaced in place by one dataframe holding all years of that category
    side - 'offense' or 'defense'; when omitted it is inferred from the size of
           `key` (more than two categories means defense)

    Return: None (the result is stored back into `key`)

    Raises:
    ValueError - if `side` is not 'offense'/'defense' or a page has no table
    requests.HTTPError - if the site answers with an error status
    """
    if side is None:
        # Legacy heuristic: the defense dict is the one with more than two categories.
        side = 'defense' if len(key) > 2 else 'offense'
    elif side not in ('offense', 'defense'):
        raise ValueError(f"side must be 'offense' or 'defense', got {side!r}")

    for category in list(key.keys()):
        # Fresh list per category so that re-running the function does not depend
        # on whatever is currently stored in the dict.
        yearly_tables = []
        for year in range(EARLIEST_YEAR, current_year):
            url = f"https://www.nfl.com/stats/team-stats/{side}/{category}/{year}/reg/all"
            response = r.get(url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table')
            if table is None:
                raise ValueError(f"No stats table found at {url}")

            # 'td' for regular cells, 'th' for header cells; keep the stripped text
            data = [
                [cell.text.strip() for cell in row.find_all(['td', 'th'])]
                for row in table.find_all('tr')
            ]
            if not data:
                raise ValueError(f"Stats table at {url} contains no rows")

            data_table = pd.DataFrame(data[1:], columns=data[0])
            data_table["Year"] = year
            yearly_tables.append(data_table)

        df = pd.concat(yearly_tables)
        # Keep only the text before the first newline of the scraped team cell.
        df['Team'] = df['Team'].str.split('\n').str[0]
        if 'Lng' in df.columns:
            # Keep only the part of 'Lng' before the first 'T'.
            df['Lng'] = df['Lng'].astype(str).str.split('T').str[0]
        key[category] = df

In [74]:
# The category dicts start out holding empty lists; collect_team_stats() swaps
# each list for a DataFrame. Run the scrape here if that has not happened yet,
# so this cell also works after Restart Kernel -> Run All.
if not all(isinstance(offense_data_dict[name], pd.DataFrame) for name in ('Passing', 'Rushing')):
    collect_team_stats(offense_data_dict)

off_passing_data = offense_data_dict['Passing']
off_rushing_data = offense_data_dict['Rushing']


In [76]:
# Attach the rushing stats to the passing stats, matching rows on team and year.
offensive_stats = off_passing_data.merge(off_rushing_data, how='left', on=["Team", "Year"])

# Column names present in both frames come back tagged '_x' (passing, left frame)
# and '_y' (rushing, right frame); swap those tags for readable labels.
new_columns = {
    name: name.replace(old, new)
    for name in offensive_stats.columns
    for old, new in (('_x', '_Pass'), ('_y', '_Rush'))
    if name.endswith(old)
}
offensive_stats = offensive_stats.rename(columns=new_columns)
offensive_stats

Unnamed: 0,Team,AttPass,Cmp,Cmp %,Yds/Att,Pass Yds,TDPass,INT,Rate,1st,...,AttRush,Rush Yds,YPC,TDRush,20+Rush,40+Rush,LngRush,Rush 1st,Rush 1st%,Rush FUM
0,Giants,616,344,55.8,5.9,3642,16,20,68.4,184,...,387,1559,4,6,6,0,27,89,23,9
1,Rams,600,377,62.8,7.2,4287,23,23,81,211,...,411,1496,3.6,19,7,1,52,97,23.6,5
2,Buccaneers,592,369,62.3,6.7,3941,27,22,81.5,190,...,421,1648,3.9,5,3,2,61,86,20.4,6
3,Lions,588,319,54.2,5.1,2988,17,24,61.1,152,...,376,1338,3.6,5,8,0,39,69,18.4,7
4,Colts,569,381,67,7.5,4289,29,10,99,212,...,453,1695,3.7,16,5,1,43,104,23,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Broncos,513,337,65.7,7,3566,28,9,96.7,154,...,451,1810,4,8,9,0,38,116,25.7,5
668,Steelers,506,323,63.8,6.8,3421,13,9,84.6,153,...,487,2010,4.1,16,14,1,74,115,23.6,9
669,Ravens,494,328,66.4,7.9,3881,27,7,102.5,180,...,541,2661,4.9,26,21,4,60,144,26.6,7
670,Titans,494,304,61.5,7.1,3512,14,11,83.2,158,...,444,1846,4.2,16,10,2,69,103,23.2,6
