## Import Required Dependencies

In [None]:
from collections import defaultdict
from pprint import pprint
import random
import re

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
ses = requests.Session()

## Define Function for Auditing Dataframes
This is a function I use to take a snapshot of the data in a dataframe, so I can (hopefully) quickly identify problems that need to be fixed. It prints the total record count, the null count for each column, the maximum and minimum value of each column, and then a random sample of up to 20 distinct values from each column.

In [None]:
# check for problem data
def audit_df(df):
    """Print a quick data-quality snapshot of a dataframe.

    Prints, in order: the total number of records, the null count per
    column, the max and min of each column, and a random sample of up to
    20 distinct values (rendered as strings) from each column.

    Args:
        df: the pandas DataFrame to audit.

    Returns:
        None. Everything is written to stdout.
    """
    # show total records for dataset
    print("Total records in dataset:")
    print(len(df))
    print()
    # show how many null values for each column
    print("null values by column:")
    pprint(df.isnull().sum())
    print()
    print("Max value by column:")
    pprint(df.max())
    print("Min value by column:")
    pprint(df.min())
    for column in df.columns.values:
        print("{}:".format(column))
        # De-duplicate the stringified values, then turn them into a list:
        # random.sample requires a sequence and rejects sets on Python 3.11+.
        distinct_values = sorted({str(value) for value in df[column]})
        # grab a random sample of (at most) 20 distinct values of each column
        pprint(random.sample(distinct_values, min(20, len(distinct_values))))
        print()

## Grab Offensive DVOA Data for Years 1986 to 2017
This data comes from the team offense (and quarterback) statistics tables at Football Outsiders. The script below requests the page for each season from 1986 to 2017 and parses out all the statistics.

In [None]:
def get_soup(ses, url):
    """Fetch a page and parse it into a BeautifulSoup tree.

    Args:
        ses: a requests session (or anything exposing a compatible ``get``).
        url: the address of the page to download.

    Returns:
        The parsed BeautifulSoup document, or None when the request fails
        or the response status is not OK.
    """
    try:
        resp = ses.get(url)
    except requests.RequestException:
        # Network-level failures stay best-effort: the caller skips this page.
        return None
    if not resp.ok:
        return None
    # Parsing sits outside the try so that a parser problem (for example a
    # missing lxml install) is raised instead of silently producing no data.
    return BeautifulSoup(resp.text, "lxml")

def get_stats_table(soup):
    """Walk down the page's nested containers and return the stats table.

    Each step is a ``find`` call on the result of the previous one, so a
    missing container raises AttributeError on the following step.
    """
    # (positional args, keyword args) for each successive find() call
    descent = (
        (('body',), {}),
        (('div',), {'id': 'content'}),
        (('div',), {'id': 'content_main'}),
        (('div',), {'id': 'columns'}),
        (('div',), {'id': 'center'}),
        (("div", {"class": "content-body"}), {}),
        (("table", {"class": "stats"}), {}),
    )
    node = soup
    for find_args, find_kwargs in descent:
        node = node.find(*find_args, **find_kwargs)
    return node

def format_cell_text(td):
    """Normalise a header cell's text into a lowercase column name.

    Newlines are removed, every '.' and '-' becomes '_', a single trailing
    '_' is dropped, and the result is lowercased.

    Args:
        td: an object whose ``text`` attribute holds the cell's string.

    Returns:
        The normalised string.
    """
    formatted_text = td.text.replace("\n", "")
    # Raw strings keep the regex escapes valid ('\.' in a plain literal is an
    # invalid escape sequence and warns on recent Python versions).
    formatted_text = re.sub(r'\.|-', '_', formatted_text)
    formatted_text = re.sub(r'_$', '', formatted_text).lower()
    return formatted_text

def format_records(headers, records):
    """Turn positional rows into dictionaries keyed by column name.

    The column names are ``headers`` followed by 'year' and 'stat_type';
    each row value is paired with the column name at the same position.

    Args:
        headers: list of column names for the scraped cells. Not modified.
        records: iterable of rows (sequences of values).

    Returns:
        A list with one dict per row.

    Raises:
        IndexError: if a row holds more values than there are column names.
    """
    # Build a new list rather than using `+=`, which would extend the
    # caller's list in place.
    all_headers = list(headers) + ['year', 'stat_type']
    return [
        {all_headers[idx]: val for idx, val in enumerate(row)}
        for row in records
    ]

def extract_off_headers(headers, formatted_text):
    """Map one team-offense header cell to the column name(s) it stands for.

    A rank cell is named after the most recently collected header, the
    'non_adjusted' cell expands to three columns, 'weightedoffense' is
    shortened, and anything else is used as-is.
    """
    if formatted_text in ["rk", "rank"]:
        # rank columns take their name from the header just before them
        return [headers[-1] + "_rank"]
    if formatted_text == 'non_adjusted':
        expanded = []
        for suffix in ['total', 'pass', 'rush']:
            expanded.append('na_' + suffix)
        return expanded
    if formatted_text == "weightedoffense":
        return ["wei_offense"]
    return [formatted_text]
    
def extract_skillpos_headers(headers, formatted_text):
    """Map one skill-position header cell to its column name (as a list).

    A rank cell is named after the most recently collected header, 'pass'
    is renamed to 'passes', and anything else is used as-is.
    """
    if formatted_text in ["rk", "rank"]:
        column_name = headers[-1] + "_rank"
    elif formatted_text == "pass":
        column_name = "passes"
    else:
        column_name = formatted_text
    return [column_name]

def extract_stats(soup, year, stat_type):
    """Parse the stats table of one page into a list of record dicts.

    The first table row supplies the column names; every later row supplies
    values. Cells containing '%' are converted to fractions, other cells to
    floats when possible, otherwise the raw text is kept.

    Args:
        soup: parsed page that get_stats_table can locate the table in.
        year: season, appended to every record.
        stat_type: 'teamoff' or 'qb'; selects the header-naming rules and
            is appended to every record. For any other value no headers
            are collected.

    Returns:
        List of dicts as produced by format_records.
    """
    stats_table = get_stats_table(soup)
    records = []
    headers = []
    for row_idx, tr in enumerate(stats_table.findAll('tr')):
        temp_row = []
        for td in tr.findAll('td'):
            if row_idx == 0:
                # header row: blank cells contribute no column name
                if td.text == "":
                    continue
                formatted_text = format_cell_text(td)
                if stat_type == "teamoff":
                    if not headers:
                        # the team-offense table gets a leading rank column
                        headers = ["off_dvoa_rank"]
                    headers += extract_off_headers(headers, formatted_text)
                elif stat_type == "qb":
                    headers += extract_skillpos_headers(headers, formatted_text)
            elif "%" in td.text:
                # convert percentage strings to floats
                try:
                    temp_row.append(float(td.text.strip("%")) / 100)
                except ValueError:
                    # a '%' cell that is not numeric is stored as blank
                    temp_row.append("")
            else:
                try:
                    temp_row.append(float(td.text))
                except ValueError:
                    # non-numeric cells are kept as text
                    temp_row.append(td.text)
        # drop empty rows, rows with a blank first cell, and rows whose
        # first cell is the literal "Player"
        if temp_row and temp_row[0] != "" and temp_row[0] != "Player":
            records.append(temp_row + [year, stat_type])
    return format_records(headers, records)

# Download every season of each stats table and gather the parsed records,
# keyed by table name, before turning each table into its own dataframe.
stats_uri = "http://www.footballoutsiders.com/stats/"
stats_tables = ['teamoff', 'qb']
stats = defaultdict(list)
for st in stats_tables:
    for year in range(1986, 2018):
        page_url = stats_uri + str(st) + str(year)
        soup = get_soup(ses, page_url)
        if soup:
            stats[st].extend(extract_stats(soup, year, st))
        print("Completed year {} for stat {}".format(year, st))

teamoff_df, qb_df = (pd.DataFrame(stats[table]) for table in ('teamoff', 'qb'))

## Grab Head Coach and Offensive Coordinators for Each Team, 1986-2017
This data is gleaned from the team pages at Pro Football Reference.

In [None]:
# Team codes used in the team-page URLs that are scraped for coaches.
pfr_teams = [
    'nwe', 'nyj', 'mia', 'buf',
    'oti', 'jax', 'htx', 'clt',
    'rav', 'pit', 'cin', 'cle',
    'kan', 'den', 'rai', 'sdg',
    'phi', 'was', 'dal', 'nyg',
    'tam', 'car', 'atl', 'nor',
    'det', 'min', 'chi', 'gnb',
    'ram', 'sea', 'crd', 'sfo',
]

def convert_team_name(team, year):
    """Translate a lowercase team code plus season into an uppercase team code.

    Teams with a fixed abbreviation come from a lookup table; teams whose
    code depends on the season are handled one by one; every other code is
    simply upper-cased.
    """
    fixed_codes = {
        'nwe': 'NE', 'jax': 'JAC', 'htx': 'HOU', 'clt': 'IND', 'rav': 'BAL',
        'kan': 'KC', 'tam': 'TB', 'nor': 'NO', 'gnb': 'GB', 'sfo': 'SF',
    }
    if team in fixed_codes:
        return fixed_codes[team]

    if team == 'sdg':
        return 'LACH' if year == 2017 else 'SD'
    if team == 'ram':
        return 'STL' if 1994 < year < 2016 else 'LARM'
    if team == "oti":
        return 'HOIL' if year < 1997 else 'TEN'
    if team == "cle":
        return 'CLE1' if year < 1996 else 'CLE'
    if team == "crd":
        if year < 1988:
            return "STLC"
        return "PHX" if year < 1994 else "ARI"
    if team == "rai":
        # NOTE(review): seasons before 1989 map to OAK here - confirm that
        # this matches the codes used by the stats source.
        return "LARD" if 1989 <= year <= 1994 else "OAK"
    return team.upper()

def extract_coaches(soup, team, year):
    """Collect the head coach and offensive coordinator from a team page.

    Looks through the paragraphs of the page's 'meta' section; for each
    paragraph containing "Coach:" or "Offensive Coordinator:" it records
    the text of the first link in that paragraph as the coach's name.

    Returns:
        A list of dicts with keys 'type', 'name', 'team' and 'year', where
        'team' is the code produced by convert_team_name.
    """
    team_name = convert_team_name(team, year)
    meta = (
        soup.find('body')
        .find('div', id='wrap')
        .find('div', id='info')
        .find('div', id='meta')
    )
    # label searched for in the paragraph text -> value stored under 'type'
    labelled_roles = (
        ("Coach:", 'head coach'),
        ("Offensive Coordinator:", 'offensive coordinator'),
    )
    coaches = []
    for p in meta.findAll('p'):
        for label, role in labelled_roles:
            if label in p.text:
                # only the first linked name is taken; if there were two
                # coaches, the first one listed is the last one to coach
                # that year
                coaches.append({'type': role, 'name': p.find('a').text, 'team': team_name, 'year': year})
    return coaches

# Visit every team/season page and accumulate the coaches found on each one.
coaches_uri = "https://www.pro-football-reference.com/teams/"
coaches_by_team = []
for team in pfr_teams:
    for year in range(1986, 2018):
        # team/year combinations that did not exist are skipped
        franchise_missing = (
            (team == 'htx' and year < 2002)
            or (team in ["jax", "car"] and year < 1995)
            or (team == "cle" and year in [1996, 1997, 1998])
            or (team == "rav" and year < 1996)
        )
        if franchise_missing:
            continue
        page_url = coaches_uri + str(team) + "/" + str(year) + ".htm"
        soup = get_soup(ses, page_url)
        if soup:
            coaches_by_team.extend(extract_coaches(soup, team, year))
        print("Completed year {} for team {}".format(year, team))
coaches_df = pd.DataFrame(coaches_by_team)

### Convert Changing Team Names for Consistency
Yes, I could have just fixed the names as the data came in, but I didn't, so...

In [None]:
# Collapse every season-specific team code onto a single code per franchise,
# in all three dataframes:
#   STL for all Rams teams, ARI for all Cardinals teams, TEN for all
#   Oilers/Titans teams, BAL for the first Browns team and the Ravens,
#   OAK for all Raiders teams, SD for all Chargers teams.
team_code_aliases = {
    'LARM': 'STL',
    'STLC': 'ARI',
    'PHX': 'ARI',
    'HOIL': 'TEN',
    'CLE1': 'BAL',
    'LARD': 'OAK',
    'LACH': 'SD',
}
for frame in (coaches_df, teamoff_df, qb_df):
    for old_code, new_code in team_code_aliases.items():
        frame.loc[frame['team'] == old_code, 'team'] = new_code

### For Teams with No Offensive Coordinator, Add the Head Coach as the OC

In [None]:
# Any team/season without an offensive coordinator row gets one added for
# each of that season's head coaches.
for year in range(1986, 2018):
    for team in set(coaches_df['team'].values):
        season_staff = coaches_df.query("year == {0} & team == '{1}'".format(year, team))
        if not season_staff.query("type == 'offensive coordinator'").empty:
            # an offensive coordinator is already listed
            continue
        season_hcs = season_staff.query("type == 'head coach'")
        for hc in set(season_hcs['name']):
            new_label = len(coaches_df) + 1
            coaches_df.loc[new_label] = {'name': hc, 'team': team, 'type': 'offensive coordinator', 'year': year}

## Calculate Advanced Stats

#### Add Previous Year's DVOA for Each Team Offense

In [None]:
def calc_prev_dvoa(df, stat_name):
    """Add a 'prev_dvoa' column holding each team's value from the prior year.

    The value is taken from the same team's row for the immediately
    preceding season. It is NaN when that row does not exist (the team's
    first season in the data, or a gap in its seasons), so one team's
    numbers never spill into another team's first row.

    Args:
        df: dataframe with 'team' and 'year' columns plus ``stat_name``.
        stat_name: name of the column to carry forward.

    Returns:
        A copy of ``df`` sorted by team and year with 'prev_dvoa' added.
    """
    temp_df = df.sort_values(by=['team', 'year'], ascending=True, axis=0, inplace=False)
    per_team = temp_df.groupby('team', sort=False)
    # Shift inside each team so the first row of a team has no predecessor.
    prior_value = per_team[stat_name].shift(1)
    prior_year = per_team['year'].shift(1)
    # Keep the shifted value only when it really is from the previous season.
    is_consecutive = prior_year == (temp_df['year'] - 1)
    temp_df['prev_dvoa'] = prior_value.where(is_consecutive)
    return temp_df
# attach the previous-year offensive DVOA to the team offense data
teamoff_df = calc_prev_dvoa(df=teamoff_df, stat_name='off_dvoa')

#### Calculate how many years away each coordinator is from Kyle Shanahan on each given team

In [None]:
def years_from_shanny(df, row):
    """Return how many years separate a coordinator from Kyle Shanahan's stint.

    For a row that is neither a head coach nor Kyle Shanahan himself, looks
    up the rows in ``df`` for Kyle Shanahan on the same team and returns the
    smaller of the distances from the row's year to his earliest and latest
    year there. Returns None for head coaches, for Shanahan's own rows, and
    when he has no rows for that team.
    """
    if row['name'] == "Kyle Shanahan" or row['type'] == 'head coach':
        return None
    shanny_subset = df.query("team == '{}' & name == 'Kyle Shanahan'".format(row['team']))
    if shanny_subset.empty:
        return None
    shanny_years = shanny_subset['year']
    gap_to_first = abs(row['year'] - shanny_years.min())
    gap_to_last = abs(row['year'] - shanny_years.max())
    return min(gap_to_first, gap_to_last)
        
# evaluate every coach row against the full coaches table
coaches_df['years_from_shanny'] = coaches_df.apply(
    lambda coach_row: years_from_shanny(coaches_df, coach_row),
    axis=1,
)

#### Identify team's previous coach

In [None]:
def calc_prev_name(df, stat_name, coach_type):
    """Add a 'prev_<stat_name>' column naming the team's preceding coach.

    Only rows whose 'type' equals ``coach_type`` are kept. They are sorted
    by team and year, and each row receives the ``stat_name`` value of the
    same team's preceding row. A team's first row gets NaN, so the value
    never comes from a different team.

    Args:
        df: dataframe with 'type', 'team', 'year' and ``stat_name`` columns.
        stat_name: column holding the coach/player name.
        coach_type: value of 'type' to keep (e.g. 'head coach').

    Returns:
        The filtered, sorted dataframe with the new column added.
    """
    # sort on team and year, so the previous record within a team is the
    # previous season for that team
    temp_df = df.query("type == '{}'".format(coach_type)).sort_values(
        by=['team', 'year'], ascending=True, axis=0, inplace=False)
    # shift within each team rather than across the whole frame
    temp_df['prev_' + stat_name] = temp_df.groupby('team', sort=False)[stat_name].shift(1)
    return temp_df

def update_df_with_names(df):
    """Compute the previous-coach column separately for each coach type.

    Runs calc_prev_name for head coaches and for offensive coordinators and
    returns the two results stacked, head coaches first.
    """
    tempdf1 = calc_prev_name(df, 'name', 'head coach')
    tempdf2 = calc_prev_name(df, 'name', 'offensive coordinator')
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    return pd.concat([tempdf1, tempdf2])
                        
# add each coach's predecessor to the coaches table
coaches_df = update_df_with_names(df=coaches_df)

#### Identify number of years a coach has been with his current team

In [None]:
def years_with_current_team(df, stat_name, coach_type):
    """Add a 'years_with_team' running count for one coach type.

    Rows are sorted by team and year and filtered to ``coach_type``. Walking
    through them in order, the count goes up by one whenever a row's
    ``stat_name`` equals its 'prev_<stat_name>' value and restarts at 1
    otherwise. The count is written to every row sharing that team and year.
    """
    ordered = df.sort_values(by=['team', 'year'], ascending=True, axis=0, inplace=False)
    temp_df = ordered.query("type == '{}'".format(coach_type))
    prev_column = "prev_" + stat_name
    tenure = 1
    for _, record in temp_df.iterrows():
        # same coach as the preceding row -> one more year; otherwise reset
        tenure = tenure + 1 if record[stat_name] == record[prev_column] else 1
        same_season = (temp_df['team'] == record['team']) & (temp_df['year'] == record['year'])
        temp_df.loc[same_season, 'years_with_team'] = tenure
    return temp_df

def update_df_years_with_current_team(df):
    """Compute the years-with-team count separately for each coach type.

    Runs years_with_current_team for head coaches and for offensive
    coordinators and returns the two results stacked, head coaches first.
    """
    tempdf1 = years_with_current_team(df, "name", "head coach")
    tempdf2 = years_with_current_team(df, "name", "offensive coordinator")
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    return pd.concat([tempdf1, tempdf2])

# add each coach's running tenure to the coaches table
coaches_df = update_df_years_with_current_team(df=coaches_df)

#### Calculate difference in DVOA from the previous coach

In [None]:
def calc_diff_prev_coach(df, coaches_df, stat_name, coach_type):
    """Add 'diff_from_prev_coach': the stat minus the value the coach inherited.

    ``df`` is sorted by team and year and joined to the ``coach_type`` rows
    of ``coaches_df`` on team and year. Walking the joined rows in order,
    whenever a row's 'name' differs from its 'prev_name' the baseline is
    reset to that row's 'prev_dvoa'; each team/year in the sorted copy of
    ``df`` is then assigned ``stat_name`` minus the current baseline.

    Returns:
        The sorted copy of ``df`` with the new column.
    """
    temp_df = df.sort_values(by=['team', 'year'], ascending=True, axis=0, inplace=False)
    coach_rows = coaches_df.query("type == '{}'".format(coach_type))
    joined = temp_df.merge(coach_rows, on=['team', 'year'])
    baseline = np.nan
    for _, record in joined.iterrows():
        if record['name'] != record['prev_name']:
            # a new coach: measure against what the team did the year before
            baseline = record['prev_dvoa']
        season_mask = (temp_df.team == record['team']) & (temp_df.year == record['year'])
        temp_df.loc[season_mask, 'diff_from_prev_coach'] = record[stat_name] - baseline
    return temp_df

# NOTE(review): both calls write the same 'diff_from_prev_coach' column, so
# the offensive-coordinator pass overwrites the head-coach values for every
# team/year it covers - confirm that this is intended.
teamoff_df = calc_diff_prev_coach(
    df=teamoff_df, coaches_df=coaches_df, stat_name='off_dvoa', coach_type="head coach")
teamoff_df = calc_diff_prev_coach(
    df=teamoff_df, coaches_df=coaches_df, stat_name='off_dvoa', coach_type="offensive coordinator")

## Write DataFrames to Excel
These will be used in Tableau, and also for the machine learning predictions.

In [None]:
# Write each dataframe to its own sheet of a single workbook. The context
# manager saves and closes the file on exit (ExcelWriter.save() no longer
# exists in pandas 2.x) and also closes it if a sheet fails to write.
with pd.ExcelWriter("dvoa_stats.xlsx") as writer:
    teamoff_df.to_excel(writer, sheet_name="teamoff", index=False)
    qb_df.to_excel(writer, sheet_name="qb", index=False)
    coaches_df.to_excel(writer, sheet_name="coaches", index=False)