## Load Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

## Load DVOA Data

In [None]:
# Read all three worksheets in a single pass over the workbook.
# `sheet_name` is the keyword pandas documents for read_excel; the older
# `sheetname` spelling is rejected by current releases. Passing a list of
# sheet names returns a dict of DataFrames keyed by those names.
dvoa_sheets = pd.read_excel("dvoa_stats.xlsx", sheet_name=["coaches", "teamoff", "qb"])
coaches_df = dvoa_sheets["coaches"]
teamoff_df = dvoa_sheets["teamoff"]
qb_df = dvoa_sheets["qb"]

## Calculate Required Data for Machine Learning Model
Below are the required fields we'll use for this model:
* hc_years
* hc_years_with_team
* hc_career_dvoa
* hc_last_year_dvoa
* hc_career_diff_from_prev (improvement over final DVOA from previous coach on that team)
* oc_years
* oc_years_with_team
* oc_career_dvoa
* oc_last_year_dvoa
* oc_career_diff_from_prev (same as hc_career_diff_from_prev, but for the offensive coordinator)
* qb_years
* qb_years_with_team
* qb_career_dvoa
* qb_last_year_dvoa
* team_last_year_dvoa (the team's offensive DVOA from the previous season)

Each of these fields will be calculated for each team and season, with the label/target being that team's offensive dvoa for that year.

#### Calculate Stats for Players and Coaches

In [None]:
# Announce that the cell has started and record the wall-clock start time;
# `start_time` is read again at the end of this cell to report the elapsed time.
print("Block running...")
start_time = datetime.now()

def get_row_stats(row_dict, temp_df, stat_type):
    """Build the experience and past-performance features for one person-season.

    Args:
        row_dict: One row of ``temp_df`` as a dict. Must contain ``'team'``,
            ``'year'`` and the person's identifier (``'player'`` when
            ``stat_type == 'qb'``, ``'name'`` otherwise).
        temp_df: DataFrame holding every row for this role. Needs the
            identifier column plus ``'team'``, ``'year'`` and ``'dvoa'``;
            non-QB roles also need ``'diff_from_prev_coach'``.
        stat_type: Prefix used for the generated keys (``'hc'``, ``'oc'`` or
            ``'qb'`` in this notebook).

    Returns:
        A dict with ``'team'``, ``'year'`` and the prefixed keys
        ``<stat_type>_years``, ``<stat_type>_years_with_team``,
        ``<stat_type>_last_year_dvoa``, ``<stat_type>_career_dvoa`` and
        ``<stat_type>_career_diff_from_prev``. For ``stat_type == 'hc'`` the
        dict additionally carries ``'team_last_year_dvoa'``.
        ``<stat_type>_career_dvoa`` is ``None`` when the person has no earlier
        seasons in ``temp_df``; the other missing values default to 0.
    """
    year = row_dict['year']
    team = row_dict['team']
    id_col = 'player' if stat_type == 'qb' else 'name'

    # Boolean masks instead of string-built ``query`` expressions, so a name
    # or team containing a quote character cannot break the lookup.
    this_guy = temp_df[temp_df[id_col] == row_dict[id_col]]
    prev_years = this_guy[this_guy['year'] < year]

    num_years = year - this_guy['year'].min()
    num_years_with_team = 0
    career_diff_from_prev = 0
    career_dvoa = None
    if not prev_years.empty:
        career_dvoa = prev_years['dvoa'].mean()
        if stat_type != 'qb':
            career_diff_from_prev = prev_years['diff_from_prev_coach'].mean()
        # Tenure is only counted when the most recent earlier season is the
        # one immediately before `year`; otherwise it stays at 0.
        if not prev_years['year'].max() < (year - 1):
            most_recent_first = prev_years.sort_values("year", ascending=False)
            # Walk backwards in time and stop at the first season spent elsewhere.
            for prev_team in most_recent_first['team']:
                if prev_team != team:
                    break
                num_years_with_team += 1

    last_year = this_guy[this_guy['year'] == year - 1]
    last_year_dvoa = 0 if last_year.empty else last_year['dvoa'].max()

    stats = {
        'team': team,
        'year': year,
        '{}_years'.format(stat_type): num_years,
        '{}_years_with_team'.format(stat_type): num_years_with_team,
        '{}_last_year_dvoa'.format(stat_type): last_year_dvoa,
        '{}_career_dvoa'.format(stat_type): career_dvoa,
        '{}_career_diff_from_prev'.format(stat_type): career_diff_from_prev,
    }
    if stat_type == 'hc':
        # Only the head-coach record carries the team's own previous-season
        # value, so the lookup is skipped for the other roles.
        team_last_year = temp_df[(temp_df['year'] == year - 1) & (temp_df['team'] == team)]
        stats['team_last_year_dvoa'] = 0 if team_last_year.empty else team_last_year['dvoa'].max()
    return stats

def get_stats(df, stat_type):
    """Return one stats dict per row of ``df``.

    The rows are sorted by team and then year (ascending), and each row is
    passed to ``get_row_stats`` as a dict together with the whole sorted frame
    and ``stat_type``. The results are returned as a list in that sorted order.
    """
    ordered = df.sort_values(by=['team', 'year'], ascending=True, axis=0, inplace=False)
    return [
        get_row_stats(dict(record), ordered, stat_type)
        for _, record in ordered.iterrows()
    ]

# Join the team-level `off_dvoa` and `diff_from_prev_coach` columns onto every
# coach row for the same team and year, exposing `off_dvoa` as `dvoa`.
merged_df = coaches_df.merge(
    teamoff_df.loc[:, ['team', 'year', 'off_dvoa', 'diff_from_prev_coach']],
    on=['team', 'year'],
).rename(columns={'off_dvoa': 'dvoa'})

# One stats frame per coaching role.
hc_stats_df = pd.DataFrame(get_stats(merged_df[merged_df['type'] == 'head coach'], 'hc'))
oc_stats_df = pd.DataFrame(get_stats(merged_df[merged_df['type'] == 'offensive coordinator'], 'oc'))

# Quarterback stats: drop the '2TM' rows (players who had 2 teams for the same
# season) and the coach-only differential column.
qb_stats_df = pd.DataFrame(get_stats(qb_df, 'qb'))
qb_stats_df = qb_stats_df[qb_stats_df['team'] != '2TM'].drop(['qb_career_diff_from_prev'], axis=1)

print("Block complete in {} seconds.".format((datetime.now() - start_time).seconds))

#### Merge DataFrames, Extract Features and Labels

In [None]:
# Join head-coach, coordinator and quarterback features with the season's
# offensive DVOA (the label) on team and year.
combined_stats_df = (
    hc_stats_df
    .merge(oc_stats_df, on=['team', 'year'])
    .merge(qb_stats_df, on=['team', 'year'])
    .merge(teamoff_df.loc[:, ['team', 'year', 'off_dvoa']], on=['team', 'year'])
)
# Missing values (e.g. no career DVOA for a first-year person) are treated as 0.
combined_stats_df = combined_stats_df.fillna(0)
# group on year and team, average totals where there's more than one record
# (as in more than one QB for a team for a season). `.mean()` is the direct
# GroupBy method; passing `np.mean` to `agg` is deprecated in recent pandas.
combined_stats_df = combined_stats_df.groupby(by=['team', 'year'], as_index=False).mean()
# Leave the index out of the file: if it is read back with pd.read_csv, an
# index column would come back as an extra unnamed column and be picked up as
# a feature, since only off_dvoa/team/year are excluded downstream.
combined_stats_df.to_csv("combined_stats.csv", index=False)

#### Split Out 2017 Data (this will be for predictions only)

In [None]:
# Hold the 2017 rows aside for prediction and keep only earlier seasons in
# the modelling frame.
combined_stats_2017_df = combined_stats_df[combined_stats_df['year'] == 2017]
combined_stats_df = combined_stats_df[combined_stats_df['year'] < 2017]

## Extract Features and Labels from Data

In [None]:
# Optional: reload the saved stats instead of recomputing them.
#combined_stats_df = pd.read_csv("combined_stats.csv")

# Restrict to seasons after 1987, then split the columns into the feature
# matrix (everything except the label and the team/year keys) and the label.
since_1988_df = combined_stats_df.query("year > 1987")
feature_fields = since_1988_df.loc[
    :, combined_stats_df.columns.difference(['off_dvoa', 'team', 'year'])].columns
features = since_1988_df.loc[:, feature_fields].values
labels = since_1988_df['off_dvoa'].values

## Scale Data

In [None]:
from sklearn.preprocessing import scale

# Standardize each feature column (axis 0) by centering it and scaling it to
# unit variance; the keyword values below are the function's defaults.
scaled_features = scale(features, axis=0, with_mean=True, with_std=True)

## Split Training and Testing Datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. No random_state is passed, so the
# partition (and every score computed from it) differs between runs.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(scaled_features, labels, test_size=0.3)

## Fit Model, Make and Test Predictions

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

# Fit a random forest on the training split and predict the held-out samples.
reg = RandomForestRegressor(max_depth=20, min_samples_leaf=2, min_samples_split=8, min_weight_fraction_leaf=0)
reg.fit(features_train, labels_train)
predicted = reg.predict(features_test)

ev_score = explained_variance_score(labels_test, predicted)
print("Explained variance score: ", ev_score)

# Keep the metric under its own name. Assigning it to `r2_score` would replace
# the imported function with a float, so any later call to r2_score() in the
# session would fail until the import is run again.
r2 = r2_score(labels_test, predicted)
print("r2 score: ", r2)

fig, ax = plt.subplots()
# Red points plot the actual values against themselves, i.e. where a perfect
# prediction would land; blue points are predicted (x) vs. actual (y).
ax.scatter(labels_test, labels_test, color='red', alpha=0.1)
ax.scatter(predicted, labels_test, alpha=0.3)
ax.set_xlabel("Predicted DVOA")
ax.set_ylabel("Actual DVOA")
ax.set_title("Predictions of Offensive DVOA for NFL Teams")
plt.show()

## Feature Importance

In [None]:
# Print (importance, feature name) pairs from most to least important, with
# each importance rounded to 4 decimal places.
ranked_importances = sorted(
    ((round(weight, 4), field)
     for weight, field in zip(reg.feature_importances_, feature_fields)),
    reverse=True,
)
for pair in ranked_importances:
    print(pair)

## Make Predictions for 49ers for 2017 and 2018 Seasons

#### Make model predictions for 2017 season

In [None]:
from sklearn.preprocessing import scale


def make_preds(df, team, reg, year, scaled_features, labels, train_features=None, n_runs=100):
    """Predict offensive DVOA for one team by averaging over repeated fits.

    Args:
        df: DataFrame with a ``'team'`` column and the feature columns;
            ``'off_dvoa'``, ``'team'`` and ``'year'`` are excluded from the
            feature matrix.
        team: Team code whose rows in ``df`` are predicted.
        reg: Unused. Kept so existing calls keep working; a fresh
            ``RandomForestRegressor`` is fitted on every run.
        year: Value stored in the ``'year'`` column of the result.
        scaled_features: Standardized training feature matrix.
        labels: Training targets aligned with ``scaled_features``.
        train_features: Optional unscaled training matrix that
            ``scaled_features`` was derived from. When given, the prediction
            rows are standardized with its per-column mean and standard
            deviation, so they live on the same scale as the training data.
            When omitted, the rows are standardized on their own (the earlier
            behaviour), which maps a single row to all zeros.
        n_runs: Number of independent forests to fit (default 100).

    Returns:
        DataFrame with columns ``'team'``, ``'off_dvoa'`` and ``'year'``: one
        row per prediction row per run.

    Raises:
        ValueError: If ``df`` has no rows for ``team``.
    """
    subset_df = df.query("team == '{}'".format(team))
    if subset_df.empty:
        raise ValueError("no rows for team {!r} in df".format(team))
    pred_features = subset_df.loc[:, subset_df.columns.difference(['off_dvoa', 'team', 'year'])].values

    if train_features is None:
        # Legacy path: standardizes the prediction rows against themselves.
        scaled_pred_features = scale(pred_features)
    else:
        # Apply the same transform the training matrix received: column means
        # and population standard deviations, with constant columns left
        # unscaled (divisor 1) to avoid dividing by zero.
        train_features = np.asarray(train_features, dtype=float)
        center = train_features.mean(axis=0)
        spread = train_features.std(axis=0)
        spread[spread == 0] = 1.0
        scaled_pred_features = (pred_features.astype(float) - center) / spread

    # Fit n_runs forests and collect every prediction; callers average them.
    predicted_to_team = []
    for _ in range(n_runs):
        model = RandomForestRegressor(max_depth=20, min_samples_leaf=2, min_samples_split=8, min_weight_fraction_leaf=0)
        model.fit(scaled_features, labels)
        predicted_to_team.extend(
            {'team': team, 'off_dvoa': p, 'year': year}
            for p in model.predict(scaled_pred_features)
        )
    return pd.DataFrame(predicted_to_team)


# Pass the unscaled training matrix so the 2017 row is standardized with the
# training statistics rather than against itself.
predictions_2017_df = make_preds(combined_stats_2017_df, 'SF', reg, 2017, scaled_features, labels,
                                 train_features=features)

#### Make model predictions for 2018 season

In [None]:
# Pull the San Francisco 2016 row once and read the values needed from it.
sf_2016_df = combined_stats_df.query("team == 'SF' & year == 2016")
dvoa_2016 = sf_2016_df["off_dvoa"].max()
dvoa_oc_career = sf_2016_df["oc_career_dvoa"].max()
dvoa_oc_career_diff = sf_2016_df["oc_career_diff_from_prev"].max()

# The 2017 input is the mean of the model's 2017 predictions.
dvoa_2017_predicted = predictions_2017_df['off_dvoa'].mean()

# Hand-built 2018 feature row. The coordinator career values blend the 2016
# row's value (weight 9) with the 2017 figure (weight 1); quarterback features
# and the placeholder label are all zero.
data_for_2018 = [
    {
        'team': 'SF',
        'year': 2018,
        'hc_career_diff_from_prev': dvoa_2017_predicted - dvoa_2016,
        'hc_career_dvoa': dvoa_2017_predicted,
        'hc_last_year_dvoa': dvoa_2017_predicted,
        'hc_years': 1.0,
        'hc_years_with_team': 1.0,
        'team_last_year_dvoa': dvoa_2017_predicted,
        'oc_career_dvoa': ((dvoa_oc_career * 9) + dvoa_2017_predicted) / 10.0,
        'oc_career_diff_from_prev': ((dvoa_oc_career_diff * 9) + (dvoa_2017_predicted - dvoa_2016)) / 10.0,
        'oc_last_year_dvoa': dvoa_2017_predicted,
        'oc_years': 1.0,
        'oc_years_with_team': 1.0,
        'qb_career_dvoa': 0.0,
        'qb_last_year_dvoa': 0.0,
        'qb_years': 0.0,
        'qb_years_with_team': 0.0,
        'off_dvoa': 0.0,
    },
]
data_2018_df = pd.DataFrame(data_for_2018)
# NOTE(review): this passes a single row; when make_preds standardizes the
# rows it is given on their own, a single row becomes all zeros - confirm
# the 2018 prediction actually reflects the values above.
predictions_2018_df = make_preds(data_2018_df, 'SF', reg, 2018, scaled_features, labels)

#### Add data for 2016 season, 2017-2018 model predictions, 2017 actual (through week 6), and 2017-2018 Shanahan coefficient predictions

In [None]:
# San Francisco's 2016 offensive DVOA, the baseline for the coefficient rows.
sf_2016_off_dvoa = combined_stats_df.query("year == 2016 & team == 'SF'")['off_dvoa'].max()

# One record per (year, type) series value, in the order they are exported.
predicted_list = [
    {
        'year': 2017,
        'team': 'SF',
        'off_dvoa': predictions_2017_df['off_dvoa'].mean(),
        'type': 'model prediction',
    },
    {
        'year': 2018,
        'team': 'SF',
        'off_dvoa': predictions_2018_df['off_dvoa'].mean(),
        'type': 'model prediction',
    },
    {
        'year': 2016,
        'team': 'SF',
        'off_dvoa': sf_2016_off_dvoa,
        'type': 'actual',
    },
    {
        'year': 2017,
        'team': 'SF',
        'off_dvoa': -0.18,
        'type': 'actual (through week 6)',
    },
    # The two rows below offset the 2016 value by fixed amounts.
    {
        'year': 2017,
        'team': 'SF',
        'off_dvoa': sf_2016_off_dvoa + -0.0215,
        'type': 'Shanahan coefficient prediction',
    },
    {
        'year': 2018,
        'team': 'SF',
        'off_dvoa': sf_2016_off_dvoa + 0.0903,
        'type': 'Shanahan coefficient prediction',
    },
]
predicted_df = pd.DataFrame(predicted_list)

#### Output to CSV

In [None]:
# Write the assembled actual/predicted rows to disk, leaving out the DataFrame index.
predicted_df.to_csv("predicted_dvoa.csv", index=False)