In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [2]:
##Read in CSVs reformat to keep player ID consistent across data
qb_2018_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2018.csv")
qb_2018_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2018.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2018_extra_df = qb_2018_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2019_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2019.csv")
qb_2019_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2019.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2019_extra_df = qb_2019_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2020_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2020.csv")
qb_2020_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2020.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2020_extra_df = qb_2020_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2021_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2020.csv")
qb_2021_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2020.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2021_extra_df = qb_2021_extra_df.drop(columns = ["PlayerShortName"], axis = 1)

#Remove Duplicate Columns between the extra stats df and base df
diff_cols = qb_2018_extra_df.columns.difference(qb_2018_base_df.columns)
diff_cols = diff_cols.union(['PlayerID'])

qb_2018_extra_df = qb_2018_extra_df[diff_cols]
qb_2019_extra_df = qb_2019_extra_df[diff_cols]
qb_2020_extra_df = qb_2020_extra_df[diff_cols]
qb_2021_extra_df = qb_2021_extra_df[diff_cols]

In [3]:
#Creating df with target to merge with provided DF
qb_2019_target_df = qb_2019_base_df.loc[:,["PlayerID","FantasyPoints"]]
qb_2019_target_names_df = qb_2019_base_df.loc[:,["PlayerID","Name"]]
qb_2020_target_df = qb_2020_base_df.loc[:,["PlayerID","FantasyPoints"]]
qb_2020_target_names_df = qb_2019_base_df.loc[:,["PlayerID","Name"]]

In [4]:
#Create merged dataframe to prepare for model 
qb_2018_merged_df = pd.merge(qb_2018_base_df, qb_2018_extra_df, on="PlayerID")
qb_2018_merged_df = qb_2018_merged_df.drop(["FantasyPoints"], axis = 1)
qb_2018_targetmerged_df = pd.merge(qb_2018_merged_df, qb_2019_target_df, on="PlayerID").set_index("PlayerID")
qb_2019_merged_df = pd.merge(qb_2019_base_df, qb_2019_extra_df, on="PlayerID")
qb_2019_merged_df = qb_2019_merged_df.drop(["FantasyPoints"], axis = 1)
qb_2019_targetmerged_df = pd.merge(qb_2019_merged_df, qb_2020_target_df, on="PlayerID").set_index("PlayerID")
qb_2020_mergedtest_df = pd.merge(qb_2020_base_df, qb_2020_extra_df, on="PlayerID")

In [5]:
#Create target (next years Fantasy Point total)
y = qb_2018_targetmerged_df["FantasyPoints"]

In [6]:
#Remove non-numerical columns and unwanted features.
X = qb_2018_targetmerged_df.drop(columns = ["Rank","FantasyPoints","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played"], axis = 1)

#Scale data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)



In [7]:
#Perform linear regression on features from 2018 and fantasy football point totals from 2019 
LinReg_model  = LinearRegression(fit_intercept=True).fit(X_scaled,y)

#Print coefficients and associated features

for i, col in enumerate(X.columns):
    print('{} is associated with {}'.format(col,LinReg_model.coef_[i]))
    
#print intercept, in this case to remove negative values the intercept was removed.
LinReg_model.intercept_

PassingYards is associated with -66.92061545800192
PassingTouchdowns is associated with 53.332822863833414
RushingAttempts is associated with 38.4062098968159
RushingYards is associated with -57.957167587090716
RushingTouchdowns is associated with 38.58158889524103
DeepBallAttempts is associated with 46.48148062489677
DeepBallCompletionPercentage is associated with 42.00025979244419
Interceptions is associated with -29.431950274007818
PlayerAgeExact is associated with -23.980672311782886
PressuredCompletionPercentage is associated with -26.990023219181086


192.87870967741932

In [8]:
#predict 2020 values using 2019 features
X_2019 = qb_2019_merged_df.drop(columns = ["PlayerID","Rank","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played"], axis = 1)
scaler = StandardScaler().fit(X_2019)
X_2019_scaled = scaler.transform(X_2019)

QB_proj = LinReg_model.predict(X_2019_scaled)

#values were 3 orders of magnitude larger than realistic values
QB_proj = QB_proj

In [9]:
#create df for a visual representation of effectiveness
test_2019 = qb_2019_merged_df.loc[:,["Rank","PlayerID","Name","Team"]].rename(columns={'Rank': '2019 Rank','Team':'2019 Team'})
test_2019["QB2020_proj"] = QB_proj
test_2020 = qb_2020_mergedtest_df.loc[:,["Rank","PlayerID","Team","FantasyPoints"]].rename(columns={'Rank': '2020 Actual Rank','Team':'2020 Team'})

In [10]:
test_2019_2020_df = pd.merge(test_2019,test_2020,on = "PlayerID")
test_2019_2020_df.sort_values(["QB2020_proj"], ascending=False)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,QB2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
3,6,19801,Josh Allen,BUF,383.864986,1,BUF,395.06
0,1,19781,Lamar Jackson,BAL,354.540688,10,BAL,332.78
1,3,14536,Russell Wilson,SEA,331.331346,6,SEA,359.78
2,4,18857,Deshaun Watson,HOU,328.332873,5,HOU,369.32
4,7,18890,Patrick Mahomes,KC,270.069987,4,KC,374.4
5,8,20889,Kyler Murray,ARI,260.825555,3,ARI,378.74
15,19,20880,Gardner Minshew,JAX,251.206975,26,JAX,159.66
6,9,2593,Aaron Rodgers,GB,248.155384,2,GB,382.26
23,29,9038,Matthew Stafford,DET,246.282844,15,DET,260.56
20,25,18811,Mitchell Trubisky,CHI,220.917047,27,CHI,153.7


In [15]:
x_range = np.linspace(test_2019_2020_df["FantasyPoints"].min(), test_2019_2020_df["FantasyPoints"].max(), 100)
y_range = x_range
fig = px.scatter(x = test_2019_2020_df["FantasyPoints"], y = test_2019_2020_df["QB2020_proj"], color = test_2019_2020_df["Name"], opacity=0.65, labels = dict(x="2020 Actual", y= "2020 Projected",color = "Player Name"))
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Ideal Fit'))
fig.update_layout(legend_traceorder="reversed")
fig.show()