In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
##Read in CSVs reformat to keep player ID consistent across data
qb_2018_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2018.csv")
qb_2018_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2018.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2018_extra_df = qb_2018_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2019_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2019.csv")
qb_2019_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2019.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2019_extra_df = qb_2019_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2020_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2020.csv")
qb_2020_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2020.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2020_extra_df = qb_2020_extra_df.drop(columns = ["PlayerShortName"], axis = 1)
qb_2021_base_df = pd.read_csv("QB_stats_base/fantasy-football-leaders2020.csv")
qb_2021_extra_df = pd.read_csv("QB_stats_extra/advanced-qb-metrics2020.csv").rename(columns={'PlayerPlayerId': 'PlayerID'})
qb_2021_extra_df = qb_2021_extra_df.drop(columns = ["PlayerShortName"], axis = 1)

#Remove Duplicate Columns between the extra stats df and base df
diff_cols = qb_2018_extra_df.columns.difference(qb_2018_base_df.columns)
diff_cols = diff_cols.union(['PlayerID'])

qb_2018_extra_df = qb_2018_extra_df[diff_cols]
qb_2019_extra_df = qb_2019_extra_df[diff_cols]
qb_2020_extra_df = qb_2020_extra_df[diff_cols]
qb_2021_extra_df = qb_2021_extra_df[diff_cols]

In [3]:
#Creating df with target to merge with provided DF
qb_2019_target_df = qb_2019_base_df.loc[:,["PlayerID","FantasyPoints"]]
qb_2019_target_names_df = qb_2019_base_df.loc[:,["PlayerID","Name"]]
qb_2020_target_df = qb_2020_base_df.loc[:,["PlayerID","FantasyPoints"]]
qb_2020_target_names_df = qb_2019_base_df.loc[:,["PlayerID","Name"]]

In [4]:
#Create merged dataframe to prepare for model 
qb_2018_merged_df = pd.merge(qb_2018_base_df, qb_2018_extra_df, on="PlayerID")
qb_2018_merged_df = qb_2018_merged_df.drop(["FantasyPoints"], axis = 1)
qb_2018_targetmerged_df = pd.merge(qb_2018_merged_df, qb_2019_target_df, on="PlayerID").set_index("PlayerID")
qb_2019_merged_df = pd.merge(qb_2019_base_df, qb_2019_extra_df, on="PlayerID")
qb_2019_merged_df = qb_2019_merged_df.drop(["FantasyPoints"], axis = 1)
qb_2019_targetmerged_df = pd.merge(qb_2019_merged_df, qb_2020_target_df, on="PlayerID").set_index("PlayerID")
qb_2020_mergedtest_df = pd.merge(qb_2020_base_df, qb_2020_extra_df, on="PlayerID")

In [5]:
#Create target (next years Fantasy Point total)
y = qb_2018_targetmerged_df["FantasyPoints"]

In [6]:
#Remove non-numerical columns and unwanted features.
X = qb_2018_targetmerged_df.drop(columns = ["Rank","FantasyPoints","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PressuredCompletionPercentage","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","RushingYards","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played"], axis = 1)

#Scale data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)



In [7]:
#Perform linear regression on features from 2018 and fantasy football point totals from 2019 
LinReg_model  = LinearRegression(fit_intercept=True).fit(X_scaled,y)

#Print coefficients and associated features

for i, col in enumerate(X.columns):
    print('{} is associated with {}'.format(col,LinReg_model.coef_[i]))
    
#print intercept, in this case to remove negative values the intercept was removed.
LinReg_model.intercept_

PassingYards is associated with -54.39178449395101
PassingTouchdowns is associated with 52.396118581687354
RushingAttempts is associated with -19.66763588112037
RushingTouchdowns is associated with 36.00371743154233
DeepBallAttempts is associated with 40.48761486963539
DeepBallCompletionPercentage is associated with 34.59008551129594
Interceptions is associated with -35.54231795173512
PlayerAgeExact is associated with -31.334136286726274


192.87870967741935

In [8]:
#predict 2020 values using 2019 features
X_2019 = qb_2019_merged_df.drop(columns = ["PlayerID","Rank","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PressuredCompletionPercentage","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","RushingYards","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played"], axis = 1)
scaler = StandardScaler().fit(X_2019)
X_2019_scaled = scaler.transform(X_2019)

QB_proj = LinReg_model.predict(X_2019_scaled)

#values were 3 orders of magnitude larger than realistic values
QB_proj = QB_proj

In [9]:
#create df for a visual representation of effectiveness
test_2019 = qb_2019_merged_df.loc[:,["Rank","PlayerID","Name","Team"]].rename(columns={'Rank': '2019 Rank','Team':'2019 Team'})
test_2019["QB2020_proj"] = QB_proj
test_2020 = qb_2020_mergedtest_df.loc[:,["Rank","PlayerID","Team","FantasyPoints"]].rename(columns={'Rank': '2020 Actual Rank','Team':'2020 Team'})

In [10]:
test_2019_2020_df = pd.merge(test_2019,test_2020,on = "PlayerID")
test_2019_2020_df.sort_values(["QB2020_proj"], ascending=False)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,QB2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
0,1,19781,Lamar Jackson,BAL,380.857533,10,BAL,332.78
2,4,18857,Deshaun Watson,HOU,346.164497,5,HOU,369.32
4,7,18890,Patrick Mahomes,KC,320.297452,4,KC,374.4
1,3,14536,Russell Wilson,SEA,307.845143,6,SEA,359.78
3,6,19801,Josh Allen,BUF,284.787338,1,BUF,395.06
5,8,20889,Kyler Murray,ARI,244.709689,3,ARI,378.74
11,15,14252,Kirk Cousins,MIN,230.896332,11,MIN,306.2
6,9,2593,Aaron Rodgers,GB,229.782898,2,GB,382.26
15,19,20880,Gardner Minshew,JAX,224.675373,26,JAX,159.66
18,22,13799,Ryan Tannehill,TEN,222.718079,7,TEN,343.36


In [11]:
## trial 1 pre age 1702.8527398007755
## trial 2 pre age 1809.794153695599

sum(abs(test_2019_2020_df["FantasyPoints"] - test_2019_2020_df["QB2020_proj"]))

1980.4319973483828

In [12]:
## trial 1 pre age 237064.32478727997
## trial 2 pre age 238131.301966411

sum(abs(test_2019_2020_df["FantasyPoints"] - test_2019_2020_df["QB2020_proj"])**2)

264294.32314647606