In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import sqlite3

In [2]:
##Read in data from sqlite and reformat to keep player ID consistent across data
conn = sqlite3.connect("sqlite_db.sqlite")
c = conn.cursor()



def data_set(train_year, qb_td, ppr):
    
    next_year = train_year+1
    
    position_list = ['QB', 'RB', 'TE', 'WR']
    target_merged_df_list = []
    
    if ppr:
        ppr = 1
    else:
        ppr = 0 
    
    for position in position_list:
        
        #get train year data 
        base_df = pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Stats' WHERE Year = '{train_year}'"))
        base_df.columns = list(map(lambda x: x[0], c.description))
        extra_df = pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Extra_Stats' WHERE Year = '{train_year}'"))
        extra_df.columns = list(map(lambda x: x[0], c.description))
        extra_df = extra_df.rename(columns={'PlayerPlayerId': 'PlayerID'})
        extra_df = extra_df.drop(columns = ["PlayerShortName"], axis = 1)

        #Remove Duplicate Columns between the extra stats df and base df
        diff_cols = extra_df.columns.difference(base_df.columns)
        diff_cols = diff_cols.union(['PlayerID'])
        extra_df = extra_df[diff_cols]

        #create model data
        merged_df = pd.merge(base_df, extra_df, on="PlayerID")
        merged_df = merged_df.drop(columns = ["FantasyPoints", "index"], axis = 1)

        #get next year data to create target
        next_df = pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Stats' WHERE Year = '{next_year}'"))
        next_df.columns = list(map(lambda x: x[0], c.description))

        target_df = pd.DataFrame(next_df["PlayerID"])

        #use fantasy point calculation funtion in order to calculate fantasy points based on inputs
        if position == 'QB':
            target_df["FantasyPoints"] = qb_calc(next_df,qb_td)
            targetmerged_df = pd.merge(merged_df, target_df, on="PlayerID").set_index("PlayerID")
        elif position == "RB":
            target_df["FantasyPoints"] = wr_rb_te_calc(next_df,ppr)
            targetmerged_df = pd.merge(merged_df, target_df, on="PlayerID").set_index("PlayerID").fillna(0)
            targetmerged_df = targetmerged_df.loc[targetmerged_df["RushingAttempts"]>20]
        else:
            target_df["FantasyPoints"] = wr_rb_te_calc(next_df,ppr)
            targetmerged_df = pd.merge(merged_df, target_df, on="PlayerID").set_index("PlayerID").fillna(0)
            targetmerged_df = targetmerged_df.loc[targetmerged_df["ReceivingTargets"]>10]
        
        target_merged_df_list.append(targetmerged_df)
        
    return target_merged_df_list
    

def predict_year_data(predict_year, qb_td, ppr):
    predict_year = predict_year - 1
    
    position_list = ['QB', 'RB', 'TE', 'WR']
    merged_df_list = []
    results_df_list = []
    last_year_end_df_list = [] 
    
    if ppr:
        ppr = 1
    else:
        ppr = 0 
        
    for position in position_list:

        #query data by position
        base_df = pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Stats' WHERE Year = '{predict_year}'"))
        base_df.columns = list(map(lambda x: x[0], c.description))
        extra_df = pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Extra_Stats' WHERE Year = '{predict_year}'"))
        extra_df.columns = list(map(lambda x: x[0], c.description))
        extra_df = extra_df.rename(columns={'PlayerPlayerId': 'PlayerID'})
        extra_df = extra_df.drop(columns = ["PlayerShortName"], axis = 1)

        #Remove Duplicate Columns between the extra stats df and base df
        diff_cols = extra_df.columns.difference(base_df.columns)
        diff_cols = diff_cols.union(['PlayerID'])
        extra_df = extra_df[diff_cols]

        merged_df = pd.merge(base_df, extra_df, on="PlayerID")
        if position == "RB":
            merged_df = merged_df.drop(columns = ["FantasyPoints", "index"], axis = 1).fillna(0)
            merged_df = merged_df.loc[merged_df["RushingAttempts"]>20]
        elif position == "WR" or position == "TE":
            merged_df = merged_df.drop(columns = ["FantasyPoints", "index"], axis = 1).fillna(0)
            merged_df = merged_df.loc[merged_df["ReceivingTargets"]>10]

        merged_df_list.append(merged_df)
        
        #create dataframe for final display dataframe with last years Name, Rank, and team
        last_year_end_df = merged_df.loc[:,["Rank","PlayerID","Name","Team"]].rename(columns={'Rank': f'{predict_year} Rank','Team':f'{predict_year} Team'})
        last_year_end_df_list.append(last_year_end_df)
        
        
        #create dataframe for final display dataframe to show how the players finished the season
        if (predict_year+1) < 2022:
            prediction_year = predict_year + 1
            results_df =   pd.DataFrame(c.execute(f"SELECT * FROM '{position}_Stats' WHERE Year = '{prediction_year}'"))
            results_df.columns = list(map(lambda x: x[0], c.description))
            if position == "QB":
                results_df["FantasyPoints"] = qb_calc(results_df,qb_td)
                results_df = results_df.loc[:,["Rank","PlayerID","Team","FantasyPoints"]].rename(columns={'Rank': f'{prediction_year} Actual Rank',f'Team':f'{prediction_year} Team'})        
                results_df_list.append(results_df)
            else:
                results_df["FantasyPoints"] = wr_rb_te_calc(results_df,ppr)
                results_df = results_df.loc[:,["Rank","PlayerID","Team","FantasyPoints"]].rename(columns={'Rank': f'{prediction_year} Actual Rank',f'Team':f'{prediction_year} Team'})        
                results_df_list.append(results_df)
            
    return last_year_end_df_list,results_df_list, merged_df_list

def qb_calc(df, qb_td):
    fantasy_points = (df["PassingYards"])/25 + (df["PassingTouchdowns"])*qb_td + (df["RushingYards"])/10 + (df["RushingTouchdowns"])*6 -(df["PassingInterceptions"])*2
    return fantasy_points

def wr_rb_te_calc(df, ppr):
    fantasy_points = (df["RushingYards"])/10 + (df["RushingTouchdowns"])*6 + (df["Receptions"])*ppr + (df["ReceivingYards"])/10 -(df["ReceivingTouchdowns"])*6
    return fantasy_points

train_year = int(input("what year would you like to train the data on? "))
predict_year = int(input("what year would you like to predict? "))
qb_td = int(input("how many points per passing td? "))
ppr = input("(True or False) is your league PPR ")



model_data = data_set(train_year, qb_td,ppr)
last_year_end_df, results_df, predict_data = predict_year_data(predict_year, qb_td,ppr)

what year would you like to train the data on? 2018
what year would you like to predict? 2020
how many points per passing td? 4
(True or False) is your league PPR True


In [3]:
#Create target per position (next years Fantasy Point total)
y_qb = model_data[0]["FantasyPoints"]
y_rb = model_data[1]["FantasyPoints"]
y_te = model_data[2]["FantasyPoints"]
y_wr = model_data[3]["FantasyPoints"]

In [4]:
#Remove non-numerical columns and unwanted features.
X_qb = model_data[0].drop(columns = ["Rank","FantasyPoints","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played","Year"], axis = 1)
X_rb = model_data[1].drop(columns = ["Rank","FantasyPoints","Name","Team","Position","BaseFrontCarryRate","LightFrontCarryRate","ShotgunCarryRate","StackedFrontCarryRate","Targets","ReceivingTargets","Carries","ReceivingTDs","Fumbles","UnderCenterCarryRate","UnderCenterYardsPerCarry","YardsPerCarry","FantasyPointsPerGame","AverageDefendersInTheBox","UnderCenterYardsPerCarry","ShotgunYardsPerCarry","Year"], axis = 1)
X_te = model_data[2].drop(columns = ["Rank","FantasyPoints","Name","Team","Position","Targets","Targets.1","ReceivingTDs","RushingAttempts","AirYards","ReceivingLong","ReceivingYardsPerTarget","RushingYards","RushingYardsPerAttempt","RushingTouchdowns","AirYardsPerReception","HogRate","TargetAccuracy","RedZoneTargets","ReceivingYardsPerReception","Fumbles","FumblesLost","FantasyPointsPerGame","ReceivingTargets","AirYardsPerGame","Year"], axis = 1)
X_wr = model_data[3].drop(columns = ["Rank","FantasyPoints","Name","Team","Position","Targets","Targets.1","ReceivingTDs","AirYards","AirYardsPerGame","AverageTargetDistance","ReceivingLong","ReceivingYardsPerTarget","HogRate","Fumbles","FumblesLost","FantasyPointsPerTarget","FantasyPointsPerGame","AirYardsPerReception","RedZoneTargets","TargetAccuracy","RushingTouchdowns","RushingAttempts","Played","Year"], axis = 1)


#Scale data
scaler = StandardScaler().fit(X_qb)
X_scaled_qb = scaler.transform(X_qb)

scaler = StandardScaler().fit(X_rb)
X_scaled_rb = scaler.transform(X_rb)

scaler = StandardScaler().fit(X_te)
X_scaled_te = scaler.transform(X_te)

scaler = StandardScaler().fit(X_wr)
X_scaled_wr = scaler.transform(X_wr)

In [5]:
#Perform multiple linear regression on qb data
LinReg_model_qb  = LinearRegression(fit_intercept=True).fit(X_scaled_qb,y_qb)

#Print coefficients and associated features

for i, col in enumerate(X_qb.columns):
    print('{} is associated with {}'.format(col,LinReg_model_qb.coef_[i]))
    
#print intercept
LinReg_model_qb.intercept_

PassingYards is associated with -65.78538534375647
PassingTouchdowns is associated with 53.643582319235364
RushingAttempts is associated with 37.64010070330123
RushingYards is associated with -58.236377377089674
RushingTouchdowns is associated with 38.58368984022048
DeepBallAttempts is associated with 45.67280711564335
DeepBallCompletionPercentage is associated with 42.11143871605656
Interceptions is associated with -29.8045162640277
PlayerAgeExact is associated with -24.770882471555947
PressuredCompletionPercentage is associated with -27.450733217410068


195.7109677419355

In [6]:
#Perform multiple linear regression on rb data
LinReg_model_rb  = LinearRegression(fit_intercept=True).fit(X_scaled_rb,y_rb)

#Print coefficients and associated features

for i, col in enumerate(X_rb.columns):
    print('{} is associated with {}'.format(col,LinReg_model_rb.coef_[i]))
    
#print intercept
LinReg_model_rb.intercept_

Played is associated with -15.455060795214077
RushingAttempts is associated with -59.165040369228635
RushingYards is associated with 118.32763435272024
RushingYardsPerAttempt is associated with -13.095013036799333
RushingTouchdowns is associated with 16.544710359991843
Receptions is associated with 101.84719024273501
ReceivingYards is associated with -96.24492304175159
ReceivingTouchdowns is associated with 26.89470447481708
FumblesLost is associated with 13.220028219502463
OpportunityShare is associated with -32.43032771700204
PlayerAgeExact is associated with -5.928099844390036
SnapShare is associated with 6.765645096825048


109.4688311688312

In [7]:
#Perform multiple linear regression on te data
LinReg_model_te  = LinearRegression(fit_intercept=True).fit(X_scaled_te,y_te)

#Print coefficients and associated features

for i, col in enumerate(X_te.columns):
    print('{} is associated with {}'.format(col,LinReg_model_te.coef_[i]))
    
#print intercept
LinReg_model_te.intercept_

Played is associated with -5.7505348021691685
Receptions is associated with 20.1757373427057
ReceptionPercentage is associated with 8.86272759829209
ReceivingYards is associated with 43.12579602809996
ReceivingTouchdowns is associated with -6.861116525215129
AverageTargetDistance is associated with -4.623927540694199
EndzoneTargets is associated with -2.9869549105297777
FantasyPointsPerTarget is associated with -0.7584289235117562
SnapShare is associated with -20.551323777746813
TargetShare is associated with -3.2724383985776244


64.98181818181818

In [8]:
#Perform multiple linear regression on wr data
LinReg_model_wr  = LinearRegression(fit_intercept=True).fit(X_scaled_wr,y_wr)

#Print coefficients and associawrd features

for i, col in enumerate(X_wr.columns):
    print('{} is associated with {}'.format(col,LinReg_model_wr.coef_[i]))
    
#print inwrrcept
LinReg_model_wr.intercept_

ReceivingTargets is associated with -135.3847345008651
Receptions is associated with 120.92141466090501
ReceptionPercentage is associated with -27.222334011194643
ReceivingYards is associated with 10.099612907467456
ReceivingTouchdowns is associated with 2.1429942885372157
ReceivingYardsPerReception is associated with 2.4859158469241462
RushingYards is associated with 12.247453736535267
RushingYardsPerAttempt is associated with 6.963128762580964
EndzoneTargets is associated with 11.694617918043745
SnapShare is associated with 1.7776189008424088
TargetShare is associated with 29.57374683002212


94.85131578947359

In [9]:
#predict qb stats
X_predict_qb = predict_data[0].drop(columns = ["PlayerID","Rank","Name","Team","Position","PassingCompletions","PassingYardsPerAttempt","PassingRating","FantasyPointsPerGame","AirYardsPerAttempt","AirYardsPerGame","MoneyThrows","PassingCompletionPercentage","CompletionPercentage","ProtectionRate","TouchdownRate","PassAttemptsPerGame","AirYards","PassingAttempts","PassingInterceptions","RushingYardsPerAttempt","Played","Year","index","FantasyPoints"], axis = 1)
scaler = StandardScaler().fit(X_predict_qb)
X_predict_scaled_qb = scaler.transform(X_predict_qb)

QB_proj = LinReg_model_qb.predict(X_predict_scaled_qb)

In [10]:
#predict rb stats
X_predict_rb = predict_data[1].drop(columns = ["Rank","PlayerID","Name","Team","Position","BaseFrontCarryRate","LightFrontCarryRate","ShotgunCarryRate","StackedFrontCarryRate","Targets","ReceivingTargets","Carries","ReceivingTDs","Fumbles","UnderCenterCarryRate","UnderCenterYardsPerCarry","YardsPerCarry","FantasyPointsPerGame","AverageDefendersInTheBox","UnderCenterYardsPerCarry","ShotgunYardsPerCarry","Year"], axis = 1)
scaler = StandardScaler().fit(X_predict_rb)
X_predict_scaled_rb = scaler.transform(X_predict_rb)

rb_proj = LinReg_model_rb.predict(X_predict_scaled_rb)

In [11]:
#predict te stats
X_predict_te = predict_data[2].drop(columns = ["PlayerID","Rank","Name","Team","Position","Targets.1","Targets","ReceivingTDs","RushingAttempts","AirYards","ReceivingLong","ReceivingYardsPerTarget","RushingYards","RushingYardsPerAttempt","RushingTouchdowns","AirYardsPerReception","HogRate","TargetAccuracy","RedZoneTargets","ReceivingYardsPerReception","Fumbles","FumblesLost","FantasyPointsPerGame","ReceivingTargets","AirYardsPerGame","Year"], axis = 1)
scaler = StandardScaler().fit(X_predict_te)
X_predict_scaled_te = scaler.transform(X_predict_te)

te_proj = LinReg_model_te.predict(X_predict_scaled_te)

In [12]:
#predict wr stats
X_predict_wr = predict_data[3].drop(columns = ["PlayerID","Rank","Name","Team","Position","Targets","Targets.1","ReceivingTDs","AirYards","AirYardsPerGame","AverageTargetDistance","ReceivingLong","ReceivingYardsPerTarget","HogRate","Fumbles","FumblesLost","FantasyPointsPerTarget","FantasyPointsPerGame","AirYardsPerReception","RedZoneTargets","TargetAccuracy","RushingTouchdowns","RushingAttempts","Year","Played"], axis = 1)
scaler = StandardScaler().fit(X_predict_wr)
X_predict_scaled_wr = scaler.transform(X_predict_wr)

wr_proj = LinReg_model_wr.predict(X_predict_scaled_wr)

In [13]:
#create df for a visual representation of effectiveness for qbs
last_year_end_df[0][f"{predict_year}_proj"] = QB_proj

In [14]:
visual_df = pd.merge(last_year_end_df[0], results_df[0],on = "PlayerID")
visual_df.sort_values([f"{predict_year}_proj"], ascending=False)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
5,6,19801,Josh Allen,BUF,385.551152,1,BUF,399.86
0,1,19781,Lamar Jackson,BAL,354.443943,10,BAL,338.78
2,3,14536,Russell Wilson,SEA,333.862209,6,SEA,365.78
3,4,18857,Deshaun Watson,HOU,329.889082,5,HOU,373.32
1,2,18055,Dak Prescott,DAL,283.404112,32,DAL,129.54
6,7,18890,Patrick Mahomes,KC,274.114964,4,KC,372.4
7,8,20889,Kyler Murray,ARI,263.17074,3,ARI,386.74
18,19,20880,Gardner Minshew,JAX,255.09667,26,JAX,165.66
8,9,2593,Aaron Rodgers,GB,250.036843,2,GB,386.86
28,29,9038,Matthew Stafford,DET,249.007105,15,DET,258.56


In [15]:
#create df for a visual representation of effectiveness for qbs
last_year_end_df[1][f"{predict_year}_proj"] = rb_proj

In [16]:
visual_df = pd.merge(last_year_end_df[1], results_df[1],on = "PlayerID")
visual_df.sort_values([f"{predict_year}_proj"], ascending=False).head(50)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
1,2,17959,Derrick Henry,TEN,285.524249,1,TEN,335.1
0,1,18877,Christian McCaffrey,CAR,277.057369,51,CAR,78.4
3,4,17923,Ezekiel Elliott,DAL,272.516029,11,DAL,207.7
8,9,19119,Chris Carson,SEA,268.439502,16,SEA,139.8
5,6,19798,Nick Chubb,CLE,263.144332,9,CLE,209.7
7,8,13337,Mark Ingram II,BAL,251.963684,71,BAL,52.9
2,3,19045,Aaron Jones,GB,233.626294,5,GB,234.9
15,16,18878,Alvin Kamara,NO,212.010946,2,NO,317.8
11,12,16771,Todd Gurley II,LAR,195.953519,23,ATL,163.2
42,44,19003,Tarik Cohen,CHI,189.841567,117,CHI,17.5


In [17]:
#create df for a visual representation of effectiveness for qbs
last_year_end_df[2][f"{predict_year}_proj"] = te_proj

In [18]:
visual_df = pd.merge(last_year_end_df[2], results_df[2],on = "PlayerID")
visual_df.sort_values([f"{predict_year}_proj"], ascending=False)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
3,4,16964,Darren Waller,LV,191.025173,2,LV,172.6
0,1,15048,Travis Kelce,KC,185.705834,1,KC,180.6
2,3,19063,George Kittle,SF,174.101697,19,SF,101.1
8,9,18032,Tyler Higbee,LAR,138.144295,16,LAR,66.2
6,7,17963,Austin Hooper,ATL,124.520988,22,CLE,65.5
4,5,14856,Zach Ertz,PHI,121.531088,35,PHI,63.5
1,2,19803,Mark Andrews,BAL,120.453822,4,BAL,86.1
9,10,19863,Dallas Goedert,PHI,86.334781,20,PHI,80.4
11,12,722,Jason Witten,DAL,77.060946,61,LV,7.9
7,8,17975,Hunter Henry,LAC,73.370525,15,LAC,97.3


In [19]:
#create df for a visual representation of effectiveness for qbs
last_year_end_df[3][f"{predict_year}_proj"] = wr_proj

In [20]:
visual_df = pd.merge(last_year_end_df[3], results_df[3],on = "PlayerID")
visual_df.sort_values([f"{predict_year}_proj"], ascending=False).head(50)

Unnamed: 0,2019 Rank,PlayerID,Name,2019 Team,2020_proj,2020 Actual Rank,2020 Team,FantasyPoints
0,1,17960,Michael Thomas,NO,196.715364,102,NO,83.9
9,10,14986,DeAndre Hopkins,HOU,168.999531,9,ARI,219.8
17,18,16906,Stefon Diggs,MIN,157.671704,3,BUF,232.6
25,26,20932,Deebo Samuel,SF,156.312471,96,SF,68.7
8,9,21042,A.J. Brown,TEN,155.807138,10,TEN,111.5
51,54,17961,Sterling Shepard,NYG,144.060782,51,NYG,124.5
1,2,18880,Chris Godwin,TB,140.896303,30,TB,107.0
10,11,15076,Keenan Allen,LAC,139.296597,18,LAC,151.1
28,29,16470,Davante Adams,GB,136.362808,1,GB,144.4
16,17,19800,Courtland Sutton,DEN,134.225753,172,DEN,9.6


In [21]:
# x_range = np.linspace(test_2019_2020_df["FantasyPoints"].min(), test_2019_2020_df["FantasyPoints"].max(), 100)
# y_range = x_range
# fig = px.scatter(x = test_2019_2020_df["FantasyPoints"], y = test_2019_2020_df["QB2020_proj"], color = test_2019_2020_df["Name"], opacity=0.65, labels = dict(x="2020 Actual", y= "2020 Projected",color = "Player Name"))
# fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Ideal Fit'))
# fig.update_layout(legend_traceorder="reversed")
# fig.show()