# Setup for Dynasty Fantasy Football Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

# Data Import, Cleaning, and Separation by Position

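Using the **glob** library, we loop over every season's CSV file and concatenate them into a single DataFrame, adding a **Year** column so we know which season each row of data is from. The year is assigned by counting up from 1970, one file at a time, so this relies on the files being processed in chronological order.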

In [24]:
# glob() returns paths in arbitrary, OS-dependent order, but the Year counter
# below assumes the files arrive oldest-first. Sorting makes the labelling
# deterministic.
# NOTE(review): assumes the file names sort chronologically (e.g. 1970.csv,
# 1971.csv, ...) with one file per season and no gaps - confirm against data/yearly.
fnames = sorted(glob('data/yearly/*.csv'))
df_list = []
Year = 1970
for filename in fnames:
    df = pd.read_csv(filename)
    df['Year'] = Year  # tag every row with the season this file represents
    Year += 1
    df_list.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(df_list, axis=0, ignore_index=True)

display(df.head())
df.shape

Unnamed: 0.1,Unnamed: 0,Player,Tm,Pos,Age,G,GS,Cmp,Att,Yds,...,PassingTD,PassingAtt,RushingYds,RushingTD,RushingAtt,ReceivingYds,ReceivingTD,FantasyPoints,Year,Tgt
0,0,Ron Johnson,NYG,RB,23.0,14.0,14.0,0.0,0.0,0.0,...,0.0,0.0,1027.0,8.0,263.0,487.0,4.0,261.4,1970,
1,1,Dick Gordon,CHI,WR,26.0,14.0,14.0,0.0,0.0,0.0,...,0.0,0.0,17.0,0.0,4.0,1026.0,13.0,249.3,1970,
2,2,Gene Washington,SFO,WR,23.0,13.0,13.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1100.0,12.0,235.0,1970,
3,3,Gary Garrison,SDG,WR,26.0,14.0,14.0,0.0,0.0,0.0,...,0.0,0.0,7.0,0.0,4.0,1006.0,12.0,217.3,1970,
4,4,MacArthur Lane,STL,RB,28.0,14.0,14.0,0.0,0.0,0.0,...,0.0,0.0,977.0,11.0,206.0,365.0,2.0,240.2,1970,


(26128, 29)

## Usual Data Cleaning

In [25]:
# The concatenated frame was built with ignore_index=True, so it already has a
# clean 0..n-1 index; no reset_index / drop('index') round trip is needed.
# Dropped here: the leftover CSV index column, the unprefixed Att/Yds variants,
# and the source's FantasyPoints column (recomputed below with our own scoring).
drop_columns = ['Unnamed: 0','Att','Yds','Att.1','Yds.1','Yds.2','FantasyPoints']

# Build a new frame instead of mutating in place.
df = (
    df.drop(columns=drop_columns)
      .rename(columns={
          'Tm':'Team',
          'Pos':'Position',
          'G':'GamesPlayed',
          'GS':'GamesStarted',
          'Cmp':'Completions',
          'Rec':'Receptions',
          'Y/R':'Yards/Reception',
          'Int':'Interceptions',
          'Tgt':'Targets'
      })
)

# Points awarded per reception (1 = full PPR scoring).
PPR = 1
df['FantasyPoints'] = (df['PassingYds']*0.04 + df['PassingTD']*4 - df['Interceptions']*2
                       + df['RushingYds']*0.1 + df["RushingTD"]*6
                       + df['Receptions']*PPR + df['ReceivingYds']*0.1 + df['ReceivingTD']*6
                       - df['FumblesLost']*2)

# Treat zero games played as missing so the per-game rate becomes NaN rather
# than +/-inf (inf values would break the imputer/scaler used later on).
games_played = df['GamesPlayed'].replace(0, np.nan)
df['FantasyPoints/Game'] = (df['FantasyPoints'] / games_played).round(2)

# Explicit copies: later cells add columns to these per-position frames, and
# writing to a plain boolean-mask slice triggers SettingWithCopyWarning.
df_rb = df[df['Position'] == 'RB'].copy()
df_qb = df[df['Position'] == 'QB'].copy()
df_wr = df[df['Position'] == 'WR'].copy()
df_te = df[df['Position'] == 'TE'].copy()

In [26]:
# Stat columns grouped by play type; each position keeps the groups relevant to it.
rushing_columns = [
    'RushingAtt',
    'RushingYds',
    'RushingTD',
]
receiving_columns = [
    'Targets',
    'Receptions',
    'ReceivingYds',
    'Yards/Reception',
    'ReceivingTD',
]
passing_columns = [
    'PassingAtt',
    'PassingYds',
    'PassingTD',
    'Interceptions',
]

def transform_columns(df, new_column_list):
    """Return a copy of ``df`` restricted to the shared and position-specific columns.

    The result always contains, in this order: 'Player', 'Team', 'Age',
    'GamesPlayed', then every name in ``new_column_list``, then 'Fumbles',
    'FantasyPoints', 'FantasyPoints/Game' and 'Year'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns listed above.
    new_column_list : list of str
        Position-specific stat columns to keep, in the desired order.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so callers can add feature columns to it without
        triggering SettingWithCopyWarning or touching the input frame.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    leading = ['Player', 'Team', 'Age', 'GamesPlayed']
    trailing = ['Fumbles', 'FantasyPoints', 'FantasyPoints/Game', 'Year']
    selected = leading + list(new_column_list) + trailing
    return df[selected].copy()

# Narrow each position frame to the stat groups that matter for that position;
# the primary group is listed first so it leads the column order.
df_rb, df_wr, df_te, df_qb = (
    transform_columns(position_frame, stat_columns)
    for position_frame, stat_columns in (
        (df_rb, rushing_columns + receiving_columns),
        (df_wr, receiving_columns + rushing_columns),
        (df_te, receiving_columns),
        (df_qb, passing_columns + rushing_columns),
    )
)

# Running Back Model

## Feature Engineering

In [41]:
def _rate(numerator, denominator):
    """Return numerator / denominator rounded to 2 decimals; NaN where the denominator is 0."""
    # Mapping 0 -> NaN keeps +/-inf out of the features (inf would make the
    # imputer/scaler in the modelling pipeline fail).
    return (numerator / denominator.replace(0, np.nan)).round(2)


# Build every engineered column in one assign() call: this returns a new frame
# (no writes onto a slice, so no SettingWithCopyWarning) and the cell can be
# re-run safely. Later entries may reference columns created by earlier ones.
df_rb = df_rb.assign(**{
    # Rushing efficiency.
    'Yards/Att': lambda d: _rate(d['RushingYds'], d['RushingAtt']),
    # Opportunities: carries plus targets (NaN wherever Targets is missing).
    'Usage': lambda d: d['RushingAtt'] + d['Targets'],
    'Usage/Game': lambda d: _rate(d['Usage'], d['GamesPlayed']),
    # Actual touches: carries plus receptions.
    'Touches': lambda d: d['RushingAtt'] + d['Receptions'],
    'Touches/Game': lambda d: _rate(d['Touches'], d['GamesPlayed']),
    'RushingAtt/Game': lambda d: _rate(d['RushingAtt'], d['GamesPlayed']),
})

In [42]:
# Inspect the running-back frame with the engineered columns appended
# (the rendered table below is abbreviated).
df_rb

Unnamed: 0,Player,Team,Age,GamesPlayed,RushingAtt,RushingYds,RushingTD,Targets,Receptions,ReceivingYds,...,Fumbles,FantasyPoints,FantasyPoints/Game,Year,Yards/Att,Usage,Usage/Game,Touches,Touches/Game,RushingAtt/Game
0,Ron Johnson,NYG,23.0,14.0,263.0,1027.0,8.0,,48.0,487.0,...,5.0,271.4,19.39,1970,3.90,,,311.0,22.21,18.79
4,MacArthur Lane,STL,28.0,14.0,206.0,977.0,11.0,,32.0,365.0,...,2.0,244.2,17.44,1970,4.74,,,238.0,17.00,14.71
9,Larry Brown,WAS,23.0,13.0,237.0,1125.0,5.0,,37.0,341.0,...,6.0,225.6,17.35,1970,4.75,,,274.0,21.08,18.23
13,Ken Willard,SFO,27.0,14.0,236.0,789.0,7.0,,31.0,259.0,...,3.0,195.8,13.99,1970,3.34,,,267.0,19.07,16.86
14,Bo Scott,CLE,27.0,13.0,151.0,625.0,7.0,,40.0,351.0,...,4.0,203.6,15.66,1970,4.14,,,191.0,14.69,11.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,Dexter Williams,GNB,22.0,4.0,5.0,11.0,0.0,0.0,0.0,0.0,...,0.0,1.1,0.28,2019,2.20,5.0,1.25,5.0,1.25,1.25
26058,James Develin,NWE,31.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.3,0.15,2019,1.50,2.0,1.00,2.0,1.00,1.00
26091,Alfred Morris,ARI,31.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.40,2019,4.00,1.0,1.00,1.0,1.00,1.00
26094,Roosevelt Nix,PIT,27.0,3.0,0.0,0.0,0.0,3.0,2.0,4.0,...,0.0,2.4,0.80,2019,,3.0,1.00,2.0,0.67,0.00


## RB Model

In [43]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

## Selecting Features

In [44]:
# Target: season-total fantasy points.
y = df_rb['FantasyPoints']

# Features: every remaining column except the player name, the target itself,
# and the season.
# NOTE(review): X still contains 'FantasyPoints/Game' and 'GamesPlayed', whose
# product is the target, plus the raw stats the target is computed from - this
# is target leakage and explains the near-perfect scores further down.
X = df_rb.drop(columns=['Player', 'FantasyPoints', 'Year'])

In [45]:
# First split: half of the rows go to training.
X_train, X_validtest, y_train, y_validtest = train_test_split(
    X, y, test_size=0.5, random_state=0
)
# Second split: the held-out half is divided 80/20 into validation and test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_validtest, y_validtest, test_size=0.2, random_state=0
)
for split_label, split_frame in (("train", X_train), ("valid", X_valid), ("test ", X_test)):
    print(f"X {split_label} shape: {split_frame.shape}")

X train shape: (3883, 19)
X valid shape: (3107, 19)
X test  shape: (777, 19)


In [46]:
# Split the feature columns by dtype: 64-bit numbers vs. strings/booleans.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

numerical_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

print(f'Numerical features are: {numerical_features}')
print(f'Categorical features are: {categorical_features}')

Numerical features are: Index(['Age', 'GamesPlayed', 'RushingAtt', 'RushingYds', 'RushingTD',
       'Targets', 'Receptions', 'ReceivingYds', 'Yards/Reception',
       'ReceivingTD', 'Fumbles', 'FantasyPoints/Game', 'Yards/Att', 'Usage',
       'Usage/Game', 'Touches', 'Touches/Game', 'RushingAtt/Game'],
      dtype='object')
Categorical features are: Index(['Team'], dtype='object')


## Creating Baseline Data Pipelines

In [47]:
# Numeric columns as a plain list (the column-name concatenation after the
# ColumnTransformer is fitted needs a list, not a pandas Index).
numerical_features = [
    'Age', 'GamesPlayed',
    'RushingAtt', 'RushingYds', 'RushingTD',
    'Targets', 'Receptions', 'ReceivingYds', 'Yards/Reception', 'ReceivingTD',
    'Fumbles', 'FantasyPoints/Game', 'Yards/Att',
    'Usage', 'Usage/Game', 'Touches', 'Touches/Game', 'RushingAtt/Game',
]

# Fill missing values with the column median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(numerical_steps)

In [48]:
categorical_features = ['Team']

# Fill missing teams with the most common one, then one-hot encode.
# handle_unknown='ignore' encodes a team that was never seen during fit as an
# all-zero row instead of raising when the validation/test data is transformed.
categorical_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('OHE',OneHotEncoder(handle_unknown='ignore')),
])

In [49]:
# Route numeric and categorical columns through their own sub-pipelines.
data_pipeline = ColumnTransformer(
    transformers=[
        ("numerical_pipe", numerical_pipeline, numerical_features),
        ("categorical_pipe", categorical_pipeline, categorical_features),
    ],
    # Always return a dense array: the one-hot block is sparse, and with the
    # default threshold the combined output is only dense when its density
    # happens to exceed 0.3, which the DataFrame preview below depends on.
    sparse_threshold=0,
    n_jobs=-1,  # Minimizes runtime by using all cores of machine
)
X_train_transformed = data_pipeline.fit_transform(X_train)

# Recover the generated one-hot column names. Look the encoder up by name
# rather than by position, and support both the current scikit-learn API
# (get_feature_names_out) and the older one (get_feature_names).
one_hot_encoder = data_pipeline.named_transformers_["categorical_pipe"].named_steps["OHE"]
if hasattr(one_hot_encoder, "get_feature_names_out"):
    one_hot_names = one_hot_encoder.get_feature_names_out(categorical_features)
else:
    one_hot_names = one_hot_encoder.get_feature_names(categorical_features)

# Output column order follows the transformer order: numeric first, then one-hot.
column_names = list(numerical_features) + list(one_hot_names)

display(pd.DataFrame(X_train_transformed, columns=column_names).head())

Unnamed: 0,Age,GamesPlayed,RushingAtt,RushingYds,RushingTD,Targets,Receptions,ReceivingYds,Yards/Reception,ReceivingTD,...,Team_PIT,Team_RAI,Team_RAM,Team_SDG,Team_SEA,Team_SFO,Team_STL,Team_TAM,Team_TEN,Team_WAS
0,-0.309211,-1.19202,-0.850189,-0.816085,-0.682657,-0.253959,-0.891253,-0.801331,1.392812,-0.5447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.476433,0.888463,-0.37646,-0.430622,0.227006,-0.253959,-0.355121,-0.196476,0.736476,-0.5447,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-1.380598,0.888463,0.097269,0.139652,-0.076215,2.821668,2.206402,1.483677,-0.148363,0.429987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.762175,-0.72969,-0.342622,-0.156046,0.227006,-0.034271,0.002301,0.011863,0.269748,-0.5447,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-1.023469,-1.423184,-0.838909,-0.813445,-0.682657,-1.022866,-0.891253,-0.828214,0.420462,-0.5447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Running Linear Regression

In [50]:
# Full model: preprocessing followed by ordinary least-squares regression.
regression_steps = [
    ('data_pipe', data_pipeline),
    ('LR', LinearRegression()),
]
clf_pipe = Pipeline(regression_steps)

# Fit on the training split only.
clf_pipe.fit(X_train, y_train)

# Predicted season fantasy points for each split, in train/valid/test order.
y_train_preds, y_valid_preds, y_test_preds = (
    clf_pipe.predict(split_features)
    for split_features in (X_train, X_valid, X_test)
)

In [51]:
# Pipeline.score defers to the final LinearRegression step, so these values
# are R^2 (coefficient of determination) scores rather than classification
# accuracies, despite the variable names.
Accuracy_train, Accuracy_valid, Accuracy_test = (
    clf_pipe.score(split_features, split_target)
    for split_features, split_target in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

In [52]:
# Score on the training split, i.e. clf_pipe.score(X_train, y_train).
print(Accuracy_train)

0.9996325730147105


In [53]:
# Score on the validation split, i.e. clf_pipe.score(X_valid, y_valid).
print(Accuracy_valid)

0.9996552906209444


In [54]:
# Score on the held-out test split, i.e. clf_pipe.score(X_test, y_test).
print(Accuracy_test)

0.999603064489763


In [55]:
# Fitted LinearRegression weights, one per transformed input column, in the
# order the ColumnTransformer lists its transformers (scaled numeric columns
# first, then the one-hot Team columns).
clf_pipe['LR'].coef_

array([ 4.43519843e-03, -8.52544867e-02,  5.39107739e-01,  3.58756099e+01,
        1.88900465e+01,  7.01819784e-02,  1.68551992e+01,  1.46069780e+01,
       -6.58593277e-02,  5.85825583e+00, -4.96995796e-01,  3.61173385e+00,
       -3.28139345e-02, -1.74664459e+00,  2.07642109e-01,  3.31746071e+00,
       -8.95186904e+00,  5.54581912e+00,  4.60463856e-02, -7.51787974e-01,
        8.37812743e-01, -5.50827818e-01,  2.01875391e-02, -1.67053971e-01,
        2.05663908e-01,  1.05308045e-01, -3.17598160e-01,  2.46581905e-01,
       -9.27186499e-02,  2.76121220e-02,  1.51091284e-02, -8.04085273e-02,
       -9.04451229e-02,  2.74055006e-01,  1.06587820e-01, -1.15277069e-01,
       -3.53848683e-01,  1.08049072e-01, -6.99930969e-01, -1.22770425e-01,
       -1.45462305e-01, -6.92007598e-02,  1.54116647e-01,  9.29787892e-02,
        1.01033707e-01,  1.86630444e-01, -9.98562071e-02,  3.20674487e-01,
        3.44029440e-01, -5.13767737e-02,  8.51399811e-01,  3.67075058e-01,
        3.21698285e-01, -

# Conclusion

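Although the setup of this model is relatively reusable, it is not a good model. A score above 0.99 looks good, but it is a major red flag. (Strictly speaking, the number reported is the regression's R² score rather than an accuracy.) Because the training, validation, and test scores are all nearly identical, the problem is less classic overfitting than target leakage: most of the features that go into the model are just stats, so of course they predict Fantasy Points extremely well! They are the numbers that calculate it!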

In order to actually try to create a predictive model, a lot more categorical data is needed, and my next step is to identify what data is needed, how to get it, and how best to apply it to the model.