In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as sp
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw career table. 'Unnamed: 0' is presumably the index column written
# by an earlier to_csv call -- TODO confirm against the producer of career1.csv.
career = pd.read_csv('Data/career1.csv')
career = career.set_index('Unnamed: 0')

#filter bad columns
anyonecan = ['Longest Pass','Longest Reception','Longest Rushing Run']
messedup = ['Seasons','Fumbles']
career = career.drop(messedup + anyonecan,axis=1)

###### make new columns
# Each derived column is the element-wise sum of two existing columns.
career['SB'] = career['SB Win'] + career['SB Loss']
career['RRYd'] = career['Rushing Yards adj'] + career['Receiving Yards adj']
career['RRTD'] = career['Rushing TDs adj'] + career['Receiving TDs adj']

#make appendix
# One row per column of `career` with its dtype. `career.dtypes` already holds
# every column's dtype in column order, so the table is built in a single step
# instead of re-masking the appendix once per column.
appendix = pd.DataFrame({
    'Columns': career.columns.values,
    'Data Type': career.dtypes.values,
})

#current players
# Taken from the full table (no 'First Year' restriction), before the training filter.
currentplayers = career[career['Last Year']>2005]
#training data
# Same rows as filtering on 'First Year' and then on 'Last Year' in two steps.
career = career[(career['First Year']>1960) & (career['Last Year']<2006)]
career.shape

(2153, 52)

## Lasso

In [3]:
from sklearn.linear_model import Lasso, Ridge, LassoCV, LogisticRegression
from sklearn.feature_selection import SelectKBest, SelectFromModel

#separate predictor columns
# Everything except these identifier / target columns is used as a predictor.
text_hof = ['Name','Player Id','HOF','Position']
data = career.drop(text_hof,axis=1)

#standardize features
scaler = StandardScaler()
X = scaler.fit_transform(data)

#calculate # of features for different alpha levels
# For every candidate alpha, fit a Lasso on the standardized predictors against
# 'HOF' and record how many coefficients are non-zero. The rows are collected in
# a list and the frame is built once, so both columns get proper numeric dtypes
# (growing an empty frame cell-by-cell with .loc leaves them as object dtype).
alpha_rows = []
for a in [1,.1,.01,.001,.0001]:
    clf = Lasso(alpha=a)
    clf.fit(X,career['HOF'])
    alpha_rows.append({
        'alpha': a,
        '# of features': int(np.count_nonzero(clf.coef_)),
    })
new_df = pd.DataFrame(alpha_rows, columns=['alpha','# of features'])
new_df



Unnamed: 0,alpha,# of features
0,1.0,0
1,0.1,0
2,0.01,10
3,0.001,24
4,0.0001,40


In [4]:
# Refit the Lasso at the chosen alpha (.01) and keep every predictor whose
# coefficient is non-zero, ordered by decreasing absolute coefficient.
clf = Lasso(alpha=.01)
clf.fit(X, career['HOF'])

coef_magnitude = abs(clf.coef_)
df = pd.DataFrame({'columns': data.columns, 'doef': coef_magnitude})
df = df.sort_values('doef', ascending=False)

nonzero_mask = df['doef'] > 0
columns_wanted = df.loc[nonzero_mask, 'columns'].tolist()
columns_wanted

['RRTD',
 'SB MVP',
 'Receiving Yards Per Game',
 'Rushing Yards adj',
 'MVP',
 'PGWD',
 'TD Passes adj',
 'RRYd',
 'Passing Yards Per Game',
 'SB']

In [5]:
# Reduce both tables to the Lasso-selected predictors followed by the
# identifier / target columns listed in text_hof.
selected_columns = columns_wanted + text_hof
career = career[selected_columns]
currentplayers = currentplayers[selected_columns]
career.shape

(2153, 14)

In [6]:
# Write the reduced training table and the reduced current-player table to disk.
career.to_csv(path_or_buf='Data/career.csv')
currentplayers.to_csv(path_or_buf='Data/currentplayers.csv')

In [7]:
#create an appendix
# One row per column of the reduced `career` frame, paired with that column's
# dtype. `career.dtypes` is already ordered like `career.columns`, so the table
# is built in one step rather than masking the appendix once per column (which
# also relied on str(label) lookups that fail for non-string or duplicate labels).
appendix = pd.DataFrame({
    'Columns': career.columns.values,
    'Data Type': career.dtypes.values,
})

appendix

Unnamed: 0,Columns,Data Type
0,RRTD,float64
1,SB MVP,float64
2,Receiving Yards Per Game,float64
3,Rushing Yards adj,float64
4,MVP,float64
5,PGWD,float64
6,TD Passes adj,float64
7,RRYd,float64
8,Passing Yards Per Game,float64
9,SB,float64
