In [1]:
import numpy as np
import pandas as pd
import clean as c
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
import rolling_mean as rm

## Read in the data

In [None]:
# Read one CSV per season (2004 through 2018) from the data/ directory and
# bind each resulting frame to its own s<year> name.
(s2004, s2005, s2006, s2007, s2008,
 s2009, s2010, s2011, s2012, s2013,
 s2014, s2015, s2016, s2017, s2018) = [
    pd.read_csv('data/{}.csv'.format(year)) for year in range(2004, 2019)
]

## Clean all seasons
There are two different ways of cleaning that are needed because New Orleans used to be the "Hornets" but now
Charlotte is the "Hornets". When changing team names to be consistent through all seasons, the order of cleaning matters.

In [None]:
# Seasons 2004-2013 go through the "pels" variant of the cleaner; 2014 onward
# use the standard one. Each s<year> name is rebound to its cleaned frame.
(s2004, s2005, s2006, s2007, s2008,
 s2009, s2010, s2011, s2012, s2013) = [
    c.all_clean_pels(season)
    for season in (s2004, s2005, s2006, s2007, s2008,
                   s2009, s2010, s2011, s2012, s2013)
]
(s2014, s2015, s2016, s2017, s2018) = [
    c.all_clean(season)
    for season in (s2014, s2015, s2016, s2017, s2018)
]

## Testing how to set up the rolling means
With rolling means there are a lot of rows that are filled with zeros. These need to be taken out of the feature matrix in order to properly train and test.

In [None]:
roll2018 = rm.complete_rolling_means(s2018,games_back=7,games_needed=5)

In [None]:
roll2018[roll2018['Min']==0]

## Setting the feature matrix for modelling
As I have it set up to this point, the feature matrix has two rows per game, one for each team. This needs to be combined into a single row.

In [None]:
mat = roll2018.as_matrix()

In [None]:
mat

In [None]:
mat2 = []
for i in range(int(len(mat)/2)):
    mat2.append(list(mat[2*i])+(list(mat[2*i+1])))

In [None]:
cols = ('team1_'+roll2018.columns).append('team2_'+roll2018.columns)

In [None]:
len(cols)

## Setting the indices
In order to set the indices for the new feature matrix, I need to grab the game ids from the old one. This would be grabbing each game id twice, so grabbing only the even row game id indices is required

In [None]:
roll2018.index.get_level_values('game_id')[::2]

In [None]:
df = pd.DataFrame(mat2,columns=cols,index=roll2018.index.get_level_values('game_id')[::2])

In [None]:
df.drop(['team2_Total_PTS','team1_Min','team2_Min','team1_home_team','team2_home_team','team1_starter','team2_starter'],axis=1,inplace=True)

## Final steps of setting up the feature matrix
I removed the first 100 rows of the feature matrix to ensure that no row has a team where the rolling mean is not calculated.

In [None]:
df_use = df[100:]

In [None]:
df_use

In [None]:
df_use.columns

# First Rolling Mean Model
This model uses a train/test split within a single season (2018).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out half of the games for testing. random_state pins the shuffle so the
# split, and every score computed from it, is identical on each run.
features = df_use.drop('team1_Total_PTS', axis=1)
target = df_use['team1_Total_PTS']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.5, random_state=42
)

## Grid search for lasso


In [None]:
parameters = {'alpha':[.93,.95,.97,.96,.94], 'normalize':[True, False],'tol':[.001,.002,.003,.00008]}

In [None]:
las = Lasso()

In [None]:
clf = GridSearchCV(las, parameters)

In [None]:
clf.fit(X_train,y_train)

In [None]:
clf.best_estimator_

In [None]:
las = Lasso(alpha=.97,tol=.003)

In [None]:
las.fit(X_train,y_train)

In [None]:
preds = las.predict(X_test)

## My predictions vs the actual total scores
A dataframe that contains the real score, my predictions, and the difference between the two

In [None]:
comp_df = pd.DataFrame(y_test.values,columns=['real'],index = y_test.index.get_level_values('game_id'))
comp_df['predictions']=preds
comp_df['differences']=comp_df['real']-comp_df['predictions']

In [None]:
comp_df

# Average difference between my prediction and the real score

In [None]:
# Mean absolute difference between actual and predicted scores, computed as a
# single vectorised reduction instead of a Python-level sum over the Series.
np.mean(np.abs(y_test.values - preds))

# Second Rolling Means Model
This model will combine seasons. 

In [None]:
import rolling_model as rmod
import matplotlib.pyplot as plt

## All seasons
Train on seasons 2004-2011. Passing train=True removes all games that went to overtime, since overtime adds extra playing time; those games should not be trained on.

In [None]:
# Seasons built with the default call: 2012 and 2014-2018.
# NOTE: 2013 is not built here; its original call was left commented out.
df2018, df2017, df2016, df2015, df2014, df2012 = [
    rmod.get_train_test(season, 6, 5)
    for season in (s2018, s2017, s2016, s2015, s2014, s2012)
]
# Seasons built with train=True: 2004-2011.
df2011, df2010, df2009, df2008, df2007, df2006, df2005, df2004 = [
    rmod.get_train_test(season, 6, 5, train=True)
    for season in (s2011, s2010, s2009, s2008, s2007, s2006, s2005, s2004)
]

## Train and Test sets
These dataframes need to be combined. Then the target needs to be taken out.

In [None]:
train = pd.concat((df2011,df2010,df2009,df2008,df2007,df2006,df2005,df2004))

In [None]:
test=pd.concat((df2018,df2017,df2016,df2015,df2014,df2012))

In [None]:
X_train=train.drop('team1_Total_PTS',axis=1)
X_test=test.drop('team1_Total_PTS',axis=1)
y_train=train['team1_Total_PTS']
y_test=test['team1_Total_PTS']

### Find the best parameters for the Lasso model

In [None]:
# Run the rolling_model parameter-search helper on the training split.
# NOTE(review): what it searches over and what it returns is defined in
# rolling_model.find_params — confirm there.
rmod.find_params(X_train,y_train)

### Get the lasso model, the R^2 value average difference, and the comparison dataframe

In [None]:
# rmod.model is unpacked into four results: the fitted model, an R^2 value,
# an average-difference figure and a comparison dataframe.
# NOTE(review): the trailing 1.5 and .00001 are presumably the lasso alpha and
# tol — confirm against rolling_model.model.
las, r2,diff, df2 = rmod.model(X_train,X_test,y_train,y_test,1.5,.00001)

### Look to see which features the lasso captured

In [None]:
names = list(train.columns)
names.remove('team1_Total_PTS')

In [None]:
names = np.array(names)

In [None]:
names

In [None]:
names[las.coef_!=0]

In [None]:
r2

In [None]:
diff

## Plot of real score vs predicted score

In [None]:
# Scatter of actual vs predicted scores, drawn through an explicit Axes handle.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(df2['real'], df2['predictions'])
ax.set_xlabel('Real')
ax.set_ylabel('Predictions')
ax.set_title('Real Score vs Predicted Score')
plt.show()

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
rf = RandomForestRegressor()

In [None]:
parameters = {'n_estimators':[40,50,60,70],'min_samples_leaf':[12,14,16]}

### Quick grid search

In [None]:
clf = GridSearchCV(rf,parameters)

In [None]:
clf.fit(X_train,y_train)

In [None]:
clf.best_estimator_

In [None]:
rf = RandomForestRegressor(n_estimators=60,min_samples_leaf=12)

In [None]:
rf = rf.fit(X_train,y_train)

In [None]:
pred = rf.predict(X_test)

### Both the R^2 and differences were worse

In [None]:
rf.score(X_test,y_test)

In [None]:
sum(np.abs(y_test-pred))/len(y_test)

# Ridge Model

In [None]:
from sklearn.linear_model import Ridge

In [None]:
rid = Ridge()

In [None]:
rid = rid.fit(X_train,y_train)

In [None]:
pred = rid.predict(X_test)

#### Out of the box score and difference

In [None]:
rid.score(X_test,y_test)

In [None]:
sum(np.abs(y_test-pred))/len(y_test)

In [None]:
rid = Ridge()

## Quick grid search

In [None]:
# Search grid for the ridge model (the duplicated `parameters =` was a typo).
parameters = {
    'alpha': [1000, 10000, 3000, 5000, 7000],
    'tol': [.001, .002, .003, .0009, .0008, .0007],
}

In [None]:
clf = GridSearchCV(rid,parameters)

In [None]:
clf.fit(X_train,y_train)

In [None]:
clf.best_estimator_

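## Something wrong with the grid search
The grid search returns 1000 as the best alpha; however, the test-set score can be improved by using a much higher alpha. Note that the grid search ranks alphas by cross-validation on the training seasons, while the comparison below scores on the test seasons, so the two need not agree.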

In [None]:
rid = Ridge(alpha=1000)

In [None]:
rid = rid.fit(X_train,y_train)

In [None]:
rid.score(X_test,y_test)

In [None]:
pred = rid.predict(X_test)

In [None]:
sum(np.abs(y_test-pred))/len(y_test)

### proof a higher alpha is better

In [None]:
# Same evaluation with a much larger alpha: print R^2, then display the mean
# absolute difference as the cell's output.
rid = Ridge(alpha=45000)
rid = rid.fit(X_train, y_train)
print(rid.score(X_test, y_test))
pred = rid.predict(X_test)
np.mean(np.abs(y_test.values - pred))

# Model using only 2018 as test

In [None]:
# 2018 is the only season built with the default call.
df2018 = rmod.get_train_test(s2018, 6, 5)
# Every other season is rebuilt with train=True.
# NOTE: 2013 is not built here; its original call was left commented out.
(df2017, df2016, df2015, df2014, df2012, df2011, df2010,
 df2009, df2008, df2007, df2006, df2005, df2004) = [
    rmod.get_train_test(season, 6, 5, train=True)
    for season in (s2017, s2016, s2015, s2014, s2012, s2011, s2010,
                   s2009, s2008, s2007, s2006, s2005, s2004)
]

In [None]:
# Train on 2011, 2012 and 2014-2017; hold out 2018.
# Seasons 2004-2010 are not included in this training pool.
train = pd.concat([df2017, df2016, df2015, df2014, df2012, df2011])
test = df2018
target_col = 'team1_Total_PTS'
X_train, y_train = train.drop(target_col, axis=1), train[target_col]
X_test, y_test = test.drop(target_col, axis=1), test[target_col]

In [None]:
# Run the rolling_model parameter-search helper on the new training split.
# NOTE(review): its search space and return value are defined in
# rolling_model.find_params — confirm there.
rmod.find_params(X_train,y_train)

In [None]:
# Unpack the four results of rmod.model for the 2018 hold-out.
# NOTE(review): the trailing .9 and .003 are presumably the lasso alpha and
# tol — confirm against rolling_model.model.
las, r2,diff, df2 = rmod.model(X_train,X_test,y_train,y_test,.9,.003)

In [None]:
# Second value returned by rmod.model.
r2

In [None]:
# Third value returned by rmod.model.
diff

In [None]:
# Fourth value returned by rmod.model.
df2

# Linear Regression Model
Check to see how a linear regression model performs when using only a subset of the features.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linreg = LinearRegression()

In [None]:
linreg = linreg.fit(X_train[['team1_+/-', 'team1_PTS', 'team1_REB', 'team1_TO',
       'team1_3PT_attempts', 'team1_FG_attempts', 'team1_DEF_PTS',
       'team2_+/-', 'team2_AST', 'team2_PTS', 'team2_REB', 'team2_TO',
       'team2_FT_attempts', 'team2_3PT_attempts', 'team2_FG_attempts',
       'team2_DEF_PTS']],y_train)

In [None]:
preds = linreg.predict(X_test[['team1_+/-', 'team1_PTS', 'team1_REB', 'team1_TO',
       'team1_3PT_attempts', 'team1_FG_attempts', 'team1_DEF_PTS',
       'team2_+/-', 'team2_AST', 'team2_PTS', 'team2_REB', 'team2_TO',
       'team2_FT_attempts', 'team2_3PT_attempts', 'team2_FG_attempts',
       'team2_DEF_PTS']])

In [None]:
linreg.score(X_test[['team1_+/-', 'team1_PTS', 'team1_REB', 'team1_TO',
       'team1_3PT_attempts', 'team1_FG_attempts', 'team1_DEF_PTS',
       'team2_+/-', 'team2_AST', 'team2_PTS', 'team2_REB', 'team2_TO',
       'team2_FT_attempts', 'team2_3PT_attempts', 'team2_FG_attempts',
       'team2_DEF_PTS']],y_test)

In [None]:
sum(np.abs(y_test-preds))/len(y_test)