In [24]:
# import required libraries
import numpy as np
import pandas as pd

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# import datasets into dataframes, drop unnecessary columns
game_df = pd.read_csv("game_data.csv", sep=',')
player_df = pd.read_csv("player_data.csv", sep=',')
train_df = pd.read_csv("training_set.csv", sep=',')
test_df = pd.read_csv("test_set.csv", sep=',')
team_df = pd.read_csv("team_data.csv", sep=',')

game_df = game_df[['Season', 'Game_ID', 'Game_Date', 'Team', 'Location', 'Wins_Entering_Gm', 'Losses_Entering_Gm']]
player_df = player_df[['Season', 'Game_ID', 'Game_Date', 'Team', 'Person_ID', 'Name', 'ASG_Team', 'Active_Status']]


In [3]:
print list(game_df)
print list(player_df)
print list(train_df)
print list(test_df)
print list(team_df)

print test_df[0:3]
print team_df[0:3] # my outside data

['Season', 'Game_ID', 'Game_Date', 'Team', 'Location', 'Wins_Entering_Gm', 'Losses_Entering_Gm']
['Season', 'Game_ID', 'Game_Date', 'Team', 'Person_ID', 'Name', 'ASG_Team', 'Active_Status']
['Season', 'Game_ID', 'Game_Date', 'Away_Team', 'Home_Team', 'Country', 'Rounded Viewers']
['Season', 'Game_ID', 'Game_Date', 'Away_Team', 'Home_Team', 'Total_Viewers']
['Team', 'Market_Size', 'Championships', 'Playoffs', 'Twitter']
    Season   Game_ID   Game_Date Away_Team Home_Team  Total_Viewers
0  2016-17  21600010  10/26/2016       DEN       NOP            NaN
1  2016-17  21600023  10/28/2016       HOU       DAL            NaN
2  2016-17  21600027  10/29/2016       BOS       CHA            NaN
  Team  Market_Size  Championships  Playoffs  Twitter
0  MIL         2255              1         7    0.462
1  GSW         6750              4         5    1.688
2  MIN         4667              0         4    0.458


In [4]:
# major rivalries aquired by Wikipedia
rivalry = [('CLE', 'GSW'), ('BOS', 'LAL'), ('LAL', 'DET'), ('PHI', 'BOS'), ('BOS', 'NYK'), 
           ('NYK', 'BKN'), ('CHI', 'DET'), ('CHI', 'CLE'), ('CHI', 'MIA'), ('CHI', 'NYK'), 
           ('BOS', 'DET'), ('MIA', 'NYK'), ('NYK', 'IND'), ('LAL', 'LAC'), ('DAL', 'HOU'),
           ('HOU', 'SAS'), ('UTA', 'HOU'), ('LAL', 'SAS'), ('SAS', 'PHX')]

In [5]:
# create training/test set for 2016 model, we add features to this later
model_train_X = train_df[['Season', 'Game_ID', 'Game_Date', 'Away_Team', 'Home_Team']].drop_duplicates().reset_index(drop=True) 
model_output_X = test_df[['Season', 'Game_ID', 'Game_Date', 'Away_Team', 'Home_Team']]

games = list(model_train_X['Game_ID'])
# add up total viewers for each game in training set
model_train_y = []
for game in games:
    model_train_y.append(sum(list(train_df.loc[train_df['Game_ID'] == game]['Rounded Viewers']))) 
    
print len(model_train_X), len(model_train_y), len(model_output_X)


2000 2000 460


In [7]:
# columns to add to train/test sets
cols = ['Home_All_Stars', 'Away_All_Stars', 'Home_Market_Size', 'Away_Market_Size', 
            'Home_Championships', 'Away_Championships', 'Home_Playoffs', 'Away_Playoffs',
            'Home_Twitter', 'Away_Twitter', 'Home_Win_Pct', 'Away_Win_Pct']

model_train_X['Year'] = 0
model_train_X['Rivalry'] = 0
model_output_X['Year'] = 0
model_output_X['Rivalry'] = 0

for c in cols:
    model_train_X[c] = 0.0
    model_output_X[c] = 0.0
    model_train_X[c] = 0.0
    model_output_X[c] = 0.0
    
metrics = ['Market_Size', 'Championships', 'Playoffs', 'Twitter']

def add_features(set_X, num_rows):
    
    i = 0
    games = list(set_X['Game_ID'])
    

    for game in games[0:num_rows]:
        # determine home and away team and winning percentages
        
        h_row = game_df.loc[(game_df['Game_ID'] == game) & (game_df['Location'] == 'H')]
        a_row = game_df.loc[(game_df['Game_ID'] == game) & (game_df['Location'] == 'A')]
        
        h_team = list(h_row['Team'])[0]
        a_team = list(a_row['Team'])[0]
        
        h_win = float(list(h_row['Wins_Entering_Gm'])[0])
        h_loss = float(list(h_row['Losses_Entering_Gm'])[0])
        
        a_win = float(list(a_row['Wins_Entering_Gm'])[0])
        a_loss = float(list(a_row['Losses_Entering_Gm'])[0])
        
        if (h_win + h_loss != 0):
            h_pct = h_win / (h_win + h_loss)
        else:
            h_pct = 0 # opening night, add in case where first games were away
            
        if (a_win + a_loss != 0):
            a_pct = a_win / (a_win + a_loss)
        else:
            a_pct = 0
    
        # determine home and away active all stars
        h_all_stars = len(player_df.loc[(player_df['Team'] == h_team) 
                                        & (player_df['Game_ID'] == game) 
                                        & (player_df['ASG_Team'] != 'None') 
                                        & (player_df['Active_Status'] == 'Active')])
    
        a_all_stars = len(player_df.loc[(player_df['Team'] == a_team) 
                                        & (player_df['Game_ID'] == game) 
                                        & (player_df['ASG_Team'] != 'None') 
                                        & (player_df['Active_Status'] == 'Active')])
        
        # determine home and away team metrics
        h_team_df = team_df.loc[team_df['Team'] == h_team]
        a_team_df = team_df.loc[team_df['Team'] == a_team]
        for metric in metrics:
            set_X.at[i, 'Home_' + metric] = h_team_df[metric]
            set_X.at[i, 'Away_' + metric] = a_team_df[metric]
    
        set_X.at[i, 'Home_All_Stars'] = h_all_stars
        set_X.at[i, 'Away_All_Stars'] = a_all_stars
        set_X.at[i, 'Home_Win_Pct'] = h_pct
        set_X.at[i, 'Away_Win_Pct'] = a_pct
        
        # convert season to a new binary feature
        if (set_X['Season'][i] == '2016-17'):
            set_X.at[i, 'Year'] = 0
        elif (set_X['Season'][i] == '2017-18'):
            set_X.at[i, 'Year'] = 1
            
        # add in if there was a rivalry as a binary feature
        if ((h_team, a_team) in rivalry) or ((a_team, h_team) in rivalry):
            set_X.at[i, 'Rivalry'] = 1
        
        # print game, h_team, a_team, h_all_stars, a_all_stars, h_pct, a_pct
        
        i += 1
        
    print set_X[0:3]

add_features(model_train_X, len(model_train_X))
add_features(model_output_X, len(model_output_X))


    Season   Game_ID   Game_Date Away_Team Home_Team  Year  Rivalry  \
0  2016-17  21600001  10/25/2016       NYK       CLE     0        0   
1  2016-17  21600003  10/25/2016       SAS       GSW     0        0   
2  2016-17  21600002  10/25/2016       UTA       POR     0        0   

   Home_All_Stars  Away_All_Stars  Home_Market_Size  Away_Market_Size  \
0             3.0             1.0            4053.0           19995.0   
1             4.0             1.0            6750.0            2193.0   
2             0.0             0.0            3010.0            2505.0   

   Home_Championships  Away_Championships  Home_Playoffs  Away_Playoffs  \
0                 1.0                 2.0            7.0            5.0   
1                 4.0                 5.0            5.0           16.0   
2                 1.0                 0.0            9.0            8.0   

   Home_Twitter  Away_Twitter  Home_Win_Pct  Away_Win_Pct  
0         1.138         1.393           0.0           0.0  
1

In [11]:
# subsetting the training dataset
print model_train_X[0:3]
drop_cols = ['Season', 'Game_ID','Game_Date', 'Away_Team', 'Home_Team']

# make train set of 1200
model_sub_X = model_train_X.drop(drop_cols, axis=1)[0 : 1500]
model_sub_y = model_train_y[0 : 1500]

# make validation set that we will use to optimise our model
model_val_X = model_train_X.drop(drop_cols, axis=1)[1500 : 2000]
model_val_y = model_train_y[1500 : 2000]

# make test set that we won't touch until we're completely done
model_test_X = model_train_X.drop(drop_cols, axis=1)[1600 : ]
model_test_y = model_train_y[1600 : ]


    Season   Game_ID   Game_Date Away_Team Home_Team  Year  Rivalry  \
0  2016-17  21600001  10/25/2016       NYK       CLE     0        0   
1  2016-17  21600003  10/25/2016       SAS       GSW     0        0   
2  2016-17  21600002  10/25/2016       UTA       POR     0        0   

   Home_All_Stars  Away_All_Stars  Home_Market_Size  Away_Market_Size  \
0             3.0             1.0            4053.0           19995.0   
1             4.0             1.0            6750.0            2193.0   
2             0.0             0.0            3010.0            2505.0   

   Home_Championships  Away_Championships  Home_Playoffs  Away_Playoffs  \
0                 1.0                 2.0            7.0            5.0   
1                 4.0                 5.0            5.0           16.0   
2                 1.0                 0.0            9.0            8.0   

   Home_Twitter  Away_Twitter  Home_Win_Pct  Away_Win_Pct  
0         1.138         1.393           0.0           0.0  
1

In [12]:
def mape(predict, actual):
    n = len(predict)
    sum_score = 0.0
    for i in xrange(n):
        sum_score += abs(predict[i] - actual[i]) / float(actual[i])
    return (1 / float(len(predict))) * sum_score

In [23]:
# linear regression

reg = linear_model.LinearRegression()
reg.fit(model_sub_X, model_sub_y)
model_val_pred = reg.predict(model_val_X)
print model_val_pred[0:3]
print model_val_y[0:3]

print mape(model_val_pred, model_val_y)
print r2_score(model_val_y, model_val_pred)

[ 17343.4577342   14929.47494793  20616.01329005]
[15158, 12349, 17122]
0.631129994089
0.348098673079


In [34]:
def reg_model(model):
    reg = model
    reg.fit(model_sub_X, model_sub_y)
    model_val_pred = reg.predict(model_val_X)
    print model_val_pred[0:3], model_val_y[0:3]
    print mape(model_val_pred, model_val_y)
    print r2_score(model_val_y, model_val_pred)
    return None

#random forest
reg = RandomForestRegressor(max_depth=10, random_state=0)
reg.fit(model_sub_X, model_sub_y)
print list(model_sub_X)
print(reg.feature_importances_)
model_val_pred = reg.predict(model_val_X)
print model_val_pred[0:3], model_val_y[0:3]
print mape(model_val_pred, model_val_y)
print r2_score(model_val_y, model_val_pred)

# naive bayes
reg_model(GaussianNB())

# support vector machine
reg_model(SVR(C=1.0, epsilon=0.2))

['Year', 'Rivalry', 'Home_All_Stars', 'Away_All_Stars', 'Home_Market_Size', 'Away_Market_Size', 'Home_Championships', 'Away_Championships', 'Home_Playoffs', 'Away_Playoffs', 'Home_Twitter', 'Away_Twitter', 'Home_Win_Pct', 'Away_Win_Pct']
[ 0.01136299  0.00765029  0.1533608   0.24872643  0.03476273  0.02926996
  0.01881039  0.02741924  0.01881583  0.02007946  0.09460584  0.08445324
  0.12585543  0.12482736]
[ 10898.58982599  14809.10302813  13575.48324672] [15158, 12349, 17122]
0.505964676605
0.531104385885
[ 9587 10052 11563] [15158, 12349, 17122]
0.588333110667
0.139321413023
[ 14615.27408585  14617.83495799  14615.32939243] [15158, 12349, 17122]
0.740152655462
-0.00276369397916


In [None]:
new_df = test_df

# new_df.to_csv('test_set_Columbia.csv')
