In [1]:
import ruamel.yaml as yaml
import os
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

original_wd = os.getcwd()

# Number of times to move back in directory
num_retries = 10
for x in range(0, num_retries):
    # try to load config file
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
    # If not found move back one directory level
    except FileNotFoundError:
        os.chdir('../')
        # If reached the max number of directory levels change to original wd and print error msg
        if x+1 == num_retries:
            os.chdir(original_wd)
            print(NO_CONFIG_ERR_MSG)

# Add directory to PATH
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)




## Load and Process Data

In [2]:
df = pd.read_csv('data/processed/all_game_data.csv')

One Hot Encode the season

In [3]:
pd.options.display.max_columns=150
df.head()

Unnamed: 0,Game_id,season_id,Season,Date,day_of_week,Team_id,Team_Name,Team_Short,Home_or_Away,Ticket_Value,TV_Rate,market_size,Salaries,Wins_Entering_Gm,Losses_Entering_Gm,WIN_Perc,superteam_flg,player1,player1_minutes,player1_WINS,player1_RPM,player1_DRPM,player1_ORPM,player1_WINS_portion,player1_RPM_portion,player2,player2_minutes,player2_WINS,player2_RPM,player2_DRPM,player2_ORPM,player2_WINS_portion,player2_RPM_portion,player3,player3_minutes,player3_WINS,player3_RPM,player3_DRPM,player3_ORPM,player3_WINS_portion,player3_RPM_portion,player4,player4_minutes,player4_WINS,player4_RPM,player4_DRPM,player4_ORPM,player4_WINS_portion,player4_RPM_portion,player5,player5_minutes,player5_WINS,player5_RPM,player5_DRPM,player5_ORPM,player5_WINS_portion,player5_RPM_portion,player6,player6_minutes,player6_WINS,player6_RPM,player6_DRPM,player6_ORPM,player6_WINS_portion,player6_RPM_portion,player7,player7_minutes,player7_WINS,player7_RPM,player7_DRPM,player7_ORPM,player7_WINS_portion,player7_RPM_portion,player8,player8_minutes,player8_WINS,player8_RPM,player8_DRPM,player8_ORPM,player8_WINS_portion,player8_RPM_portion,player9,player9_minutes,player9_WINS,player9_RPM,player9_DRPM,player9_ORPM,player9_WINS_portion,player9_RPM_portion,player10,player10_minutes,player10_WINS,player10_RPM,player10_DRPM,player10_ORPM,player10_WINS_portion,player10_RPM_portion,player11,player11_minutes,player11_WINS,player11_RPM,player11_DRPM,player11_ORPM,player11_WINS_portion,player11_RPM_portion,player12,player12_minutes,player12_WINS,player12_RPM,player12_DRPM,player12_ORPM,player12_WINS_portion,player12_RPM_portion,player13,player13_minutes,player13_WINS,player13_RPM,player13_DRPM,player13_ORPM,player13_WINS_portion,player13_RPM_portion,Tot_WINS,Tot_RPM
0,21400001,2014,2014-15,10/28/2014,Tue,1610612740,New Orleans,NOP,Home,0.69867,1.594693,0.64,80012866,,,,0,203076,36,15.86,8.18,4.2,3.98,0.001836,0.000947,201936,35,9.02,3.15,-0.65,3.8,0.001074,0.000375,201600,34,1.41,-1.18,1.83,-3.01,0.000173,-0.000145,201569,32,1.17,-1.4,-1.88,0.48,0.000152,-0.000182,201950,27,4.52,3.22,0.77,2.45,0.000698,0.000497,201583,22,0.66,-1.7,-3.75,2.05,0.000125,-0.000322,203085.0,20.0,-1.64,-4.05,-1.61,-2.44,-0.000342,-0.000844,202690.0,18.0,-0.15,-2.83,-1.95,-0.88,-3.5e-05,-0.000655,2422.0,10.0,-0.25,-3.88,-1.33,-2.55,-0.000104,-0.001617,201582.0,2.0,1.27,-0.11,0.9,-1.01,0.002646,-0.000229,202337.0,2.0,-0.59,-3.51,-1.85,-1.66,-0.001229,-0.007313,203481.0,2.0,0.13,-1.53,-0.7,-0.83,0.000271,-0.003187,,,,,,,,,0.005264,-0.012675
1,21400002,2014,2014-15,10/28/2014,Tue,1610612759,San Antonio,SAS,Home,0.739665,7.326316,0.94,70770209,,,,0,2225,35,-0.98,-3.15,-3.33,0.18,-0.000117,-0.000375,201980,34,11.0,5.41,2.16,3.25,0.001348,0.000663,2564,33,2.93,0.06,0.26,-0.2,0.00037,8e-06,201158,31,-0.66,-3.12,-3.12,0.0,-8.9e-05,-0.000419,1495,30,10.33,5.2,4.67,0.53,0.001435,0.000722,1938,28,5.51,3.28,1.75,1.53,0.00082,0.000488,203382.0,18.0,1.75,0.12,1.25,-1.13,0.000405,2.8e-05,202709.0,17.0,2.83,0.88,0.93,-0.05,0.000694,0.000216,2588.0,15.0,-0.94,-3.97,-2.52,-1.45,-0.000261,-0.001103,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.004605,0.000227
2,21400003,2014,2014-15,10/28/2014,Tue,1610612747,L.A. Lakers,LAL,Home,0.723345,7.733333,5.48,77546500,,,,0,977,29,0.15,-2.15,-3.06,0.91,2.2e-05,-0.000309,202391,29,4.75,1.66,0.27,1.39,0.000682,0.000239,2430,28,-1.82,-4.1,-2.36,-1.74,-0.000271,-0.00061,101179,27,1.78,0.56,0.39,0.17,0.000275,8.6e-05,201941,23,-2.34,-4.37,-1.42,-2.95,-0.000424,-0.000792,202325,23,2.12,-0.8,-0.6,-0.2,0.000384,-0.000145,202334.0,22.0,1.35,-1.15,-0.89,-0.26,0.000256,-0.000218,203903.0,20.0,-0.01,-2.36,-3.0,0.64,-2e-06,-0.000492,202333.0,18.0,0.04,-1.52,-1.19,-0.33,9e-06,-0.000352,203944.0,14.0,0.0,-1.71,-0.19,-1.52,0.0,-0.000509,203135.0,7.0,0.93,-1.02,1.59,-2.61,0.000554,-0.000607,,,,,,,,,,,,,,,,,0.001484,-0.003708
3,21400004,2014,2014-15,10/29/2014,Wed,1610612766,Charlotte,CHA,Home,0.641699,3.287069,1.19,66792937,,,,0,2744,40,1.75,-0.91,0.56,-1.47,0.000182,-9.5e-05,202362,40,,,,,,,202689,39,1.69,-1.03,-2.0,0.97,0.000181,-0.00011,203077,34,5.07,2.81,3.63,-0.82,0.000621,0.000344,101107,33,3.48,0.43,1.03,-0.6,0.000439,5.4e-05,202390,24,-0.97,-3.68,-2.51,-1.17,-0.000168,-0.000639,203469.0,20.0,6.08,4.35,3.23,1.12,0.001267,0.000906,203148.0,14.0,-0.53,-3.0,-2.36,-0.64,-0.000158,-0.000893,101131.0,13.0,-0.75,-3.72,0.01,-3.73,-0.00024,-0.001192,201945.0,9.0,3.84,0.36,1.01,-0.65,0.001778,0.000167,,,,,,,,,,,,,,,,,,,,,,,,,0.003901,-0.001457
4,21400005,2014,2014-15,10/29/2014,Wed,1610612754,Indiana,IND,Home,0.591983,3.367151,1.09,74793526,,,,0,202388,34,-0.03,-2.39,-1.28,-1.11,-4e-06,-0.000293,101139,33,4.54,1.58,1.17,0.41,0.000573,0.000199,201579,32,2.73,-0.01,2.43,-2.44,0.000355,-1e-06,203142,26,-0.18,-2.71,-1.77,-0.94,-2.9e-05,-0.000434,2449,25,3.03,0.57,0.87,-0.3,0.000505,9.5e-05,203524,23,0.45,-2.04,-1.36,-0.68,8.1e-05,-0.00037,202730.0,20.0,2.88,2.04,1.62,0.42,0.0006,0.000425,101133.0,16.0,1.13,-0.75,2.47,-3.22,0.000294,-0.000195,201155.0,16.0,2.31,-0.34,-0.15,-0.19,0.000602,-8.8e-05,204014.0,15.0,-0.68,-3.4,-2.09,-1.31,-0.000189,-0.000944,,,,,,,,,,,,,,,,,,,,,,,,,0.00279,-0.001607


In [4]:
id_cols = ['Game_id',
           'season_id',
           'Team_id',
           'Season',
           'Date',
           'Team_Short'
          ]
id_cols += df.filter(regex='player\d$').columns.values.tolist()

# Remove any player after 8th most played player
remove_features = df.iloc[:2, df.columns.get_loc('player9'):].columns.values.tolist()
# Keep Total RPM and Total Wins
remove_features = [feat for feat in remove_features if feat not in ['Tot_WINS', 'Tot_RPM']]
# Remove One Hot Encode Cols
remove_features += ['Team_Name',
           'Home_or_Away', 
                   'day_of_week']

## One hot encode and set X and y

In [5]:
## Cleaning Nulls

In [6]:
possible_feats = df[[col for col in df.columns.values.tolist() if col not in remove_features]]
possible_feats.shape

(9840, 80)

## Since the Wins entering game likely has quite a bit to do with TV rating and ticket price, NaNs will be dropped.
## Possibly will do two models, one with and one without to see which is the beter performing since this drops half the data
## Player stats will be mean imputed since there's not a huge number of nulls

In [7]:
from src.utils.pandas_utils import null_column_report_df
from src.utils.custom_transformers import DFImputer
null_column_report_df(possible_feats)

Number of columns with null values:
57



Unnamed: 0,Column,Percent_Null_Records,Total_Null_Records
0,Season,0.0,34
1,Ticket_Value,0.01,76
2,Wins_Entering_Gm,0.52,5084
3,Losses_Entering_Gm,0.52,5084
4,WIN_Perc,0.52,5142
5,player1_WINS,0.05,455
6,player1_RPM,0.05,455
7,player1_DRPM,0.05,455
8,player1_ORPM,0.05,455
9,player1_WINS_portion,0.05,455


## Dropping Nulls

In [8]:
drop_cols = ['Season', 'day_of_week', 'Ticket_Value', 'Wins_Entering_Gm', 'Losses_Entering_Gm', 'WIN_Perc']
drop_cols_keep_win_data = ['Season', 'day_of_week', 'Ticket_Value']
drop_cols_win_data_cols = ['Wins_Entering_Gm', 'Losses_Entering_Gm', 'WIN_Perc']
df_dropped_null_wins_data = df.dropna(subset=drop_cols)
df_kept_null_wins_data = df.dropna(subset=drop_cols_keep_win_data)

## Mean Imputing

In [9]:
mean_impute_cols = [col for col 
                    in null_column_report_df(possible_feats).Column.values.tolist() 
                    if col not in drop_cols]
print(mean_impute_cols[:3])
df_kept_null_wins_data[mean_impute_cols] = DFImputer().fit_transform(df_kept_null_wins_data[mean_impute_cols])
df_dropped_null_wins_data[mean_impute_cols] = DFImputer().fit_transform(df_dropped_null_wins_data[mean_impute_cols])

Number of columns with null values:
57

['player1_WINS', 'player1_RPM', 'player1_DRPM']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


## One Hot Categoric Data

In [10]:
target_cols = [
    'Ticket_Value', 'TV_Rate'
]

def one_hot_df(games_df):
    from functools import reduce
    season_one_hot = pd.get_dummies(df['Season']).add_prefix('Season_')
    day_of_week_one_hot = pd.get_dummies(df['day_of_week']).add_prefix('Day_Of_Week')
    team_one_hot = pd.get_dummies(df['Team_Name']).add_prefix('Team_')
    frames = [season_one_hot, day_of_week_one_hot, team_one_hot] # add arguments as necessary to the read_csv method
    one_hot_features_df = reduce(lambda left,right: pd.merge(left,right,
                                                             left_index=True,
                                                            right_index=True), 
                                 frames)

    return one_hot_features_df

## Concat_together

In [11]:
features = [col for col in df.columns.values.tolist() if col not in target_cols+id_cols+remove_features]

X = pd.merge(one_hot_df(df_dropped_null_wins_data),
             df_dropped_null_wins_data[features],
             left_index=True,
             right_index=True)
Y_Tickets = df_dropped_null_wins_data['Ticket_Value']
Y_TV = df_dropped_null_wins_data['TV_Rate']

X_kept_win_data = pd.merge(one_hot_df(df_kept_null_wins_data), 
                          df_kept_null_wins_data[features],
                         left_index=True,
                         right_index=True)
X_kept_win_data = X_kept_win_data.drop(columns=drop_cols_win_data_cols)
Y_kept_win_data_Tickets = df_kept_null_wins_data['Ticket_Value']
Y_kept_win_data_TV = df_kept_null_wins_data['TV_Rate']

## Sanity Check

In [12]:
null_column_report_df(X)

Number of columns with null values:
0



In [13]:
X.shape

(4660, 105)

In [14]:
null_column_report_df(X_kept_win_data)

Number of columns with null values:
0



In [15]:
X_kept_win_data.shape

(9738, 102)

## Export Data

In [16]:
pd.concat([X, Y_Tickets, Y_TV], axis=1).to_csv('data/processed/modelling_data_dropped_null_wins.csv', index=False)
pd.concat([X_kept_win_data, Y_kept_win_data_Tickets, Y_kept_win_data_TV], axis=1).to_csv('data/processed/modelling_data.csv', index=False)

## CV Score with sklearn's GradientBoostingRegressor

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
gbr = GradientBoostingRegressor()
gbr_tv = GradientBoostingRegressor()

  from numpy.core.umath_tests import inner1d


In [18]:
# %%time
# cv_score_tickets = cross_val_score(gbr, X=X, y=Y_Tickets, cv=10,
#                                                 scoring='neg_mean_squared_error')                                   
# cv_score_tv = cross_val_score(gbr_tv, X=X, y=Y_TV, cv=10,
#                                                 scoring='neg_mean_squared_error')
                              

In [19]:
# import math
# math.sqrt(abs(cv_score_tickets.mean()))

In [20]:
# math.sqrt(abs(cv_score_tv.mean()))

In [21]:
from sklearn.model_selection import RandomizedSearchCV
gbr_tv= GradientBoostingRegressor()

In [22]:

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {
        'n_estimators': [5,10,100,500,1000],
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': list(range(1, 11)),
        'min_samples_split': list(range(2, 21)),
        'min_samples_leaf': list(range(1, 21)),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05),
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
    }


clf = RandomizedSearchCV(gbr, parameters, n_jobs=6, 
                   cv=5, verbose=2, refit=True)

In [23]:
%%time
clf.fit(X, Y_Tickets)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  4.8min finished


Wall time: 4min 50s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=6,
          param_distributions={'n_estimators': [5, 10, 100, 500, 1000], 'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'min_sampl...0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},
     

In [24]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# test_probs = clf.predict_proba(test[features])[:,1]

# sample = pd.read_csv('../input/sample_submission.csv')
# sample.QuoteConversion_Flag = test_probs
# sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

Score: -2.8775572182385383
alpha: 0.95
learning_rate: 0.001
loss: 'ls'
max_depth: 10
max_features: 0.9000000000000001
min_samples_leaf: 5
min_samples_split: 11
n_estimators: 5
subsample: 0.35000000000000003




In [25]:
best_parameters

{'subsample': 0.35000000000000003,
 'n_estimators': 5,
 'min_samples_split': 11,
 'min_samples_leaf': 5,
 'max_features': 0.9000000000000001,
 'max_depth': 10,
 'loss': 'ls',
 'learning_rate': 0.001,
 'alpha': 0.95}

In [26]:
import math
best_score = math.sqrt(abs(score.mean()))
print(f'Best RMSE: {best_score}')

Best RMSE: 1.6963364106917407


## Repeating but with Ticket Info

In [28]:
#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
tv_parameters =  {
        'n_estimators': [5,10,100,500,1000],
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05),
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
    }


tv_clf = RandomizedSearchCV(gbr_tv, tv_parameters, n_jobs=6, 
                   cv=5, verbose=2, refit=True)

In [29]:
%%time
tv_clf.fit(X, Y_TV)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   29.1s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:   56.3s finished


Wall time: 57 s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=6,
          param_distributions={'n_estimators': [5, 10, 100, 500, 1000], 'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'subsample': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0...0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},
     

In [30]:
#trust your CV!
tv_best_parameters, tv_score, _ = max(tv_clf.grid_scores_, key=lambda x: x[1])
print('Score:', tv_score)
for param_name in sorted(tv_best_parameters.keys()):
    print("%s: %r" % (param_name, tv_best_parameters[param_name]))

# test_probs = clf.predict_proba(test[features])[:,1]

# sample = pd.read_csv('../input/sample_submission.csv')
# sample.QuoteConversion_Flag = test_probs
# sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

Score: 0.6779016120496512
alpha: 0.85
learning_rate: 0.5
loss: 'huber'
max_depth: 6
max_features: 0.45
min_samples_leaf: 11
min_samples_split: 16
n_estimators: 5
subsample: 0.8500000000000001




In [31]:
tv_best_parameters

{'subsample': 0.8500000000000001,
 'n_estimators': 5,
 'min_samples_split': 16,
 'min_samples_leaf': 11,
 'max_features': 0.45,
 'max_depth': 6,
 'loss': 'huber',
 'learning_rate': 0.5,
 'alpha': 0.85}

In [32]:
import math
best_score = math.sqrt(abs(tv_score.mean()))
print(f'Best RMSE: {best_score}')

Best RMSE: 0.8233478074602805
