In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
def fit_gb_model(df, X_train, y_train, X_test, y_test, mask_test):
    """Grid-search a gradient boosting regressor and print a test-set report.

    Runs ``GridSearchCV`` (scored on negative mean squared error) over a
    fixed hyper-parameter grid, prints the best parameters and the feature
    importances of the best estimator, then predicts on ``X_test``.
    Predictions, actual ``wkts`` and the ``year1_wkts_pm`` column (used as
    the baseline) are summed per player before explained variance and mean
    squared error are printed for the model and for the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; must contain the ``player``, ``wkts`` and
        ``year1_wkts_pm`` columns.
    X_train : pandas.DataFrame
        Training features; its column labels name the feature importances.
    y_train : array-like
        Training target.
    X_test : pandas.DataFrame
        Test features. Predictions are matched to ``df`` rows by position,
        so the rows must be in the same order as the rows ``mask_test``
        selects from ``df``.
    y_test : array-like
        Accepted for call compatibility but not used; the actual values are
        read from ``df['wkts']`` instead.
    mask_test : boolean mask over ``df``
        Selects the test rows of ``df``.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows
        selected by ``mask_test``.
    """
    print ("**** GRADIENT BOOSTING Grid Search ****")

    # "One third of the features" is tried as an explicit max_features value.
    # Clamp it to 1: an integer max_features below 1 is rejected by
    # scikit-learn, which would happen for a single-column X_train.
    third_of_features = max(1, round(X_train.shape[1] / 3))
    gradient_boosting_grid = {'max_depth': [3, None],
                              'max_features': ['sqrt', 'log2', third_of_features, None],
                              'n_estimators': [100,300,500],
                              'learning_rate': [0.1,0.05,0.01],
                              'subsample': [0.5,1.0],
                              'random_state': [10]}

    gb_gridsearch = GridSearchCV(GradientBoostingRegressor(),
                                 gradient_boosting_grid,
                                 verbose=1,
                                 scoring='neg_mean_squared_error')
    gb_gridsearch.fit(X_train, y_train)
    print("Best Parameters:", gb_gridsearch.best_params_)
    print(' ')

    best_gb_model = gb_gridsearch.best_estimator_

    # Most important feature first; the sort is stable, so ties keep the
    # column order of X_train.
    print("Sorted Feature Importance:")
    ranked_importances = sorted(zip(X_train.columns, best_gb_model.feature_importances_),
                                key=lambda pair: -pair[1])
    for entry in ranked_importances:
        print(entry)

    y_pred_test = best_gb_model.predict(X_test)

    # Line predictions up with the test rows by position: the original index
    # is discarded so the prediction array can be attached as a new column.
    df_test = (df.loc[mask_test, ['player', 'wkts', 'year1_wkts_pm']]
               .reset_index(drop=True)
               .rename(columns={'year1_wkts_pm': 'wkts_baseline'})
               .assign(wkts_exp=y_pred_test))

    # Metrics are computed on per-player totals, not on individual rows.
    df_by_player = df_test.groupby('player').sum()

    print(' ')
    print('Explained Variance (GB model): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Explained Variance (Baseline): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print('Mean Squared Error (GB model): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Mean Squared Error (Baseline): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print(' ')

In [3]:
def fitting_gb_models(file, test_yr):
    """Fit and report gradient boosting models for four nested feature sets.

    Reads ``file`` as CSV, holds out the rows whose ``year`` equals
    ``test_yr`` as the test set and trains on the six preceding years
    (``test_yr - 6`` through ``test_yr - 1``). For each feature set (FULL,
    SMALL, SMALLER, SMALLEST) a banner is printed and ``fit_gb_model`` is
    called with the ``wkts`` column as the target.

    Parameters
    ----------
    file : str or path-like
        CSV file passed to ``pandas.read_csv``; must contain ``year``,
        ``wkts`` and every feature column listed below.
    test_yr : int
        Year used as the test set.

    Returns
    -------
    None
        All results are printed by ``fit_gb_model``.

    Raises
    ------
    ValueError
        If the file has no rows for the test year or for the training
        window. Raised before any grid search starts so a bad year fails
        immediately instead of after a long fit.
    """
    df = pd.read_csv(file)

    mask_test = (df.year == test_yr)
    mask_train = (df.year >= test_yr-6) & (df.year <= test_yr-1)

    if not mask_test.any():
        raise ValueError("no rows with year == %s in %s" % (test_yr, file))
    if not mask_train.any():
        raise ValueError("no training rows for years %s-%s in %s"
                         % (test_yr-6, test_yr-1, file))

    target = 'wkts'

    # Building blocks of the feature sets. Column order is kept exactly as
    # in the individual lists these replace, since it determines the order
    # of the features seen by the model.
    matches_played = ['year1_mtchs_pld', 'year2_mtchs_pld', 'year3_mtchs_pld', 'year4_mtchs_pld', 'year5_mtchs_pld']
    wkts_per_match = ['year1_wkts_pm', 'year2_wkts_pm', 'year3_wkts_pm', 'year4_wkts_pm', 'year5_wkts_pm']
    opposition = ['bowler_agnst_oppo', 'oppo_agnst_bowl_typ']
    venue = ['bowl_home_adv', 'ground_bowl_typ']

    # (banner label, feature columns), from the widest set to the narrowest.
    feature_sets = [
        ('FULL', matches_played + wkts_per_match + opposition + venue),
        ('SMALL', wkts_per_match + opposition + venue),
        ('SMALLER', wkts_per_match + venue),
        ('SMALLEST', wkts_per_match),
    ]

    y_train = df.loc[mask_train, target]
    y_test = df.loc[mask_test, target]

    for label, features in feature_sets:
        title = "**** RUNNING MODELS FOR %s FEATURE SET ****" % label
        banner = "*" * len(title)  # rule lines are as wide as the title
        print(banner)
        print(title)
        print(banner)

        X_train = df.loc[mask_train, features]
        X_test = df.loc[mask_test, features]

        fit_gb_model(df, X_train, y_train, X_test, y_test, mask_test)


In [4]:
# CSV data set read by this and every later evaluation cell.
input_file = 'data/bowling_data_enhanced.csv'
# Evaluate all four feature sets with 2011 held out as the test year.
fitting_gb_models(input_file, test_yr=2011)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.4min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': None, 'n_estimators': 100, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.32176410210147638)
('year3_wkts_pm', 0.11941823460148186)
('year2_wkts_pm', 0.10672189024360308)
('ground_bowl_typ', 0.098448041103819786)
('year4_wkts_pm', 0.09440449571877052)
('bowl_home_adv', 0.067639402537492177)
('oppo_agnst_bowl_typ', 0.050116291857822175)
('year1_mtchs_pld', 0.043014130070802452)
('bowler_agnst_oppo', 0.029394859386856775)
('year5_wkts_pm', 0.017764963441717928)
('year2_mtchs_pld', 0.016109324098261556)
('year4_mtchs_pld', 0.015198337067176158)
('year5_mtchs_pld', 0.010950521320728103)
('year3_mtchs_pld', 0.0090554064499909562)
 
Explained Variance (GB model): 0.701626659209
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (GB model): 32.5924198761
Mean Squared Error (Baseline): 98.4604956691
----
 
**********************************************
**** RUNNIN

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.6min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': None, 'n_estimators': 100, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.34551441682816597)
('year3_wkts_pm', 0.12666815437498044)
('ground_bowl_typ', 0.11978086938313692)
('year2_wkts_pm', 0.11452893868501456)
('year4_wkts_pm', 0.10008800656062998)
('bowl_home_adv', 0.078291985375918319)
('oppo_agnst_bowl_typ', 0.049735815165047575)
('bowler_agnst_oppo', 0.040494446364356479)
('year5_wkts_pm', 0.024897367262749857)
 
Explained Variance (GB model): 0.70693336315
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (GB model): 31.7860121254
Mean Squared Error (Baseline): 98.4604956691
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.0min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.22316102034368324)
('ground_bowl_typ', 0.17525557966678768)
('year3_wkts_pm', 0.13441313769981791)
('bowl_home_adv', 0.12894090149135051)
('year2_wkts_pm', 0.12869562070666229)
('year4_wkts_pm', 0.12554356573052666)
('year5_wkts_pm', 0.083990174361171469)
 
Explained Variance (GB model): 0.696227782705
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (GB model): 33.5297164056
Mean Squared Error (Baseline): 98.4604956691
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': None, 'n_estimators': 100, 'rando

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.8min finished


In [5]:
# Same evaluation with 2012 held out as the test year.
fitting_gb_models(input_file, test_yr=2012)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.2min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.18083474577364722)
('year2_wkts_pm', 0.13378519662092669)
('ground_bowl_typ', 0.12079001287208536)
('year3_wkts_pm', 0.10405675840418278)
('year4_wkts_pm', 0.081888114359831496)
('bowl_home_adv', 0.073714229073621448)
('bowler_agnst_oppo', 0.056256308604306172)
('oppo_agnst_bowl_typ', 0.046365585796425003)
('year5_wkts_pm', 0.042305232332310275)
('year5_mtchs_pld', 0.040972854270818058)
('year1_mtchs_pld', 0.038782740552872078)
('year4_mtchs_pld', 0.029105330315106783)
('year2_mtchs_pld', 0.027900066219275318)
('year3_mtchs_pld', 0.023242824804591375)
 
Explained Variance (GB model): 0.747907486054
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (GB model): 42.7058383058
Mean Squared Error (Baseline): 95.0030779207
----
 
**********************************************
**** RUNNI

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.4min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.21943354969184437)
('ground_bowl_typ', 0.17076998986289119)
('year2_wkts_pm', 0.16738389662048664)
('year3_wkts_pm', 0.097082550332720605)
('year4_wkts_pm', 0.089192052823753448)
('bowl_home_adv', 0.0880965205131444)
('bowler_agnst_oppo', 0.068490619542051281)
('oppo_agnst_bowl_typ', 0.059309542718035915)
('year5_wkts_pm', 0.040241277895072175)
 
Explained Variance (GB model): 0.750494653793
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (GB model): 42.1382126345
Mean Squared Error (Baseline): 95.0030779207
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  2.9min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.22115745820152985)
('year2_wkts_pm', 0.1793531325310517)
('ground_bowl_typ', 0.17712902961629592)
('year3_wkts_pm', 0.13482763061374592)
('bowl_home_adv', 0.11766618906479932)
('year4_wkts_pm', 0.10800331792161133)
('year5_wkts_pm', 0.061863242050965696)
 
Explained Variance (GB model): 0.748605780824
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (GB model): 42.4705401272
Mean Squared Error (Baseline): 95.0030779207
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'rand

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.8min finished


In [6]:
# Same evaluation with 2013 held out as the test year.
fitting_gb_models(input_file, test_yr=2013)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.3min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 5, 'n_estimators': 100, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.26203265908532303)
('year2_wkts_pm', 0.1865739954129518)
('ground_bowl_typ', 0.12248196133179125)
('year3_wkts_pm', 0.10751875676697742)
('year4_wkts_pm', 0.05687020936777764)
('bowl_home_adv', 0.053808782181693375)
('year2_mtchs_pld', 0.044417580152885536)
('year4_mtchs_pld', 0.038110736970471647)
('year1_mtchs_pld', 0.037325844557336667)
('oppo_agnst_bowl_typ', 0.027010357304129758)
('year3_mtchs_pld', 0.022742022879422681)
('bowler_agnst_oppo', 0.020803328807403619)
('year5_wkts_pm', 0.01120606213970732)
('year5_mtchs_pld', 0.0090977030421283672)
 
Explained Variance (GB model): 0.8507287159
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (GB model): 28.6169957505
Mean Squared Error (Baseline): 52.8757511112
----
 
**********************************************
**** RUNNING MODELS

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.5min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.2927954508263067)
('year2_wkts_pm', 0.19388890614473286)
('ground_bowl_typ', 0.13730242374653656)
('year3_wkts_pm', 0.12423732217660842)
('year4_wkts_pm', 0.07378731330612362)
('bowl_home_adv', 0.071254406330485506)
('oppo_agnst_bowl_typ', 0.045085326142434205)
('year5_wkts_pm', 0.034093020249190122)
('bowler_agnst_oppo', 0.02755583107758186)
 
Explained Variance (GB model): 0.850347138731
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (GB model): 28.7085867654
Mean Squared Error (Baseline): 52.8757511112
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  2.9min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.26828890281711004)
('year2_wkts_pm', 0.20900057284847967)
('ground_bowl_typ', 0.16073639450070337)
('year3_wkts_pm', 0.1372322255523058)
('year4_wkts_pm', 0.0944360591327216)
('bowl_home_adv', 0.08762545993297001)
('year5_wkts_pm', 0.042680385215709611)
 
Explained Variance (GB model): 0.85007772308
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (GB model): 28.7405505178
Mean Squared Error (Baseline): 52.8757511112
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100, 'random

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.8min finished


In [7]:
# Same evaluation with 2014 held out as the test year.
fitting_gb_models(input_file, test_yr=2014)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.8min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': None, 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.21060028948592863)
('ground_bowl_typ', 0.17670424062472981)
('year2_wkts_pm', 0.11485940081946422)
('bowler_agnst_oppo', 0.11149681520058441)
('bowl_home_adv', 0.066230077426190359)
('year4_wkts_pm', 0.060602846350996678)
('year1_mtchs_pld', 0.054169873815558357)
('year5_mtchs_pld', 0.043553136409471151)
('oppo_agnst_bowl_typ', 0.039872502425078227)
('year2_mtchs_pld', 0.036811014261812242)
('year5_wkts_pm', 0.031901179987695372)
('year3_mtchs_pld', 0.025843145993818248)
('year3_wkts_pm', 0.023291378153189442)
('year4_mtchs_pld', 0.004064099045482902)
 
Explained Variance (GB model): 0.773996702567
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (GB model): 23.0432481323
Mean Squared Error (Baseline): 69.4439037926
----
 
**********************************************
**** RUNNING

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.8min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.31117925565361171)
('year2_wkts_pm', 0.23137615693427044)
('ground_bowl_typ', 0.17040970454378063)
('year3_wkts_pm', 0.078866888008671698)
('bowl_home_adv', 0.061415717499004673)
('year4_wkts_pm', 0.061326429017417687)
('bowler_agnst_oppo', 0.036273736765729439)
('oppo_agnst_bowl_typ', 0.025212851958424376)
('year5_wkts_pm', 0.023939259619089482)
 
Explained Variance (GB model): 0.785285336418
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (GB model): 21.6067243231
Mean Squared Error (Baseline): 69.4439037926
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.2min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.25040824521138016)
('ground_bowl_typ', 0.22377376676746774)
('year2_wkts_pm', 0.19491503631981941)
('bowl_home_adv', 0.096349743635753604)
('year3_wkts_pm', 0.091521756531363893)
('year4_wkts_pm', 0.08565686008641113)
('year5_wkts_pm', 0.057374591447803977)
 
Explained Variance (GB model): 0.774261240283
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (GB model): 22.7745500148
Mean Squared Error (Baseline): 69.4439037926
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'r

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.9min finished


In [8]:
# Same evaluation with 2015 held out as the test year.
fitting_gb_models(input_file, test_yr=2015)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.3min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 5, 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.18192696555475227)
('year1_wkts_pm', 0.15159052928218245)
('year2_wkts_pm', 0.11060300510364143)
('oppo_agnst_bowl_typ', 0.078118141722242532)
('bowl_home_adv', 0.076103983785557369)
('bowler_agnst_oppo', 0.073412765128638549)
('year3_wkts_pm', 0.059361402263461871)
('year1_mtchs_pld', 0.043647531820196254)
('year4_wkts_pm', 0.04259149681289915)
('year5_mtchs_pld', 0.042337716274796461)
('year3_mtchs_pld', 0.036792030280743702)
('year4_mtchs_pld', 0.035909820854499556)
('year5_wkts_pm', 0.035365087145413469)
('year2_mtchs_pld', 0.032239523970974959)
 
Explained Variance (GB model): 0.79784996947
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (GB model): 33.0847358694
Mean Squared Error (Baseline): 123.171196479
----
 
**********************************************
**** RUNNING MOD

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.4min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.21619821509087531)
('year1_wkts_pm', 0.16863787214441239)
('year2_wkts_pm', 0.13423997639824506)
('bowl_home_adv', 0.10873906103218209)
('oppo_agnst_bowl_typ', 0.0944400622008119)
('bowler_agnst_oppo', 0.084753968911584945)
('year3_wkts_pm', 0.078923576689089295)
('year4_wkts_pm', 0.058791094789746826)
('year5_wkts_pm', 0.055276172743052306)
 
Explained Variance (GB model): 0.808931775951
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (GB model): 31.315471613
Mean Squared Error (Baseline): 123.171196479
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  2.9min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.22920426735528215)
('year1_wkts_pm', 0.20529252727633576)
('year2_wkts_pm', 0.16451036644992473)
('bowl_home_adv', 0.1336602818930597)
('year3_wkts_pm', 0.11803115843017396)
('year4_wkts_pm', 0.078251530525975402)
('year5_wkts_pm', 0.071049868069248207)
 
Explained Variance (GB model): 0.819785170646
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (GB model): 29.4710543083
Mean Squared Error (Baseline): 123.171196479
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'ran

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.8min finished


In [9]:
# Same evaluation with 2016 held out as the test year.
fitting_gb_models(input_file, test_yr=2016)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  4.3min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 5, 'n_estimators': 300, 'random_state': 10, 'subsample': 1.0}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.23332147269475126)
('year1_wkts_pm', 0.1503192872484069)
('year2_wkts_pm', 0.14300000126354562)
('bowl_home_adv', 0.088654896809962497)
('year3_wkts_pm', 0.078218717625581191)
('year4_wkts_pm', 0.059326453400317072)
('bowler_agnst_oppo', 0.058882856624906166)
('year1_mtchs_pld', 0.050398024837073369)
('oppo_agnst_bowl_typ', 0.038004424108171023)
('year3_mtchs_pld', 0.029414667906748199)
('year5_mtchs_pld', 0.022049026314974941)
('year2_mtchs_pld', 0.020950243884001366)
('year5_wkts_pm', 0.020863258155899674)
('year4_mtchs_pld', 0.0065966691256604841)
 
Explained Variance (GB model): 0.719994152868
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (GB model): 47.0016437724
Mean Squared Error (Baseline): 64.357582933
----
 
**********************************************
**** RUNNING MO

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.5min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.21669812826112883)
('year1_wkts_pm', 0.14822826351069998)
('year2_wkts_pm', 0.13754184109145332)
('bowl_home_adv', 0.097169642996359051)
('year3_wkts_pm', 0.096774160171517548)
('bowler_agnst_oppo', 0.091817514162144079)
('year4_wkts_pm', 0.080116441752603046)
('oppo_agnst_bowl_typ', 0.079870906024540569)
('year5_wkts_pm', 0.051783102029553746)
 
Explained Variance (GB model): 0.751423141191
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (GB model): 41.8358495546
Mean Squared Error (Baseline): 64.357582933
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  3.0min finished


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 10, 'subsample': 0.5}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.23665853240394591)
('year1_wkts_pm', 0.17270876189368317)
('year2_wkts_pm', 0.16644063632587991)
('bowl_home_adv', 0.13178596674895374)
('year3_wkts_pm', 0.12992751044112841)
('year4_wkts_pm', 0.097764130448528025)
('year5_wkts_pm', 0.064714461737880841)
 
Explained Variance (GB model): 0.754879234626
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (GB model): 41.211564115
Mean Squared Error (Baseline): 64.357582933
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** GRADIENT BOOSTING Grid Search ****
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 300, 'rand

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  1.8min finished
