In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

%matplotlib inline

  from pandas.core import datetools


In [2]:
def fit_lr_model(df, X_train, y_train, X_test, y_test, mask_test):
    """Fit an OLS model with an intercept and print per-player test metrics.

    The model is fitted on ``(X_train, y_train)`` and its summary is printed.
    Predictions for ``X_test`` are attached to the test rows of ``df``, all
    three quantities (actual ``wkts``, the ``year1_wkts_pm`` baseline and the
    model prediction) are summed per ``player``, and explained variance and
    mean squared error are printed for the model and for the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; must contain ``player``, ``wkts`` and ``year1_wkts_pm``.
    X_train, y_train : training features and target.
    X_test : pandas.DataFrame
        Test features. Must be the rows of ``df`` selected by ``mask_test``,
        in the same order, because predictions are attached by position.
    y_test : unused; kept so the signature mirrors ``fit_rf_model``.
    mask_test : boolean mask over ``df`` selecting the test rows.

    Returns
    -------
    None
        Results are printed only.
    """
    print("**** LINEAR REGRESSION ****")

    # has_constant='add' always appends the intercept column. The default
    # ('skip') silently omits it when some column is already constant, which
    # can leave the train and test design matrices with different widths and
    # make predict() fail.
    fit_lin = sm.OLS(y_train, sm.add_constant(X_train, has_constant='add')).fit()
    print(fit_lin.summary())

    y_pred_test = fit_lin.predict(sm.add_constant(X_test, has_constant='add'))

    # Attach predictions by position. np.asarray() drops any index carried by
    # the prediction so a length mismatch raises instead of NaN-padding.
    df_test = df.loc[mask_test, ['player', 'wkts', 'year1_wkts_pm']].reset_index(drop=True)
    df_test.columns = ['player', 'wkts', 'wkts_baseline']
    df_test['wkts_exp'] = np.asarray(y_pred_test)

    # Metrics are computed on per-player totals, not on individual rows.
    df_by_player = df_test.groupby('player').sum()

    print('Explained Variance (LR model): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Explained Variance (Baseline): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print('Mean Squared Error (LR model): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Mean Squared Error (Baseline): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print(' ')

In [3]:
def fit_rf_model(df, X_train, y_train, X_test, y_test, mask_test):
    """Grid-search a random forest regressor and print per-player test metrics.

    A ``GridSearchCV`` over ``RandomForestRegressor`` hyper-parameters is run
    on ``(X_train, y_train)`` using negative mean squared error as the score.
    The best parameters and the best estimator's feature importances (sorted
    in descending order) are printed. Predictions for ``X_test`` are attached
    to the test rows of ``df``, summed per ``player`` together with the actual
    ``wkts`` and the ``year1_wkts_pm`` baseline, and explained variance and
    mean squared error are printed for the model and for the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; must contain ``player``, ``wkts`` and ``year1_wkts_pm``.
    X_train : pandas.DataFrame
        Training features (``.columns`` is used to label the importances).
    y_train : training target.
    X_test : test features. Must be the rows of ``df`` selected by
        ``mask_test``, in the same order, because predictions are attached by
        position.
    y_test : unused; kept so the signature mirrors ``fit_lr_model``.
    mask_test : boolean mask over ``df`` selecting the test rows.

    Returns
    -------
    None
        Results are printed only.
    """
    print("**** RANDOM FOREST Grid Search ****")

    # One third of the feature count is tried as an explicit max_features
    # value. Clamp to 1: a single-feature matrix would otherwise yield 0,
    # which is not a valid max_features.
    third_of_features = max(1, round(X_train.shape[1] / 3))

    random_forest_grid = {'max_depth': [3, None],
                          'max_features': ['sqrt', 'log2', third_of_features, None],
                          'min_samples_split': [2, 4],
                          'min_samples_leaf': [1, 2, 4],
                          'bootstrap': [True, False],
                          'n_estimators': [100, 300, 500],
                          # Single-valued entry: fixes the seed for every candidate.
                          'random_state': [10]}

    rf_gridsearch = GridSearchCV(RandomForestRegressor(),
                                 random_forest_grid,
                                 n_jobs=-1,
                                 verbose=True,
                                 scoring='neg_mean_squared_error')
    rf_gridsearch.fit(X_train, y_train)
    print("Best Parameters:", rf_gridsearch.best_params_)
    print(' ')

    best_rf_model = rf_gridsearch.best_estimator_

    print("Sorted Feature Importance:")
    sorted_feature_imp = sorted(zip(X_train.columns, best_rf_model.feature_importances_),
                                key=lambda pair: -pair[1])
    for label_and_importance in sorted_feature_imp:
        print(label_and_importance)

    y_pred_test = best_rf_model.predict(X_test)

    # Attach predictions by position; a length mismatch raises here instead of
    # being NaN-padded.
    df_test = df.loc[mask_test, ['player', 'wkts', 'year1_wkts_pm']].reset_index(drop=True)
    df_test.columns = ['player', 'wkts', 'wkts_baseline']
    df_test['wkts_exp'] = y_pred_test

    # Metrics are computed on per-player totals, not on individual rows.
    df_by_player = df_test.groupby('player').sum()

    print(' ')
    print('Explained Variance (RF model): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Explained Variance (Baseline): ' + str(explained_variance_score(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print('Mean Squared Error (RF model): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_exp)))
    print('Mean Squared Error (Baseline): ' + str(mean_squared_error(df_by_player.wkts,df_by_player.wkts_baseline)))
    print('----')
    print(' ')

In [4]:
def fitting_lr_and_rf(file, test_yr, fit_lr, fit_rf, train_years=6):
    """Evaluate the LR and/or RF models on four nested feature sets.

    Reads ``file`` as CSV, holds out the rows whose ``year`` equals
    ``test_yr`` as the test set, trains on the ``train_years`` years
    immediately before it, and for each feature set (FULL, SMALL, SMALLER,
    SMALLEST) prints a banner and runs the requested models.

    Parameters
    ----------
    file : path (or anything ``pandas.read_csv`` accepts) of the dataset.
        Must contain ``year``, ``wkts`` and every feature column listed below.
    test_yr : int
        Year used as the test set.
    fit_lr : bool
        When true, call ``fit_lr_model`` for every feature set.
    fit_rf : bool
        When true, call ``fit_rf_model`` for every feature set.
    train_years : int, default 6
        Number of years before ``test_yr`` used for training
        (``test_yr - train_years`` through ``test_yr - 1``, inclusive).

    Raises
    ------
    ValueError
        If the training window or the test year selects no rows. Failing here
        avoids an obscure error after the models have already been fitted.

    Returns
    -------
    None
        All results are printed by the model functions.
    """
    df = pd.read_csv(file)

    mask_test = (df.year == test_yr)
    mask_train = (df.year >= test_yr - train_years) & (df.year <= test_yr - 1)

    if not mask_train.any():
        raise ValueError("no training rows for years {}-{}".format(test_yr - train_years, test_yr - 1))
    if not mask_test.any():
        raise ValueError("no test rows for year {}".format(test_yr))

    target = 'wkts'

    # Building blocks of the feature sets. Column order is preserved exactly,
    # because it determines the OLS summary layout and the column positions
    # seen by the random forest.
    mtchs_pld_cols = ['year{}_mtchs_pld'.format(k) for k in range(1, 6)]
    wkts_pm_cols = ['year{}_wkts_pm'.format(k) for k in range(1, 6)]
    opposition_cols = ['bowler_agnst_oppo', 'oppo_agnst_bowl_typ']
    home_ground_cols = ['bowl_home_adv', 'ground_bowl_typ']

    feature_sets = [
        ('FULL', mtchs_pld_cols + wkts_pm_cols + opposition_cols + home_ground_cols),
        ('SMALL', wkts_pm_cols + opposition_cols + home_ground_cols),
        ('SMALLER', wkts_pm_cols + home_ground_cols),
        ('SMALLEST', wkts_pm_cols),
    ]

    # The target does not depend on the feature set, so slice it once.
    y_train = df.loc[mask_train, target]
    y_test = df.loc[mask_test, target]

    for label, features in feature_sets:
        title = "**** RUNNING MODELS FOR {} FEATURE SET ****".format(label)
        rule = "*" * len(title)
        print(rule)
        print(title)
        print(rule)

        X_train = df.loc[mask_train, features]
        X_test = df.loc[mask_test, features]

        if fit_lr:
            fit_lr_model(df, X_train, y_train, X_test, y_test, mask_test)

        if fit_rf:
            fit_rf_model(df, X_train, y_train, X_test, y_test, mask_test)


In [5]:
# Relative path to the dataset CSV; reused by every evaluation cell below.
input_file = 'data/bowling_data_enhanced.csv'
# Test year 2011 (training window 2005-2010), with both the linear-regression
# and the random-forest fits enabled.
fitting_lr_and_rf(input_file, test_yr=2011, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.103
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                     12.63
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           1.71e-28
Time:                        08:50:19   Log-Likelihood:                -3491.1
No. Observations:                1557   AIC:                             7012.
Df Residuals:                    1542   BIC:                             7093.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.6min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.3449863591221019)
('year4_wkts_pm', 0.17194511058217363)
('year3_wkts_pm', 0.13783850464054984)
('year2_wkts_pm', 0.10211723260571721)
('ground_bowl_typ', 0.038404915486087654)
('year5_wkts_pm', 0.037840587479928259)
('year1_mtchs_pld', 0.037623923665338982)
('bowl_home_adv', 0.026068425551716692)
('oppo_agnst_bowl_typ', 0.022754612060818347)
('bowler_agnst_oppo', 0.020974925564761975)
('year5_mtchs_pld', 0.020754004140177675)
('year4_mtchs_pld', 0.015433864635704403)
('year2_mtchs_pld', 0.012056425299555092)
('year3_mtchs_pld', 0.01120110916536831)
 
Explained Variance (RF model): 0.695621279593
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (RF model): 33.5368607592
Mean Squared Error (Baseline): 98.4604956691
----
 
***************************************

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.5min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.3444853946521062)
('year3_wkts_pm', 0.18777579843916481)
('year4_wkts_pm', 0.14066801912776553)
('year2_wkts_pm', 0.11880049070459016)
('year5_wkts_pm', 0.083527226643849872)
('bowl_home_adv', 0.04071466478471631)
('ground_bowl_typ', 0.038715865208038838)
('oppo_agnst_bowl_typ', 0.024243361056361303)
('bowler_agnst_oppo', 0.02106917938340705)
 
Explained Variance (RF model): 0.697757919346
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (RF model): 33.1266847449
Mean Squared Error (Baseline): 98.4604956691
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                    

[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 211 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 461 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.9min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.28365359092930309)
('year4_wkts_pm', 0.20348324192848896)
('year3_wkts_pm', 0.165396383852976)
('year2_wkts_pm', 0.15420258413078111)
('year5_wkts_pm', 0.10689583593833406)
('bowl_home_adv', 0.049710166715700783)
('ground_bowl_typ', 0.036658196504415909)
 
Explained Variance (RF model): 0.704979689854
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (RF model): 32.2835583027
Mean Squared Error (Baseline): 98.4604956691
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.085

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.4min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.37987865918742431)
('year3_wkts_pm', 0.20461745574987422)
('year4_wkts_pm', 0.19038588009468388)
('year2_wkts_pm', 0.15767422829242964)
('year5_wkts_pm', 0.067443776675587985)
 
Explained Variance (RF model): 0.696249316817
Explained Variance (Baseline): 0.229112985258
----
Mean Squared Error (RF model): 33.3003638183
Mean Squared Error (Baseline): 98.4604956691
----
 


In [6]:
# Same evaluation with 2012 as the test year (training window 2006-2011).
fitting_lr_and_rf(input_file, test_yr=2012, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.071
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     8.230
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           2.91e-17
Time:                        09:36:28   Log-Likelihood:                -3425.1
No. Observations:                1524   AIC:                             6880.
Df Residuals:                    1509   BIC:                             6960.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  4.1min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.38781703140398771)
('year2_wkts_pm', 0.19023045507808475)
('year3_wkts_pm', 0.12499414363595472)
('year4_wkts_pm', 0.059929921967871669)
('ground_bowl_typ', 0.055231466346554681)
('oppo_agnst_bowl_typ', 0.037468032446783851)
('bowler_agnst_oppo', 0.033563832476684291)
('year1_mtchs_pld', 0.029310578793243867)
('bowl_home_adv', 0.023648233634919559)
('year3_mtchs_pld', 0.018536402978078959)
('year5_mtchs_pld', 0.012691846239699585)
('year2_mtchs_pld', 0.01020957844760528)
('year5_wkts_pm', 0.010149704652586317)
('year4_mtchs_pld', 0.0062187718979445517)
 
Explained Variance (RF model): 0.742760015333
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (RF model): 43.3948788507
Mean Squared Error (Baseline): 95.0030779207
----
 
*********************************

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.9min finished


Best Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.33370269707437977)
('year2_wkts_pm', 0.20421979491958545)
('year3_wkts_pm', 0.19412196020490124)
('year4_wkts_pm', 0.1021181496544126)
('year5_wkts_pm', 0.066365530585402815)
('bowler_agnst_oppo', 0.042244816872319929)
('ground_bowl_typ', 0.030365554182365289)
('oppo_agnst_bowl_typ', 0.017775095188454524)
('bowl_home_adv', 0.0090864013181783301)
 
Explained Variance (RF model): 0.752811118025
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (RF model): 41.721972146
Mean Squared Error (Baseline): 95.0030779207
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                 

[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.3min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.28277251463403219)
('year2_wkts_pm', 0.21756883408085845)
('year3_wkts_pm', 0.16383141273785667)
('year4_wkts_pm', 0.13901729261220544)
('year5_wkts_pm', 0.084483844920666296)
('ground_bowl_typ', 0.062568737309074904)
('bowl_home_adv', 0.049757363705306028)
 
Explained Variance (RF model): 0.746215849224
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (RF model): 42.8388839517
Mean Squared Error (Baseline): 95.0030779207
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.

[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 459 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.6min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.31273214435817259)
('year2_wkts_pm', 0.23847530464586222)
('year3_wkts_pm', 0.21105301838944851)
('year4_wkts_pm', 0.1579600683849311)
('year5_wkts_pm', 0.079779464221586036)
 
Explained Variance (RF model): 0.744408410595
Explained Variance (Baseline): 0.436756427391
----
Mean Squared Error (RF model): 43.1415317968
Mean Squared Error (Baseline): 95.0030779207
----
 


In [7]:
# Same evaluation with 2013 as the test year (training window 2007-2012).
fitting_lr_and_rf(input_file, test_yr=2013, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.052
Method:                 Least Squares   F-statistic:                     7.061
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           2.56e-14
Time:                        09:53:18   Log-Likelihood:                -3495.9
No. Observations:                1546   AIC:                             7022.
Df Residuals:                    1531   BIC:                             7102.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.9min finished


Best Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 500, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.24369172895209495)
('year2_wkts_pm', 0.20577890968326729)
('year3_wkts_pm', 0.11460804006320108)
('ground_bowl_typ', 0.082643857086243028)
('year1_mtchs_pld', 0.076845793197553428)
('year4_wkts_pm', 0.068880083283558255)
('bowl_home_adv', 0.039583202183241885)
('year5_wkts_pm', 0.034378679091987031)
('year2_mtchs_pld', 0.03404707636327961)
('year4_mtchs_pld', 0.031885367982224452)
('oppo_agnst_bowl_typ', 0.022913699974901346)
('year5_mtchs_pld', 0.018685149484587812)
('year3_mtchs_pld', 0.013779073713275948)
('bowler_agnst_oppo', 0.012279338940583707)
 
Explained Variance (RF model): 0.854192350838
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (RF model): 27.9420051925
Mean Squared Error (Baseline): 52.8757511112
----
 
*******************************

[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.7min finished


Best Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.32557709401957197)
('year2_wkts_pm', 0.22021215473987882)
('year3_wkts_pm', 0.1346778126525032)
('ground_bowl_typ', 0.11381089820345597)
('year4_wkts_pm', 0.082645381799386991)
('bowl_home_adv', 0.048867129573440624)
('year5_wkts_pm', 0.031742446198886845)
('oppo_agnst_bowl_typ', 0.023365813749225323)
('bowler_agnst_oppo', 0.019101269063650429)
 
Explained Variance (RF model): 0.861387086381
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (RF model): 26.5902570559
Mean Squared Error (Baseline): 52.8757511112
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                 

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.2min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.29015962862901462)
('year2_wkts_pm', 0.21033015307469899)
('year3_wkts_pm', 0.12541214129058542)
('year4_wkts_pm', 0.12336921428150785)
('ground_bowl_typ', 0.11370366620111975)
('bowl_home_adv', 0.08788725257398855)
('year5_wkts_pm', 0.049137943949084655)
 
Explained Variance (RF model): 0.860402800664
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (RF model): 26.7334449937
Mean Squared Error (Baseline): 52.8757511112
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.04

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.1min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.36213982371061793)
('year2_wkts_pm', 0.2941514011182656)
('year3_wkts_pm', 0.16687962742898471)
('year4_wkts_pm', 0.11728823432234455)
('year5_wkts_pm', 0.059540913419787102)
 
Explained Variance (RF model): 0.855084676228
Explained Variance (Baseline): 0.730165433366
----
Mean Squared Error (RF model): 27.7565069431
Mean Squared Error (Baseline): 52.8757511112
----
 


In [8]:
# Same evaluation with 2014 as the test year (training window 2008-2013).
fitting_lr_and_rf(input_file, test_yr=2014, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     8.767
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           1.09e-18
Time:                        10:09:56   Log-Likelihood:                -3728.4
No. Observations:                1645   AIC:                             7487.
Df Residuals:                    1630   BIC:                             7568.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  5.3min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.26352720569289761)
('year2_wkts_pm', 0.2134102235068191)
('ground_bowl_typ', 0.12173478974949482)
('year4_wkts_pm', 0.069274490861767027)
('year3_wkts_pm', 0.068318781135629822)
('bowl_home_adv', 0.060785880984413794)
('year1_mtchs_pld', 0.047699743641138499)
('year2_mtchs_pld', 0.036923349073002779)
('year5_wkts_pm', 0.022663538782689355)
('bowler_agnst_oppo', 0.021451609685564279)
('oppo_agnst_bowl_typ', 0.021000886428468628)
('year5_mtchs_pld', 0.019905805743207224)
('year4_mtchs_pld', 0.019120690863414353)
('year3_mtchs_pld', 0.014183003851492652)
 
Explained Variance (RF model): 0.785530661278
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (RF model): 21.6231703738
Mean Squared Error (Baseline): 69.4439037926
----
 
*************************************

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  5.0min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.29118398636358639)
('year2_wkts_pm', 0.22188700189229574)
('ground_bowl_typ', 0.13011545942877223)
('year3_wkts_pm', 0.10473465147265335)
('year4_wkts_pm', 0.076729837491458883)
('bowl_home_adv', 0.072803135861588764)
('year5_wkts_pm', 0.041050131827513311)
('oppo_agnst_bowl_typ', 0.033864486387023808)
('bowler_agnst_oppo', 0.027631309275107457)
 
Explained Variance (RF model): 0.787466295041
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (RF model): 21.3979102313
Mean Squared Error (Baseline): 69.4439037926
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                 

[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.1min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.29890928021749991)
('year2_wkts_pm', 0.24100835895563505)
('ground_bowl_typ', 0.12567682357325827)
('year3_wkts_pm', 0.10914456637447124)
('year4_wkts_pm', 0.09458072522080517)
('bowl_home_adv', 0.082716944628994454)
('year5_wkts_pm', 0.047963301029335513)
 
Explained Variance (RF model): 0.788980534627
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (RF model): 21.2475769615
Mean Squared Error (Baseline): 69.4439037926
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.0

[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 211 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 461 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.4min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.3788528175449194)
('year2_wkts_pm', 0.3077049916628331)
('year3_wkts_pm', 0.13223388727186702)
('year4_wkts_pm', 0.11902012078280912)
('year5_wkts_pm', 0.0621881827375712)
 
Explained Variance (RF model): 0.769014322312
Explained Variance (Baseline): 0.381455572077
----
Mean Squared Error (RF model): 23.2648720143
Mean Squared Error (Baseline): 69.4439037926
----
 


In [9]:
# Same evaluation with 2015 as the test year (training window 2009-2014).
fitting_lr_and_rf(input_file, test_yr=2015, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.071
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     8.280
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           2.18e-17
Time:                        10:30:16   Log-Likelihood:                -3449.5
No. Observations:                1521   AIC:                             6929.
Df Residuals:                    1506   BIC:                             7009.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 483 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 833 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.5min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.25006069805324604)
('year2_wkts_pm', 0.21900772765756221)
('ground_bowl_typ', 0.13972219815085574)
('year3_wkts_pm', 0.074573099056609776)
('year1_mtchs_pld', 0.065377140386198546)
('year4_wkts_pm', 0.056714013569744481)
('bowl_home_adv', 0.04375989670979074)
('year3_mtchs_pld', 0.026340063796564204)
('oppo_agnst_bowl_typ', 0.024509688938122186)
('year4_mtchs_pld', 0.022672408825991718)
('bowler_agnst_oppo', 0.022495208792913678)
('year2_mtchs_pld', 0.020868171275340502)
('year5_wkts_pm', 0.019468802367541733)
('year5_mtchs_pld', 0.014430882419518563)
 
Explained Variance (RF model): 0.80049353923
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (RF model): 32.7016893157
Mean Squared Error (Baseline): 123.171196479
----
 
**************************************

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.8min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.28958784970502444)
('year2_wkts_pm', 0.23306883315638763)
('ground_bowl_typ', 0.14418579772239479)
('year3_wkts_pm', 0.10176916510515874)
('bowl_home_adv', 0.058955210296794125)
('year4_wkts_pm', 0.056069565852523803)
('oppo_agnst_bowl_typ', 0.045104735217020915)
('year5_wkts_pm', 0.037404536989588816)
('bowler_agnst_oppo', 0.033854305955106613)
 
Explained Variance (RF model): 0.800408884128
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (RF model): 32.6939660128
Mean Squared Error (Baseline): 123.171196479
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                 

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.4min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.29341939680700707)
('year2_wkts_pm', 0.21713970477037259)
('ground_bowl_typ', 0.15986727880595306)
('year3_wkts_pm', 0.10662431155281903)
('year4_wkts_pm', 0.096678941430961812)
('bowl_home_adv', 0.079387285257901702)
('year5_wkts_pm', 0.046883081374984897)
 
Explained Variance (RF model): 0.801506756728
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (RF model): 32.4982374911
Mean Squared Error (Baseline): 123.171196479
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.

[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 455 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 805 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.7min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year1_wkts_pm', 0.35650965271545709)
('year2_wkts_pm', 0.34318879550437204)
('year3_wkts_pm', 0.13799036443522741)
('year4_wkts_pm', 0.10779654640703837)
('year5_wkts_pm', 0.054514640937905152)
 
Explained Variance (RF model): 0.803787480675
Explained Variance (Baseline): 0.281107905196
----
Mean Squared Error (RF model): 32.108824735
Mean Squared Error (Baseline): 123.171196479
----
 


In [10]:
# Run both the linear-regression and random-forest fits (fit_lr=True, fit_rf=True)
# on input_file with test_yr=2016. Per the output banners, results are reported
# for several feature sets (full, smaller, smallest).
# NOTE(review): test_yr presumably selects the year held out as the test set --
# confirm against the definition of fitting_lr_and_rf.
# Slow cell: the recorded output shows each 864-task grid search taking
# roughly 2.5-4 minutes.
fitting_lr_and_rf(input_file, test_yr=2016, fit_lr=True, fit_rf=True)

*********************************************
**** RUNNING MODELS FOR FULL FEATURE SET ****
*********************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     9.212
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           9.05e-20
Time:                        10:45:32   Log-Likelihood:                -3512.6
No. Observations:                1543   AIC:                             7055.
Df Residuals:                    1528   BIC:                             7135.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err   

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.7min finished


Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.18269068433351016)
('year1_wkts_pm', 0.10609633524201466)
('oppo_agnst_bowl_typ', 0.10505041795834122)
('year2_wkts_pm', 0.10436081128335983)
('bowl_home_adv', 0.084921086023531739)
('year3_wkts_pm', 0.069824590128187439)
('bowler_agnst_oppo', 0.067173858028487118)
('year1_mtchs_pld', 0.057912823315163231)
('year4_wkts_pm', 0.049335605134161756)
('year2_mtchs_pld', 0.049307502059870098)
('year3_mtchs_pld', 0.044432911563004378)
('year4_mtchs_pld', 0.02869910514176683)
('year5_wkts_pm', 0.026759580332887237)
('year5_mtchs_pld', 0.023434689455714204)
 
Explained Variance (RF model): 0.704230099892
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (RF model): 49.5273414207
Mean Squared Error (Baseline): 64.357582933
----
 
*******************************

[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.5min finished


Best Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('year2_wkts_pm', 0.29160333458567761)
('year1_wkts_pm', 0.23526634562337734)
('year3_wkts_pm', 0.16100280432608463)
('ground_bowl_typ', 0.15054128257697855)
('year4_wkts_pm', 0.064527541463384019)
('bowl_home_adv', 0.054412848484555763)
('bowler_agnst_oppo', 0.018698599978176533)
('year5_wkts_pm', 0.013606687428992363)
('oppo_agnst_bowl_typ', 0.010340555532773249)
 
Explained Variance (RF model): 0.693967431339
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (RF model): 51.464747631
Mean Squared Error (Baseline): 64.357582933
----
 
************************************************
**** RUNNING MODELS FOR SMALLER FEATURE SET ****
************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                  

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.0min finished


Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 10}
 
Sorted Feature Importance:
('ground_bowl_typ', 0.26394785528100712)
('year1_wkts_pm', 0.16546145436694651)
('year2_wkts_pm', 0.15792194913697599)
('bowl_home_adv', 0.14826919004421873)
('year3_wkts_pm', 0.11266482480389227)
('year4_wkts_pm', 0.096387909698794572)
('year5_wkts_pm', 0.055346816668164919)
 
Explained Variance (RF model): 0.72476578375
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (RF model): 46.0817846851
Mean Squared Error (Baseline): 64.357582933
----
 
*************************************************
**** RUNNING MODELS FOR SMALLEST FEATURE SET ****
*************************************************
**** LINEAR REGRESSION ****
                            OLS Regression Results                            
Dep. Variable:                   wkts   R-squared:                       0.

[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 211 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 461 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.4min finished


Best Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 10}
 
Sorted Feature Importance:
('year2_wkts_pm', 0.39099614760438672)
('year1_wkts_pm', 0.25034447666514398)
('year3_wkts_pm', 0.17362755543297251)
('year4_wkts_pm', 0.14040281609333594)
('year5_wkts_pm', 0.04462900420416048)
 
Explained Variance (RF model): 0.702429401483
Explained Variance (Baseline): 0.620222303278
----
Mean Squared Error (RF model): 50.0291470977
Mean Squared Error (Baseline): 64.357582933
----
 
