In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import time
import math
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.model_selection import cross_val_score

from scipy.stats import poisson,skellam
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
TRAIN_FILE = './data/holes-2016.csv'
VAL_FILE = './data/holes-2017.csv'
TEST_FILE = './data/holes-2018.csv'

In [3]:
def parse_time_to_hour(x):
    return int(x[:2])

def is_morning(x):
    return x < 13

def get_wind(t, a, p):
    if is_morning(t):
        return a
    else:
        return p

In [4]:
def multicollinearity_check(X, inplace=True, thresh=5.0, debug=False):
    # adapted from: https://stackoverflow.com/a/48826255
    # https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
    data_type = X.dtypes
    int_cols = \
    X.select_dtypes(include=['uint8', 'int', 'int16', 'int32', 'int64', 'float', 'float16', 'float32', 'float64']).shape[1]
    total_cols = X.shape[1]
    try:
        if int_cols != total_cols:
            raise Exception('All the columns should be integer or float, for multicollinearity test.')
        else:
            variables = list(range(X.shape[1]))
            dropped = True
            if debug:
                print('''\n\nThe VIF calculator will now iterate through the features and calculate their respective values.
                It shall continue dropping the highest VIF features until all the features have VIF less than the threshold of 5.\n\n''')
            while dropped:
                dropped = False
                vif = [variance_inflation_factor(X.iloc[:, variables].values, ix) for ix in variables]
                if debug:
                    print('\n\nvif is: ', vif)
                maxloc = vif.index(max(vif))
                if max(vif) > thresh:
                    if debug:
                        print('dropping \'' + X.iloc[:, variables].columns[maxloc] + '\' at index: ' + str(maxloc))
                    # del variables[maxloc]
                    X.drop(X.columns[variables[maxloc]], 1, inplace=inplace)
                    variables = list(range(X.shape[1]))
                    dropped = True

            if debug:
                print('\n\nRemaining variables:\n')
                print(X.columns[variables])
            # return X.iloc[:,variables]
            return X
    except Exception as e:
        print('Error caught: ', e)

In [93]:
def load_and_process_simple(file):
    columns = [
        'Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 
        'Hole', 'Driving Accuracy', 'Driving Distance', 'Green in Regulation', 'Scrambling Success',
        'Putts Per Hole', 'Putts Per Hole GIR',
        'Total SG Putting', 'Avg SG Putting', 'Total SG OTT', 'Avg SG OTT', 'Total SG Approach', 
        'Avg SG Approach', 'Total SG Around the Green', 'Avg SG Around the Green', 'Par', 'Actual Yard', 
        'Fwy Firmness', 'Grn Firmness', 'Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Tee Grass', 
        'Fwy. Grass', 'Rough Grass', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 
        'Actual 325 Distance', 'Actual 350 Distance', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 
        'PM Wind Dir', 'Time Hole Finished', 'Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc', 
        'Score'
    ]    
    df = pd.read_csv(file, index_col=None, names=columns)
    
    # drop SG totals, use averages only
    df.drop(['Total SG Putting', 'Total SG OTT', 'Total SG Approach', 'Total SG Around the Green'], axis=1, inplace=True)
    
    # drop wind and time
    df.drop(['AM Wind Dir', 'AM Wind Spd', 'PM Wind Dir', 'PM Wind Spd'], axis=1, inplace=True)
    df.drop(['Time Hole Finished'], axis=1, inplace=True)
    
    # drop result columns (include later for potential simulations?)
    df.drop(['Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc'], axis=1, inplace=True)
    
    # drop non-features (only for identification)
    df.drop(['Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 'Hole'], axis=1, inplace=True)
    
    # drop categorical firmness and grass
    df.drop(['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'], axis=1, inplace=True)
    
    # rename columns to remove non-alpha chars
    renamed_cols = {}
    for x in df.columns:
        renamed_cols[x] = x.replace('/','').replace('&','').replace('#','').replace(' ','').replace('.','')
        
    df.rename(columns=renamed_cols, inplace=True)
        
    return df

In [49]:
def load_and_process(file):
    columns = [
        'Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 
        'Hole', 'Driving Accuracy', 'Driving Distance', 'Green in Regulation', 'Scrambling Success',
        'Putts Per Hole', 'Putts Per Hole GIR',
        'Total SG Putting', 'Avg SG Putting', 'Total SG OTT', 'Avg SG OTT', 'Total SG Approach', 
        'Avg SG Approach', 'Total SG Around the Green', 'Avg SG Around the Green', 'Par', 'Actual Yard', 
        'Fwy Firmness', 'Grn Firmness', 'Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Tee Grass', 
        'Fwy. Grass', 'Rough Grass', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 
        'Actual 325 Distance', 'Actual 350 Distance', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 
        'PM Wind Dir', 'Time Hole Finished', 'Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc', 
        'Score'
    ]   
    df = pd.read_csv(file, index_col=None, names=columns)
    
    # convert time to just the hour (24-hour)
    df['Time Hole Finished'] = df['Time Hole Finished'].apply(parse_time_to_hour)
    
    # determine wind speed and direction at time of hole played (am/pm)
    # drop old columns
    df['Wind Dir'] = np.vectorize(get_wind)(df['Time Hole Finished'], df['AM Wind Dir'], df['PM Wind Dir'])
    df['Wind Spd'] = np.vectorize(get_wind)(df['Time Hole Finished'], df['AM Wind Spd'], df['PM Wind Spd'])
    df.drop(['AM Wind Dir', 'AM Wind Spd', 'PM Wind Dir', 'PM Wind Spd'], axis=1, inplace=True)
    df.drop(['Time Hole Finished'], axis=1, inplace=True)
    
    # drop result columns (include later for potential simulations?)
    df.drop(['Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc'], axis=1, inplace=True)
    
    # drop non-features (only for identification)
    df.drop(['Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Hole'], axis=1, inplace=True)
    
    ## one hot encode firmness and grasses
    #df = pd.get_dummies(df, columns=['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'])
    df.drop(['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'], axis=1, inplace=True)
    
    ## one hot encode wind dir ({' G', '0', 'C', 'DW', 'IW', 'LR', 'RL', 'SW'})
    #df = pd.get_dummies(df, columns=['Wind Dir'])
    df.drop(['Wind Dir'], axis=1, inplace=True)
    
    # drop wind spd for now until method of parsing can be determined
    df.drop(['Wind Spd'], axis=1, inplace=True)
    
    # rename columns to remove non-alpha chars
    renamed_cols = {}
    for x in df.columns:
        renamed_cols[x] = x.replace('/','').replace('&','').replace('#','').replace(' ','').replace('.','')
        
    df.rename(columns=renamed_cols, inplace=True)
    
    # run values through a standard scalar?
        
    return df

In [50]:
def split_data(data):
    X = data.drop('Score', axis=1, inplace=False)
    y = data['Score']
    return X, y

def normalize_data(X):
    cols = X.columns
    return pd.DataFrame(MinMaxScaler().fit_transform(X), columns=cols)

In [94]:
df_train = load_and_process_simple(TRAIN_FILE)
df_train.head()

Unnamed: 0,AvgSGPutting,AvgSGOTT,AvgSGApproach,AvgSGAroundtheGreen,Par,ActualYard,Stimp,FwyHeight,GrnHeight,RoughHeight,Actual250Distance,Actual275Distance,Actual300Distance,Actual325Distance,Actual350Distance,Score
0,0.00701,0.00065,0.00359,0.00215,4,444,11.0,0.44,0.1,2.0,26.0,28.0,31.0,33.0,0.0,4.0
1,0.00701,0.00065,0.00359,0.00215,4,358,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,0.0,3.0
2,0.00701,0.00065,0.00359,0.00215,4,455,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0,4.0
3,0.00701,0.00065,0.00359,0.00215,3,160,11.0,0.44,0.1,2.0,0.0,0.0,0.0,0.0,0.0,3.0
4,0.00701,0.00065,0.00359,0.00215,5,531,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0,5.0


In [95]:
X_train, y_train = split_data(df_train)
old_columns = list(X_train.columns)

print('Total columns BEFORE multicollinearity check:', len(X_train.columns))
#multicollinearity_check(X_train, debug=False)
print('Total columns AFTER multicollinearity check:', len(X_train.columns))

new_columns = list(X_train.columns)

Total columns BEFORE multicollinearity check: 15
Total columns AFTER multicollinearity check: 15


In [96]:
X_train.head()

Unnamed: 0,AvgSGPutting,AvgSGOTT,AvgSGApproach,AvgSGAroundtheGreen,Par,ActualYard,Stimp,FwyHeight,GrnHeight,RoughHeight,Actual250Distance,Actual275Distance,Actual300Distance,Actual325Distance,Actual350Distance
0,0.00701,0.00065,0.00359,0.00215,4,444,11.0,0.44,0.1,2.0,26.0,28.0,31.0,33.0,0.0
1,0.00701,0.00065,0.00359,0.00215,4,358,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,0.0
2,0.00701,0.00065,0.00359,0.00215,4,455,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0
3,0.00701,0.00065,0.00359,0.00215,3,160,11.0,0.44,0.1,2.0,0.0,0.0,0.0,0.0,0.0
4,0.00701,0.00065,0.00359,0.00215,5,531,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0


In [97]:
def load_and_process_filter(file, columns_filter):
    df = load_and_process(file)
    X, y = split_data(df)
    
    same_cols = list(set(columns_filter).intersection(set(X.columns)))
    diff_cols = list(set(columns_filter).difference(set(X.columns)))
    X = X[same_cols]
    for col in diff_cols:
        X[col] = 0
    return X, y

In [98]:
#X_val, y_val = load_and_process_filter(VAL_FILE, new_columns)
df_val = load_and_process_simple(VAL_FILE)
X_val, y_val = split_data(df_val)
X_val.head()

Unnamed: 0,AvgSGPutting,AvgSGOTT,AvgSGApproach,AvgSGAroundtheGreen,Par,ActualYard,Stimp,FwyHeight,GrnHeight,RoughHeight,Actual250Distance,Actual275Distance,Actual300Distance,Actual325Distance,Actual350Distance
0,0.00203,-0.00812,0.01908,-0.00044,4,439,11.6,0.44,0.11,2.5,25.0,28.0,30.0,33.0,35.0
1,0.00203,-0.00812,0.01908,-0.00044,3,200,11.6,0.44,0.11,2.5,0.0,0.0,0.0,0.0,0.0
2,0.00203,-0.00812,0.01908,-0.00044,4,432,11.6,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0
3,0.00203,-0.00812,0.01908,-0.00044,4,407,11.6,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0
4,0.00203,-0.00812,0.01908,-0.00044,5,535,11.6,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0


In [99]:
#X_test, y_test = load_and_process_filter(TEST_FILE, new_columns)
df_test = load_and_process_simple(TEST_FILE)
X_test, y_test = split_data(df_test)
X_test.head()

Unnamed: 0,AvgSGPutting,AvgSGOTT,AvgSGApproach,AvgSGAroundtheGreen,Par,ActualYard,Stimp,FwyHeight,GrnHeight,RoughHeight,Actual250Distance,Actual275Distance,Actual300Distance,Actual325Distance,Actual350Distance
0,0.00338,0.01169,0.00049,-0.00363,4,430,11.75,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0
1,0.00338,0.01169,0.00049,-0.00363,3,203,11.75,0.44,0.11,2.5,0.0,0.0,0.0,0.0,0.0
2,0.00338,0.01169,0.00049,-0.00363,4,433,11.75,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0
3,0.00338,0.01169,0.00049,-0.00363,4,407,11.75,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0
4,0.00338,0.01169,0.00049,-0.00363,5,525,11.75,0.44,0.11,2.5,25.0,27.0,30.0,32.0,35.0


In [100]:
print('Training X Shape', X_train.shape)
print('Training y Shape', y_train.shape)
print('Validation X Shape', X_val.shape)
print('Validation y Shape', y_val.shape)
print('Test X Shape', X_test.shape)
print('Test y Shape', y_test.shape)

Training X Shape (12244, 15)
Training y Shape (12244,)
Validation X Shape (13137, 15)
Validation y Shape (13137,)
Test X Shape (13184, 15)
Test y Shape (13184,)


In [173]:
THRESHOLD = 2

def scoring(model, X, y):
    return score(y, model.predict(X))

def score(y, y_hat):
    return np.mean((y_hat > THRESHOLD) == (y > 2))

def run_cross_val(model, X, y, s=scoring):
    return cross_val_score(model, X, y, scoring=s)

In [175]:
run_cross_val(rf_reg, X_train, y_train, s=scoring)

array([0.96692798, 0.96593972, 0.97010537])

In [91]:
run_cross_val(rf_reg, X_val, y_val, s=scoring)

array([0.95524092, 0.95798127, 0.96323361])

In [2]:
col_list = list(X_train.columns)
poisson_formula = ' + '.join(col_list)

poisson_model = smf.glm(formula="Score ~ " + poisson_formula, data=df_train, family=sm.families.Poisson()).fit()
poisson_model.summary()

NameError: name 'X_train' is not defined

In [1]:
poisson_model.predict(X_train)

NameError: name 'poisson_model' is not defined

In [112]:
print(X_train.columns)

Index(['AvgSGPutting', 'AvgSGOTT', 'AvgSGApproach', 'AvgSGAroundtheGreen',
       'Par', 'ActualYard', 'Stimp', 'FwyHeight', 'GrnHeight', 'RoughHeight',
       'Actual250Distance', 'Actual275Distance', 'Actual300Distance',
       'Actual325Distance', 'Actual350Distance'],
      dtype='object')


In [116]:
# predict Phil 2018 at firestone round 1, hole 16 (tournament 450)

test_data = pd.DataFrame(data={
    'DrivingAccuracy': 0,
    'DrivingDistance': 0,
    'GreenInRegulation': 0,
    'ScramblingSuccess': 0,
    'PuttsPerHole': 0,
    'PuttsPerHoleGIR': 0,
    'AvgSGPutting': 0.01209, 
    'AvgSGOTT': 0.00163, 
    'AvgSGApproach': 0.00439, 
    'AvgSGAroundtheGreen': -0.00642,
    'Par': 5, 
    'ActualYard': 652, 
    'Stimp': 12.5, 
    'FwyHeight': 0.300000012, 
    'GrnHeight': 0.100000001, 
    'RoughHeight': 3,
    'Actual250Distance': 31, 
    'Actual275Distance': 33, 
    'Actual300Distance': 36,
    'Actual325Distance': 38, 
    'Actual350Distance': 41
}, index=[1])

poisson_model.predict(test_data)

1    4.783581
dtype: float64

In [124]:
def poisson_probabilities(model, df, max_score=9):
    hole_avg = model.predict(df).values[0]
    #hole_pred = [[poisson.pmf(i, h_avg) for i in range(0, max_score+1)] for h_avg in [hole_avg]]
    hole_pred = [poisson.pmf(i, hole_avg) for i in range(0, max_score+1)]
    return hole_pred

In [125]:
poisson_probabilities(poisson_model, test_data)

[0.008365986832551577,
 0.04001937549018214,
 0.09571796171090359,
 0.1526248736912895,
 0.18252336071184228,
 0.17462305533725922,
 0.13922058769352272,
 0.09513899361421158,
 0.056888135037321395,
 0.03023655563842188]

In [127]:
pickle.dump(poisson_model, open('./models/hole-prediction-poisson.pkl', 'wb'))

In [128]:
rf_reg.predict(test_data)

array([4.8])

In [176]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred_clf = rf_clf.predict(X_val)
rf_clf.score(X_val, y_val)

0.5827814569536424

In [171]:
pickle.dump(rf_clf, open('./models/hole-prediction-rf-clf.pkl', 'wb'))

In [177]:
print(y_pred_clf[:5])

[4. 3. 4. 4. 5.]


In [178]:
print(np.round(y_val[0:5],0))

0    5.0
1    4.0
2    3.0
3    3.0
4    4.0
Name: Score, dtype: float64


In [138]:
print('Mean score:', y_val.mean())
print('Mean predicted:', y_pred_clf.mean())

Mean score: 3.8657988886351524
Mean predicted: 3.7749866788460076


In [160]:
rf_clf.estimators_[0]

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1934392477, splitter='best')

In [157]:
feature_importances = pd.DataFrame(rf_clf.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.loc[feature_importances['importance'] >= 0.01]

Unnamed: 0,importance
ActualYard,0.406839
Par,0.103282
AvgSGApproach,0.058297
AvgSGPutting,0.057032
AvgSGAroundtheGreen,0.056831
AvgSGOTT,0.056703
Actual250Distance,0.052579
Actual275Distance,0.043762
Actual325Distance,0.033318
Actual350Distance,0.0309


In [141]:
cross_val_score(rf_clf, X_train, y_train, scoring=scoring)

array([0.9630355 , 0.95270767, 0.96027464])

In [142]:
cross_val_score(rf_clf, X_val, y_val, scoring=scoring)

array([0.95667047, 0.93899931, 0.95794286])

In [146]:
# random vs grid search tests:  https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html

from scipy.stats import randint as sp_randint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [147]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [149]:
clf = RandomForestClassifier(n_estimators=20, n_jobs=-1)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, cv=5)

start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 16.12 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.634 (std: 0.010)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 9, 'min_samples_split': 4}

Model with rank: 2
Mean validation score: 0.634 (std: 0.010)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 9, 'min_samples_split': 10}

Model with rank: 3
Mean validation score: 0.633 (std: 0.012)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 7, 'min_samples_split': 8}



In [151]:
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
start = time()
grid_search.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

GridSearchCV took 46.53 seconds for 72 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.637 (std: 0.010)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.636 (std: 0.010)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 2}

Model with rank: 3
Mean validation score: 0.635 (std: 0.009)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 10}



#### TODO
- run larger grid search
- add in ALL golfers and train
- test against full 18 hole round (RF Classifier vs Poisson)
- Remove course features and see about differences
- Add in additional golfer features
- Run cumulative stats against last 450 HOLES played (25 rounds or 6.25 events) - future work: see what amount history is best
- visualize decision tree graph

In [167]:
from sklearn.ensemble import RandomForestClassifier
best_rf_clf = RandomForestClassifier(max_features=10, oob_score=True, bootstrap=True, criterion='entropy', 
                                    max_depth=3, min_samples_split=3)
best_rf_clf.fit(X_train, y_train)
y_pred_best_clf = best_rf_clf.predict(X_val)
best_rf_clf.score(X_val, y_val)

0.6292151937276395

In [168]:
print('Mean score:', y_val.mean())
print('Mean predicted:', y_pred_best_clf.mean())

Mean score: 3.8657988886351524
Mean predicted: 3.82735784425668


In [170]:
feature_importances = pd.DataFrame(best_rf_clf.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.loc[feature_importances['importance'] >= 0.01]

Unnamed: 0,importance
Par,0.559594
ActualYard,0.415392


In [172]:
pickle.dump(best_rf_clf, open('./models/hole-prediction-best-rf-clf.pkl', 'wb'))