In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the UFC dataset from CSV; the path is relative to the notebook's
# working directory.
ufc_csv_path = '../../Data/UFCdata.csv'
ufc = pd.read_csv(ufc_csv_path)

In [3]:
# Keep only rows without missing values, then discard the columns that are
# not carried forward as predictors.
unused_columns = ['R_fighter','B_fighter','Referee','date','location']
ufc = ufc.dropna().drop(columns=unused_columns)

In [4]:
# One-hot encode every column except the label; 'Winner' is re-attached
# afterwards unchanged so it stays a single column.
is_predictor = ufc.columns != 'Winner'
ufc_dummy = pd.get_dummies(ufc.loc[:, is_predictor])
ufc_dummy['Winner'] = ufc['Winner']
ufc = ufc_dummy

In [5]:
# Hold out 20% of the rows as a test partition. The split is stratified on
# the label column and uses a fixed seed so it is repeatable.
ufc_train, ufc_test = train_test_split(
    ufc,
    test_size=0.2,
    random_state=321,
    stratify=ufc['Winner'],
)

In [6]:
# Separate predictors from the 'Winner' label for each partition.
features_train, target_train = ufc_train.drop(columns='Winner'), ufc_train['Winner']
features_test, target_test = ufc_test.drop(columns='Winner'), ufc_test['Winner']

In [7]:
# Coarse hyper-parameter grid: 10 forest sizes (1, 101, ..., 901) crossed
# with 10 depth limits (1, 11, ..., 91).
params = {
    'n_estimators': np.arange(1, 1000, 100),
    'max_depth': np.arange(1, 100, 10),
}
# Keep the individual axes available under their original names.
n_estimators = params['n_estimators']
max_depth = params['max_depth']

In [8]:
# Exhaustive cross-validated search over `params` using all available cores.
# Train-fold scores are recorded as well so they can be compared with the
# validation scores afterwards.
rf = RandomForestClassifier(random_state=321)
rfCV = GridSearchCV(
    estimator=rf,
    param_grid=params,
    n_jobs=-1,
    return_train_score=True,
)
rfCV.fit(features_train, target_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=321), n_jobs=-1,
             param_grid={'max_depth': array([ 1, 11, 21, 31, 41, 51, 61, 71, 81, 91]),
                         'n_estimators': array([  1, 101, 201, 301, 401, 501, 601, 701, 801, 901])},
             return_train_score=True)

In [9]:
# Summarise the candidate selected by the grid search.
# best_index_ is the cv_results_ row that GridSearchCV itself picked, so the
# scores reported here always belong to the same candidate as best_params_
# and best_estimator_. A raw argmax over mean_test_score would instead land
# on a NaN entry if any candidate's fit had failed.
optimal_index = rfCV.best_index_
train_score = rfCV.cv_results_['mean_train_score'][optimal_index]
validation_score = rfCV.cv_results_['mean_test_score'][optimal_index]
# Accuracy of the refit best model on the held-out test partition.
test_score = rfCV.best_estimator_.score(features_test,target_test)
print("Optimal n_estimators:",rfCV.best_params_['n_estimators'])
print("Optimal max_depth:",rfCV.best_params_['max_depth'])
print("Optimal Train Accuracy:",round(train_score,3))
print("Optimal Validation Accuracy:",round(validation_score,3))
print("Optimal Test Accuracy:",round(test_score,3))

Optimal n_estimators: 301
Optimal max_depth: 21
Optimal Train Accuracy: 1.0
Optimal Validation Accuracy: 0.634
Optimal Test Accuracy: 0.63


In [10]:
# Refined hyper-parameter grid: forest sizes 150..420 in steps of 30 and
# depth limits 15..30 in steps of 5.
params = {
    'n_estimators': np.arange(150, 450, 30),
    'max_depth': np.arange(15, 35, 5),
}
# Keep the individual axes available under their original names.
n_estimators = params['n_estimators']
max_depth = params['max_depth']

In [11]:
# Second cross-validated search, this time over the refined `params` grid.
# NOTE(review): the forest seed here is 123, whereas the train/test split and
# the first search use 321 -- confirm the change of seed is intentional.
rf = RandomForestClassifier(random_state=123)
rfCV = GridSearchCV(
    estimator=rf,
    param_grid=params,
    n_jobs=-1,
    return_train_score=True,
)
rfCV.fit(features_train, target_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=123), n_jobs=-1,
             param_grid={'max_depth': array([15, 20, 25, 30]),
                         'n_estimators': array([150, 180, 210, 240, 270, 300, 330, 360, 390, 420])},
             return_train_score=True)

In [12]:
# Summarise the candidate selected by the refined grid search.
# best_index_ is the cv_results_ row that GridSearchCV itself picked, so the
# scores reported here always belong to the same candidate as best_params_
# and best_estimator_. A raw argmax over mean_test_score would instead land
# on a NaN entry if any candidate's fit had failed.
optimal_index = rfCV.best_index_
train_score = rfCV.cv_results_['mean_train_score'][optimal_index]
validation_score = rfCV.cv_results_['mean_test_score'][optimal_index]
# Accuracy of the refit best model on the held-out test partition.
test_score = rfCV.best_estimator_.score(features_test,target_test)
print("Optimal n_estimators:",rfCV.best_params_['n_estimators'])
print("Optimal max_depth:",rfCV.best_params_['max_depth'])
print("Optimal Train Accuracy:",round(train_score,3))
print("Optimal Validation Accuracy:",round(validation_score,3))
print("Optimal Test Accuracy:",round(test_score,3))

Optimal n_estimators: 270
Optimal max_depth: 15
Optimal Train Accuracy: 1.0
Optimal Validation Accuracy: 0.634
Optimal Test Accuracy: 0.631


In [13]:
# Majority-class baseline: always predict the most frequent label seen in the
# training partition and measure how often that constant guess is right on
# the test partition. Taking the label from target_train avoids hard-coding
# a class name and keeps the test labels out of the choice of baseline.
majority_class = target_train.value_counts().idxmax()
baseline_score = (target_test == majority_class).mean()
print("Baseline Accuracy:",round(baseline_score,3))

Baseline Accuracy: 0.616
