Purpose is to optimize the RandomForest model

In [1]:
# loading libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# load dataset
#   Training data
df_training = pd.read_csv('C:\\Users\\rivas\\OneDrive\\Documents\\JMR\\Education\\Springboard\\Projects\\Capstone1\\fashionmnisttrain.csv')

#   split data table into data Features (x) and class labels (y)
xdf_train = df_training.iloc[:, 1:]
ydf_train = df_training.iloc[:, :1]
print('Features: ',xdf_train.shape, ' Labels: ',ydf_train.shape)

Features:  (60000, 784)  Labels:  (60000, 1)


In [3]:
#   Test data
#   split data table into data Features (x) and class labels (y)
df_test = pd.read_csv('C:\\Users\\rivas\\OneDrive\\Documents\\JMR\\Education\\Springboard\\Projects\\Capstone1\\fashionmnisttest.csv')

#   split data table into data Features (x) and class labels (y)
xdf_test = df_test.iloc[:, 1:]
ydf_test = df_test.iloc[:, :1]
print('Features: ',xdf_test.shape, ' Labels: ',ydf_test.shape)

Features:  (10000, 784)  Labels:  (10000, 1)


In [4]:
#   Convert to numpy array
X_train = xdf_train.as_matrix()
y_train = ydf_train.as_matrix()
X_test = xdf_test.as_matrix()
y_test = ydf_test.as_matrix()

# Reshape array
y2_train = np.ravel( y_train[ : , 0] ) 
y2_test = np.ravel( y_test[ : , 0] ) 

#
print(X_train.shape, y2_train.shape)


(60000, 784) (60000,)


Reduce the dimensionality of a data set. To be able to process large datasets used
principal component analysis (PCA) to reduce the dimensionality of a data set.
This is an Unsupervised clustering algorithm.

In [5]:
# Standardizing
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X_train)

# PCA in scikit-learn  
from sklearn.decomposition import PCA
pca = PCA().fit(X_std)



We wil use 50 components  for the PCA to reduce the dimensionality

In [6]:
# PCA - Using 50 components for training data
sklearn50_pca = PCA(n_components=50)
X_50 = sklearn50_pca.fit_transform(X_std)
print(X_50.shape)

(60000, 50)


In [7]:
# Test data - Using 50 components
X_std = StandardScaler().fit_transform(X_test)
pca = PCA().fit(X_std)
sklearn50_pca = PCA(n_components=50)
Xtest_50 = sklearn50_pca.fit_transform(X_std)
print(Xtest_50.shape)



(10000, 50)


Run the initial Randon Forest model

In [8]:
# Random Forest Classifier
# 1 - Import the model
from sklearn.ensemble import RandomForestClassifier


In [9]:
# 2 - Make an instance of the Model
RF_clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [10]:
# 3 - Training the model: Using PCA 50 (X_50) for training data
#     fitting the model
RF_clf.fit(X_50, y2_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
# 4 - Predict the response
RF_clf_predict = RF_clf.predict(Xtest_50)

In [12]:
# 5 - Evaluate accuracy
#     loading libraries
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

print ('Accuracy:', accuracy_score(y2_test, RF_clf_predict))

Accuracy: 0.6948


We will start the process to optimize the RF model

In [14]:
# To use RandomizedSearchCV, we first need
# to create a parameter grid to sample from during fitting:

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
pprint(random_grid)

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [25]:
# First create the base model to tune
RF_clf = RandomForestClassifier()


In [27]:
# Random search of parameters, using 2 fold cross validation, 
# search across 10 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = RF_clf, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)

In [28]:
# Fit the random search model
rf_random.fit(X_50, y2_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 72.0min finished


RandomizedSearchCV(cv=2, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [29]:
rf_random.best_params_

{'max_depth': 50, 'max_features': 'auto', 'n_estimators': 1400}

To determine if random search yielded a better model, we compare the
base model with the best random search model

In [33]:
# Establish Base Model
base_model = RandomForestClassifier(n_jobs=2, random_state=0)
base_model.fit(X_50, y2_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [35]:
# Predict the response
base_model_predict = base_model.predict(Xtest_50)

In [36]:
print ('Accuracy:', accuracy_score(y2_test, base_model_predict))


Accuracy: 0.6948


In [38]:
# Establish new model with best parameters
# {'max_depth': 50, 'max_features': 'auto', 'n_estimators': 1400}
new_model = RandomForestClassifier(max_depth=50, max_features='auto', n_estimators=1400,n_jobs=-1, random_state=42 )
new_model.fit(X_50, y2_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1400, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [41]:
new_model_predict = new_model.predict(Xtest_50)

Compare Accurracy of both models[{000214A0-0000-0000-C000-000000000046}]
Prop3=19,2
[InternetShortcut]
IDList=
URL=http://localhost:8888/notebooks/SP%20Capstone%201/Fine%20tuning%20RF.ipynb#


In [44]:
print ('Base Model Accuracy:', accuracy_score(y2_test, base_model_predict))
print ('New Model Accuracy :', accuracy_score(y2_test, new_model_predict))

Base Model Accuracy: 0.6948
New Model Accuracy : 0.7734
