In [25]:
'''
Empirical analysis for RandomForestClassifier
    1. n_estimators
    2. random_state
    3. criterion
    4. max_depth
    5. max_features
    6. bootstrap
    7. training set size


'''

'\nEmpirical analysis for RandomForestClassifier\n    1. n_estimators\n    2. random_state\n    3. criterion\n    4. max_depth\n    5. max_features\n    6. bootstrap\n    7. training set size\n\n\n'

In [26]:
from __future__ import division
from mnist import MNIST
from sklearn import tree
import numpy as np
from sklearn.metrics import classification_report,log_loss,accuracy_score,roc_auc_score
from time import time
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
import re
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import label_binarize


In [27]:
from sklearn.ensemble import RandomForestClassifier


In [28]:
# Load the MNIST files found in the current directory and choose the
# name of the workbook that the analysis cells write their results to.
mndata = MNIST('./')

# Training split: images and labels are both turned into NumPy arrays.
images_train, labels_train = map(np.array, mndata.load_training())

# Test split: only the labels are converted; the images are kept exactly
# as the loader returned them.
images_test, labels_test = mndata.load_testing()
labels_test = np.array(labels_test)

# The timestamp (to the second) is part of the file name, so each run of
# this cell points the notebook at a new workbook.
excel_file_name = 'RandomForest{}.xlsx'.format(datetime.now().strftime('%Y%m%d%H%M%S'))


In [29]:
def classifaction_report_to_dataframe(report):
    """Convert the text of a scikit-learn classification report to a DataFrame.

    Every non-blank line of the report that ends in numeric fields becomes
    one row. The parser works on whitespace-separated tokens instead of
    fixed line positions and fixed column widths, so it does not depend on
    how many summary rows the report has or on how wide the columns are.

    Parameters
    ----------
    report : str
        Text report (header line, one line per class, summary lines).

    Returns
    -------
    pandas.DataFrame
        Columns ``class``, ``precision``, ``recall``, ``f1_score`` and
        ``support`` (all numeric values as floats). ``class`` holds the
        row label with surrounding padding removed, e.g. ``'3'`` or
        ``'macro avg'``. A row that carries only two numbers (such as an
        ``accuracy`` row) stores them in ``f1_score`` and ``support`` and
        leaves ``precision`` and ``recall`` as NaN. An empty report gives
        an empty frame that still has the five columns.
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    # Plain decimal number: digits with an optional fractional part.
    number = re.compile(r'\d+(?:\.\d+)?$')

    report_data = []
    for line in report.splitlines():
        tokens = line.split()

        # Peel up to four numeric fields off the right-hand side; whatever
        # is left on the left-hand side is the row label. Stopping at four
        # keeps a numeric class label (e.g. "0") from being mistaken for a
        # metric value.
        values = []
        while tokens and len(values) < 4 and number.match(tokens[-1]):
            values.insert(0, float(tokens.pop()))

        if not tokens:
            # Blank line, or a line without any label.
            continue

        if len(values) == 4:
            precision, recall, f1_score, support = values
        elif len(values) == 2:
            # Two-number row: only the last two report columns are filled.
            precision = recall = float('nan')
            f1_score, support = values
        else:
            # Header line or anything else that is not a metrics row.
            continue

        report_data.append({
            'class': ' '.join(tokens),
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'support': support,
        })

    # Passing the column list fixes the column order and keeps the schema
    # stable even when no row was parsed.
    return pd.DataFrame(report_data, columns=columns)


In [30]:
'''
    1.
    Aim: Analysis on the n_estimators
    RandomForestClassifier with n_estimators = 1,2,5,10,25,50,100,250,500,1000 with random_state = 10

    Outcome: Accuracy increases and the log loss decreases as n_estimators
    grows, up to a point; beyond it, further increases in n_estimators bring
    only very small gains in accuracy and log loss.

    Side effects: creates the workbook `excel_file_name` with the sheets
    'n_estimators' (one row per forest size) and 'n_estimators_cr'
    (classification report of the last forest trained).
'''

n_random_trees = [1,2,5,10,25,50,100,250,500,1000]
# n_random_trees  =[1,2]   # quick smoke-test setting

r_s = 10
result_columns = ['Number of Estimators', 'Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for n in n_random_trees:
    rf = RandomForestClassifier(n_estimators = n, random_state = r_s)

    start = time()
    rf.fit(images_train, labels_train)
    train_time = time() - start

    start = time()
    predictions = rf.predict(images_test)
    test_time = time() - start

    # Metrics calculation (scikit-learn metrics take y_true first).
    accuracy = accuracy_score(labels_test, predictions, normalize=False)
    normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
    class_probabilities = rf.predict_proba(images_test)
    # 'Negative Log loss' column: stores the value returned by log_loss().
    log_loss_val = log_loss(labels_test, class_probabilities)
    classfication_repo = classification_report(labels_test, predictions)

    # ROC AUC ranks samples by score, so it is computed from the predicted
    # class probabilities rather than from one-hot hard predictions.
    # Assumes all ten digits occur in the training labels, so that
    # predict_proba returns one column per entry of digit_classes.
    micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
    macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

    result_rows.append([n, accuracy, normalized_accuracy, train_time, test_time,
                        log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# The context manager writes and closes the workbook, also when a
# to_excel call raises.
with pd.ExcelWriter(excel_file_name, engine='xlsxwriter') as excel_writer:
    df.to_excel(excel_writer, sheet_name='n_estimators')
    # classfication_repo holds the report of the last forest of the loop.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='n_estimators_cr')

In [31]:
'''
    2.
    Aim: Analysis on the random_state
    RandomForestClassifier with n_estimators = 20,25,30,35 with random_state = 5,10,15,20,25,30

    Outcome: There is no impact of random_state on accuracy or neg_log_loss

    Side effects: appends the sheets 'random_state' (one row per combination)
    and 'random_state_cr' (classification report of the last forest trained)
    to the workbook `excel_file_name`, which must already exist.
'''

n_random_trees = [20,25,30,35]
# n_random_trees  =[1,2]   # quick smoke-test setting
random_states = [5,10,15,20,25,30]
# random_states = [5,10]

result_columns = ['Number of Estimators', 'Random State', 'Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for n in n_random_trees:
    for r_s in random_states:
        rf = RandomForestClassifier(n_estimators = n, random_state = r_s)

        start = time()
        rf.fit(images_train, labels_train)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in the training labels, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, r_s, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Re-open the workbook written by the earlier cells so the new sheets are
# added next to the existing ones.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='random_state')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='random_state_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.


KeyboardInterrupt: 

In [None]:
'''
    3.
    Aim: Analysis on criterion
    RandomForestClassifier with n_estimators = 20,25,30,35 with random_state = 10

    Outcome: Choosing gini or entropy makes no difference: both give the same results in terms of accuracy

    Side effects: appends the sheets 'criterion' (one row per combination)
    and 'criterion_cr' (classification report of the last forest trained)
    to the workbook `excel_file_name`, which must already exist.
'''
criterions = ['entropy','gini']
n_random_trees = [20,25,30,35]
# n_random_trees  =[1,2]   # quick smoke-test setting

r_s = 10
result_columns = ['Number of Estimators', 'criterion','Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for c in criterions:
    for n in n_random_trees:
        rf = RandomForestClassifier(n_estimators = n, random_state = r_s, criterion = c)

        start = time()
        rf.fit(images_train, labels_train)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in the training labels, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, c, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Re-open the workbook written by the earlier cells so the new sheets are
# added next to the existing ones.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='criterion')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='criterion_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.

In [None]:
'''
    4.
    Aim: Analysis on depth of the Random Tree
    RandomForestClassifier with n_estimators = 20,25,30,35 with depth = [1,2,5,10,15,20,21,22,25,30,40]

    Outcome: Accuracy increases with increasing depth and then stops improving much when the depth is increased further.
    The log loss likewise decreases quickly at first and only slowly once a certain depth threshold is passed.

    Side effects: appends the sheets 'max_depth' (one row per combination)
    and 'max_depth_cr' (classification report of the last forest trained)
    to the workbook `excel_file_name`, which must already exist.
'''
n_random_trees = [20,25,30,35]
# n_random_trees  =[1,2]   # quick smoke-test setting
depths = [1,2,5,10,15,20,21,22,25,30,40]
# depths = [5,10,15]


r_s = 10
result_columns = ['Number of Estimators', 'depth','Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for n in n_random_trees:
    for d in depths:
        rf = RandomForestClassifier(n_estimators = n, random_state = r_s, max_depth = d)

        start = time()
        rf.fit(images_train, labels_train)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in the training labels, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, d, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Re-open the workbook written by the earlier cells so the new sheets are
# added next to the existing ones.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='max_depth')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='max_depth_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.

In [None]:
'''
    5.
    Aim: Analysis on max_features of the Random Tree
    RandomForestClassifier with n_estimators = 20,25,30,35 and with 
        max_features = ['auto','sqrt','log2',None,1,5,10,20,30,40,50,100,0.1,0.2,0.25,0.5,0.75,0.8,0.9,0.95]

    Outcome: TODO - not recorded yet.

    Side effects: appends the sheets 'max_features' (one row per combination)
    and 'max_features_cr' (classification report of the last forest trained)
    to the workbook `excel_file_name`, which must already exist.
'''
n_random_trees = [20,25,30,35]
# n_random_trees  =[1,2]   # quick smoke-test setting
# NOTE(review): 'auto' is, as far as I know, an alias of 'sqrt' for
# classifiers that recent scikit-learn releases no longer accept - confirm
# against the installed version before re-running.
max_features = ['auto','sqrt','log2',None,1,5,10,20,30,40,50,100,0.1,0.2,0.25,0.5,0.75,0.8,0.9,0.95]
# max_features = ['auto']


r_s = 10
result_columns = ['Number of Estimators', 'max_features','Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for n in n_random_trees:
    for m in max_features:
        rf = RandomForestClassifier(n_estimators = n, random_state = r_s, max_features = m)

        start = time()
        rf.fit(images_train, labels_train)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in the training labels, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, m, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Re-open the workbook written by the earlier cells so the new sheets are
# added next to the existing ones.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='max_features')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='max_features_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.

In [None]:
'''
    6.
    Aim: Analysis on bootstrap of the Random Tree
    RandomForestClassifier with n_estimators = 20,25,30,35 and with 
        bootstrap_values = [True, False]

    Outcome: TODO - not recorded yet.

    Side effects: appends the sheets 'bootstrap' (one row per combination)
    and 'bootstrap_cr' (classification report of the last forest trained)
    to the workbook `excel_file_name`, which must already exist.
'''
n_random_trees = [20,25,30,35]
# n_random_trees  =[1,2]   # quick smoke-test setting
bootstrap_values = [True, False]


r_s = 10
result_columns = ['Number of Estimators', 'Bootstrap','Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

result_rows = []
for n in n_random_trees:
    for use_bootstrap in bootstrap_values:
        rf = RandomForestClassifier(n_estimators = n, random_state = r_s, bootstrap = use_bootstrap)

        start = time()
        rf.fit(images_train, labels_train)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in the training labels, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, use_bootstrap, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Re-open the workbook written by the earlier cells so the new sheets are
# added next to the existing ones.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='bootstrap')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='bootstrap_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.

In [None]:
'''
    7.
    Aim: Analysis on the accuracy of the model vs the training data present
    RandomForestClassifier with n_estimators = 1,2,5,10,25,50,100,250,500,1000 with random_state = 10 and the amount of training
    data present is [0.1,0.2,0.25,0.5,0.6,0.75,0.8,0.9,1] times the training data. 

    Outcome: TODO - not recorded yet.

    Side effects: appends the sheets 'training_data_p' (one row per
    combination) and 'training_data_p_cr' (classification report of the last
    forest trained) to the workbook `excel_file_name`, which must already exist.
'''

n_random_trees = [1,2,5,10,25,50,100,250,500,1000]
# n_random_trees  =[1,2]   # quick smoke-test setting

r_s = 10

no_of_training_set_examples = len(images_train)

perct_no_of_training_set_examples = [0.1,0.2,0.25,0.5,0.6,0.75,0.8,0.9,1]
# perct_no_of_training_set_examples = [0.1,0.2]   # quick smoke-test setting

result_columns = ['Number of Estimators', 'No of Samples','Accuracy','Normalized Accuracy',
                  'Time_taken_train','Time_taken_test',
                  'Negative Log loss','Macro_auc','Micro_auc']

# The test labels do not change between runs, so binarize them once.
digit_classes = [0,1,2,3,4,5,6,7,8,9]
labels_test_one_hot = label_binarize(labels_test, classes=digit_classes)

# Dedicated, seeded generator so the drawn subsets are the same on every run.
sampler = np.random.RandomState(r_s)

result_rows = []
for n in n_random_trees:
    for e in perct_no_of_training_set_examples:

        rf = RandomForestClassifier(n_estimators = n, random_state = r_s)

        no_of_samples = int(no_of_training_set_examples * e)

        # Draw without replacement: a fraction e of the training set must
        # contain that many distinct examples (and all of them for e = 1).
        sample_indices = sampler.choice(no_of_training_set_examples, no_of_samples, replace=False)

        sampled_train_images = images_train[sample_indices,:]
        sampled_train_labels = labels_train[sample_indices]

        start = time()
        rf.fit(sampled_train_images, sampled_train_labels)
        train_time = time() - start

        start = time()
        predictions = rf.predict(images_test)
        test_time = time() - start

        # Metrics calculation (scikit-learn metrics take y_true first).
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)
        class_probabilities = rf.predict_proba(images_test)
        # 'Negative Log loss' column: stores the value returned by log_loss().
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC ranks samples by score, so it is computed from the predicted
        # class probabilities rather than from one-hot hard predictions.
        # Assumes all ten digits occur in every drawn subset, so that
        # predict_proba returns one column per entry of digit_classes.
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        result_rows.append([n, no_of_samples, accuracy, normalized_accuracy, train_time, test_time,
                            log_loss_val, macro_auc, micro_auc])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(result_rows, columns=result_columns)

# Load the workbook in this cell instead of relying on a `book` variable
# left behind by a previously executed cell.
# NOTE(review): attaching a pre-loaded workbook through `excel_writer.book`
# is the legacy append idiom; newer pandas releases offer
# ExcelWriter(..., mode='a') instead - confirm against the installed version.
book = load_workbook(excel_file_name)
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    excel_writer.book = book
    df.to_excel(excel_writer, sheet_name='training_data_p')
    # classfication_repo holds the report of the last forest of the loops.
    df_classification_report = classifaction_report_to_dataframe(classfication_repo)
    df_classification_report.to_excel(excel_writer, sheet_name='training_data_p_cr')
    # No explicit save(): leaving the `with` block writes and closes the file.

In [None]:
# Grid search (and, later, randomized search) over RandomForestClassifier
# hyper-parameters.
# The sketch below is wrapped in a string literal, so none of it runs: it
# only records the intended parameter grid and the GridSearchCV call.
''' clf = RandomForestClassifier()
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              "n_estimators" : [10,20,25,30,35] }
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
start = time()
grid_search.fit(images_train, labels_train)
'''


In [None]:
# Grid and Random Search
# Understanding the MNIST data.

In [None]:
# from sklearn import decomposition



In [None]:
# pca = decomposition.PCA()
# pca.fit(images_train)