In [1]:
import numpy as np
import gzip
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_data(train_num_images,test_num_images,image_size,train_image_file,train_label_file,test_image_file,test_label_file):
    """Load image and label arrays from four gzip-compressed data files.

    Parameters
    ----------
    train_num_images, test_num_images : int
        Number of samples to read from the training / testing files.
    image_size : int
        Side length of each (square) image, in pixels.
    train_image_file, train_label_file, test_image_file, test_label_file : str
        Paths to the gzip-compressed image and label files.

    Returns
    -------
    tuple
        ``(train_images, train_labels, test_images, test_labels)``. Image
        arrays are float32 with shape ``(num_images, image_size, image_size, 1)``;
        label arrays are one-dimensional float32.

    Raises
    ------
    ValueError
        If a file holds fewer payload bytes than the requested sample count needs.
    """
    pixels_per_image = image_size * image_size

    # Image files: skip the 16-byte header, then one byte per pixel.
    train_images = _read_gzip_payload(train_image_file, 16, pixels_per_image * train_num_images)
    train_images = train_images.reshape(train_num_images, image_size, image_size, 1)

    # Label files: skip the 8-byte header, then one byte per label.
    train_labels = _read_gzip_payload(train_label_file, 8, train_num_images)

    test_images = _read_gzip_payload(test_image_file, 16, pixels_per_image * test_num_images)
    test_images = test_images.reshape(test_num_images, image_size, image_size, 1)

    test_labels = _read_gzip_payload(test_label_file, 8, test_num_images)

    return train_images,train_labels,test_images,test_labels


def _read_gzip_payload(path, header_size, num_bytes):
    """Return `num_bytes` bytes following the header of a gzip file as a float32 array."""
    # The context manager guarantees the handle is closed even if reading fails.
    with gzip.open(path, 'rb') as handle:
        handle.read(header_size)
        buf = handle.read(num_bytes)
    if len(buf) != num_bytes:
        raise ValueError(
            "{}: expected {} payload bytes, found {}".format(path, num_bytes, len(buf))
        )
    # frombuffer yields a read-only view; astype makes an independent writable copy.
    return np.frombuffer(buf, dtype=np.uint8).astype(np.float32)

In [3]:
def hyperparameter_tuning(x_train,y_train,x_test,y_test):
    """Pick the forest size that scores best on the held-out split.

    For each candidate ``n_estimators`` a RandomForestClassifier is fitted on
    ``(x_train, y_train)`` and scored on both the fitting data and
    ``(x_test, y_test)``; one report line is printed per candidate. The first
    candidate to reach the highest held-out score is returned (ties keep the
    earlier, smaller value because the comparison is strict).
    """
    candidate_sizes = (10, 50, 100, 200, 300, 400, 500)
    report = "Maximum Iterations:{}, Train Accuracy:{:2.4}, Validation Accuracy:{:2.4}"

    # Fallbacks used when no candidate scores above zero.
    best_size, best_accuracy = 1, 0
    for n_trees in candidate_sizes:
        forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1)
        forest.fit(x_train, y_train)

        fit_accuracy = forest.score(x_train, y_train)
        held_out_accuracy = forest.score(x_test, y_test)
        print(report.format(n_trees, 100 * fit_accuracy, 100 * held_out_accuracy))

        if held_out_accuracy > best_accuracy:
            best_size, best_accuracy = n_trees, held_out_accuracy

    print("Optimum:", best_size, 100 * best_accuracy)

    return best_size

In [4]:
def main():
    """Train a random forest on the image data and export test predictions.

    Steps: load the four gzip data files, flatten each image to a vector,
    hold out 25% of the training set to choose ``n_estimators``, refit on the
    full training set, print train/test accuracy, and write the one-hot
    encoded test predictions to ``rf.csv``.
    """
    train_num_images = 60000
    test_num_images = 10000
    image_size = 28
    num_classes = 10

    train_image_file = 'train-images-idx3-ubyte.gz'
    train_label_file = 'train-labels-idx1-ubyte.gz'
    test_image_file = 't10k-images-idx3-ubyte.gz'
    test_label_file = 't10k-labels-idx1-ubyte.gz'
    train_images, train_labels, test_images, test_labels = read_data(
        train_num_images, test_num_images, image_size,
        train_image_file, train_label_file, test_image_file, test_label_file,
    )

    # Flatten every image to one feature vector; the width is derived from
    # image_size so the two can never drift apart.
    num_features = image_size * image_size
    train_images = train_images.reshape(train_num_images, num_features)
    test_images = test_images.reshape(test_num_images, num_features)

    # Split the training data into training and validation subsets.
    x_train, x_val, y_train, y_val = train_test_split(
        train_images, train_labels, test_size=0.25, random_state=0
    )

    # Choose the number of trees using the validation subset.
    opt_estimator = hyperparameter_tuning(x_train, y_train, x_val, y_val)

    # Refit on the whole training dataset with the chosen forest size.
    rf = RandomForestClassifier(n_estimators=opt_estimator, n_jobs=-1)
    rf.fit(train_images, train_labels)
    train_score = rf.score(train_images, train_labels)
    print("Train Accuracy:{:2.4}".format(100*train_score))

    # Predict once and derive accuracy from those predictions; rf.score would
    # run the whole forest over the test set a second time.
    predictions = rf.predict(test_images)
    test_score = np.mean(predictions == test_labels)
    print("Test Accuracy:{:2.4}".format(100*test_score))

    # Dump the one-hot encoded test predictions into a csv file.
    results = to_categorical(predictions, num_classes=num_classes)
    np.savetxt("rf.csv", results, fmt='%4d', delimiter=",")

In [5]:
# Entry point: run the whole pipeline only when this code executes as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()

Maximum Iterations:10, Train Accuracy:99.9, Validation Accuracy:94.19
Maximum Iterations:50, Train Accuracy:100.0, Validation Accuracy:96.42
Maximum Iterations:100, Train Accuracy:100.0, Validation Accuracy:96.8
Maximum Iterations:200, Train Accuracy:100.0, Validation Accuracy:96.92
Maximum Iterations:300, Train Accuracy:100.0, Validation Accuracy:96.98
Maximum Iterations:400, Train Accuracy:100.0, Validation Accuracy:97.0
Maximum Iterations:500, Train Accuracy:100.0, Validation Accuracy:96.97
Optimum: 400 97.0
Train Accuracy:100.0
Test Accuracy:97.14
