## Import Libraries

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

## Load the Kaggle MNIST CSV dataset (it already ships as separate train and test files)

In [2]:
# Load the training split. The "label" column is the target; every other
# column is kept as a feature. The frame gets its own name (train_df) so it
# cannot be confused with, or overwritten by, the test frame loaded later.
train_df = pd.read_csv("../mnist_train.csv")
x_train = train_df.drop(columns=["label"])
y_train = train_df["label"]

In [3]:
# Load the held-out test split. The "label" column is the target; every other
# column is kept as a feature. The frame gets its own name (test_df) so
# re-running cells out of order can never swap it with the training frame.
test_df = pd.read_csv("../mnist_test.csv")
x_test = test_df.drop(columns=["label"])
y_test = test_df["label"]

## Define RandomForest model and parameter grid for hyperparameter tuning

In [4]:
# Base estimator handed to the hyperparameter search; the seed is fixed at 42.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values for each hyperparameter. np.arange excludes its stop value,
# so n_estimators spans 10..90, min_samples_split 2..9 and min_samples_leaf 1..9.
params_grid = dict(
    n_estimators=np.arange(10, 100, 10),
    max_depth=[None, 10, 20, 30, 40, 50],
    max_features=['sqrt', 'log2'],
    min_samples_split=np.arange(2, 10),
    min_samples_leaf=np.arange(1, 10),
)

## Define RandomizedSearchCV object and fit on training data

In [6]:
# Randomized search over params_grid: 100 sampled candidates, each scored with
# 5-fold cross-validation on the training data. The seed fixes the sampling.
rs_cv = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=params_grid,
    n_iter=100,
    cv=5,
    random_state=42,
)
rs_cv.fit(x_train, y_train)

# Report the winning hyperparameters and their mean cross-validated score.
print('Best hyperparameters:', rs_cv.best_params_)
print('Best mean test score: {:.2f}%'.format(rs_cv.best_score_ * 100))

Best hyperparameters: {'n_estimators': 70, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40}
Best mean test score: 96.56%


## Predict on test data using the best model

In [7]:
# Take the best estimator found by the search and label the test set with it.
best_model = rs_cv.best_estimator_
y_pred = best_model.predict(X=x_test)

## Calculate accuracy score

In [8]:
# Compare the test-set predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 96.72%


## Perform cross-validation

In [None]:
# Score the tuned model with 5-fold cross-validation on the training data.
# cross_val_score is imported with the other dependencies at the top of the
# notebook, so this cell has no import of its own.
cv_scores = cross_val_score(best_model, x_train, y_train, cv=5)

In [18]:
def Average(avg):
    """Return the arithmetic mean of the values in ``avg``.

    ``avg`` must support both iteration and ``len()``. An empty input
    raises ``ZeroDivisionError`` because the total is divided by a length
    of zero.
    """
    total = sum(avg)
    count = len(avg)
    return total / count

# Collapse the per-fold cross-validation scores into a single mean and show it.
average = Average(avg=cv_scores)
print(average)

0.96555


## Export pickle file

In [19]:
# Destination file for the serialized model, relative to the working directory.
# pickle is imported with the other dependencies at the top of the notebook.
filename = 'randomforest.pkl'

# Dump the model to a pickle file; the with-block closes the handle even if
# serialization fails part-way.
# NOTE: unpickling executes arbitrary code, so only load this file from a
# trusted source.
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)