In [None]:
# setting up the data path
import os 
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.cluster import KMeans



# Load the Fashion-MNIST training set and the final hold-out test set from CSV.
train_data = pd.read_csv("fashion-mnist_train.csv")
final_test_data = pd.read_csv("fashion-mnist_test.csv")


# Separate target from features. The target is the `label` column, cast to
# strings so the classes are treated as categories; the features are every
# column after the first (presumably `label` is that first column -- confirm
# against the CSV header).
y_train = train_data["label"].astype(str)
X_train = train_data.iloc[:, 1:]

y_final_test = final_test_data["label"].astype(str)
X_final_test = final_test_data.iloc[:, 1:]


# Carve a validation set (30% of the rows) out of the training data; the seed
# is fixed so the same split is produced on every run.
x_train, x_test, y_train_v, y_test_v = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=2,
)

# ========== Baseline: Random Forest without hyper-parameter tuning or clustering ==========
# `fit` returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier().fit(x_train, y_train_v)

# Predict on the rows the forest was trained on and on the validation split.
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

# Report on the training rows.
print(
    "Training metrics:",
    sklearn.metrics.classification_report(y_train_v, y_pred_train),
    sep="\n",
)

# Report on the validation split (the printed heading calls it "Test data").
print(
    "Test data metrics:",
    sklearn.metrics.classification_report(y_test_v, y_pred_test),
    sep="\n",
)


# Predict on the final hold-out test set and report on it as well.
y_pred_test = rf.predict(X_final_test)
print(
    "Test data metrics:",
    sklearn.metrics.classification_report(y_final_test, y_pred_test),
    sep="\n",
)

# Results (as recorded by the author):
#    86% accuracy on both the validation and the test datasets


# =========================== Using Grid Search for hyper parameter tuning ===================================
# Search a 2x2 grid (number of trees x minimum samples per leaf) around the
# baseline forest. GridSearchCV works on clones of `rf`, so the already-fitted
# baseline estimator is only used as a template here.
clf = GridSearchCV(rf, param_grid={'n_estimators': [100, 200], 'min_samples_leaf': [2, 3]})
# `fit` returns the search object itself, so `model` and `clf` are the same object.
model = clf.fit(x_train, y_train_v)


# Predictions on the training rows and on the validation split
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# Training metrics
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true=y_train_v, y_pred=y_pred_train))

# Validation-split metrics (the printed heading calls it "Test data")
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true=y_test_v, y_pred=y_pred_test))


# Predictions on the final hold-out test set
y_pred_test = model.predict(X_final_test)
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true=y_final_test, y_pred=y_pred_test))
# Fix: scikit-learn has no `classification_matrix`; the per-class error table
# is produced by `confusion_matrix`. The old name raised AttributeError only
# after the whole grid search had already run.
print(sklearn.metrics.confusion_matrix(y_true=y_final_test, y_pred=y_pred_test))


Training metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4163
           1       1.00      1.00      1.00      4211
           2       1.00      1.00      1.00      4211
           3       1.00      1.00      1.00      4182
           4       1.00      1.00      1.00      4243
           5       1.00      1.00      1.00      4203
           6       1.00      1.00      1.00      4214
           7       1.00      1.00      1.00      4135
           8       1.00      1.00      1.00      4222
           9       1.00      1.00      1.00      4216

    accuracy                           1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Test data metrics:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1837
           1       1.00      0.96      0.98      1789
           2       0.80      0.82      0.