In [None]:
# -*- coding: utf-8 -*-
"""
Title: Image Classification using Random Forest
@author: Team I
"""

# setting up the data path
import os 
# os.chdir("C:/Users/prave/Downloads/Praveen/UConn/Predictive modeling/My Learnings/Python Project/")

# Importing all the necessary libraries
import pandas as pd
import numpy as np
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# # Importing Train and Test datasets
# train_data = pd.read_csv("datasets/fashion_mnist_train.csv")
# final_test_data = pd.read_csv("datasets/fashion_mnist_test.csv")


# # Splitting independent variables from the dependent variable in both training and testing
# X_train = train_data.iloc[:,1:]
# y_train = train_data.label.astype("str")

# X_final_test = final_test_data.iloc[:,1:]
# y_final_test = final_test_data.label.astype("str")



# Load the four pre-split arrays, carve a validation split off the training
# portion, and flatten each sample into a single feature row for the forest.
# NOTE(review): allow_pickle=True lets np.load run arbitrary pickled code --
# only load this file from a trusted source.
X_train, X_final_test, y_train, y_final_test = np.load("/content/drive/MyDrive/steel_surface_200.npy", allow_pickle=True)
x_train, x_test, y_train_v, y_test_v = train_test_split(X_train,y_train, test_size = 0.3, random_state = 43)
print(x_train.shape, x_test.shape, X_final_test.shape)

# Take the row count from each array and let numpy infer the feature width,
# instead of hard-coding 1080/756/324/360 x 120000: the cell then keeps working
# when the number of samples or the image resolution changes.
X_train = X_train.reshape(len(X_train), -1)
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
X_final_test = X_final_test.reshape(len(X_final_test), -1)
print(x_train.shape, x_test.shape, X_final_test.shape)

(756, 200, 200, 3) (324, 200, 200, 3) (360, 200, 200, 3)
(756, 120000) (324, 120000) (360, 120000)


In [None]:
# ================== Using Random Forest without hyper parameter tuning and clustering ===================
# Seed the forest so that the fitted trees, and therefore every metric printed
# below, are the same on each Restart & Run All.
rf = RandomForestClassifier(random_state=43)

rf.fit(x_train, y_train_v)

# Predictions on the training and validation splits
y_pred_train = rf.predict(x_train)
y_pred_val = rf.predict(x_test)

print("Training metrics:")
print(sklearn.metrics.classification_report(y_true=y_train_v, y_pred=y_pred_train))

# x_test / y_test_v are the validation split taken from the training data, so
# this report is labelled accordingly to keep it distinct from the test report.
print("Validation data metrics:")
print(sklearn.metrics.classification_report(y_true=y_test_v, y_pred=y_pred_val))


# Predictions on the final test set; y_pred_test holds only these
# predictions so that later metric cells score the right data.
y_pred_test = rf.predict(X_final_test)
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true=y_final_test, y_pred=y_pred_test))

Training metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       116
           1       1.00      1.00      1.00       134
           2       1.00      1.00      1.00       137
           3       1.00      1.00      1.00       125
           4       1.00      1.00      1.00       116
           5       1.00      1.00      1.00       128

    accuracy                           1.00       756
   macro avg       1.00      1.00      1.00       756
weighted avg       1.00      1.00      1.00       756

Test data metrics:
              precision    recall  f1-score   support

           0       0.75      0.67      0.70        57
           1       0.62      0.72      0.67        53
           2       0.92      0.72      0.80        46
           3       0.76      0.83      0.79        58
           4       0.70      0.79      0.74        57
           5       0.61      0.57      0.59        53

    accuracy                           0

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

# Summary scores for the current final-test predictions: plain accuracy first,
# then the three class-averaged scores, each computed with average='weighted'.
print("accuracy:", accuracy_score(y_final_test, y_pred_test))
for caption, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(caption, scorer(y_final_test, y_pred_test, average='weighted'))


accuracy: 0.75
Precision: 0.7660473743852008
Recall: 0.75
F1-Score: 0.7526071334437394


In [None]:

# =========================== Using Grid Search for hyper parameter tuning ===================================
# Candidate values explored for each random-forest hyper-parameter.
param_grid = {
    'n_estimators': [100, 200],
    'min_samples_leaf': [2, 3],
    'max_depth': [4, 8, 10, 16, 24, 48, 128, 256],
    'min_samples_split': [2, 3, 5],
}

# Search over a fresh, seeded forest: results are reproducible and the cell no
# longer depends on the state of the `rf` object created in an earlier cell.
clf = GridSearchCV(
    RandomForestClassifier(random_state=43),
    param_grid=param_grid,
    scoring='accuracy',
)
# fit() returns the search object itself, so `model` and `clf` are the same object.
model = clf.fit(x_train, y_train_v)


# Predictions on the training and validation splits
y_pred_train = model.predict(x_train)
y_pred_val = model.predict(x_test)

print("Training metrics:")
print(sklearn.metrics.classification_report(y_true=y_train_v, y_pred=y_pred_train))

# x_test / y_test_v are the validation split taken from the training data, so
# this report is labelled accordingly to keep it distinct from the test report.
print("Validation data metrics:")
print(sklearn.metrics.classification_report(y_true=y_test_v, y_pred=y_pred_val))


# Predictions on the final test set; y_pred_test holds only these
# predictions so that later metric cells score the right data.
y_pred_test = model.predict(X_final_test)
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true=y_final_test, y_pred=y_pred_test))

Training metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       116
           1       1.00      0.99      1.00       134
           2       1.00      1.00      1.00       137
           3       0.99      1.00      1.00       125
           4       1.00      1.00      1.00       116
           5       1.00      1.00      1.00       128

    accuracy                           1.00       756
   macro avg       1.00      1.00      1.00       756
weighted avg       1.00      1.00      1.00       756

Test data metrics:
              precision    recall  f1-score   support

           0       0.72      0.67      0.69        57
           1       0.66      0.74      0.70        53
           2       0.90      0.76      0.82        46
           3       0.78      0.81      0.80        58
           4       0.70      0.77      0.73        57
           5       0.66      0.62      0.64        53

    accuracy                           0

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

# What the grid search selected
print('Best score:', clf.best_score_)
print('Best paramas:', clf.best_params_)

# Scores for the current final-test predictions; the class-averaged ones all
# use average='weighted'.
print("accuracy:", accuracy_score(y_final_test, y_pred_test))
for caption, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1_Score:", f1_score),
):
    print(caption, scorer(y_final_test, y_pred_test, average='weighted'))


Best score: 0.7076856047403276
Best paramas: {'max_depth': 256, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 100}
accuracy: 0.7666666666666667
Precision: 0.7787372678430872
Recall: 0.7666666666666667
F1_Score: 0.7695824967616687


In [None]:
# Re-print the scores for the current final-test predictions.
truth, predicted = y_final_test, y_pred_test
weighted = {'average': 'weighted'}

print("accuracy:", accuracy_score(truth, predicted))
print("Precision:", precision_score(truth, predicted, **weighted))
print("Recall:", recall_score(truth, predicted, **weighted))
print("F1_Score:", f1_score(truth, predicted, **weighted))


accuracy: 0.7666666666666667
Precision: 0.7787372678430872
Recall: 0.7666666666666667
F1_Score: 0.7695824967616687


In [None]:
import os
import pandas as pd

# One row per hyper-parameter combination evaluated by the grid search.
scores_df = pd.DataFrame(clf.cv_results_)
print(scores_df)

# Persist the table. The target folder is created first because to_csv raises
# OSError when the parent directory does not exist.
scores_path = "/content/drive/MyDrive/RandomForest/Randomforest_scores.csv"
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
scores_df.to_csv(scores_path, mode='w')

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        5.540103      0.042851         0.045287        0.005607   
1       11.138705      0.425097         0.062490        0.001689   
2        5.475683      0.114281         0.045506        0.004582   
3       10.702446      0.108551         0.061367        0.008146   
4        5.484154      0.074174         0.047928        0.001027   
..            ...           ...              ...             ...   
91      19.957472      0.478260         0.074617        0.016963   
92      10.010913      0.229911         0.053754        0.001285   
93      19.812665      0.225897         0.064047        0.007085   
94      10.204691      0.525891         0.053261        0.001333   
95      20.083933      0.121217         0.069321        0.005409   

   param_max_depth param_min_samples_leaf param_min_samples_split  \
0                4                      2                       2   
1                4                      2    

In [None]:

# # ==================== Using Clustering and hyper parameter tuning ============================
# # K- means clustering
# kmeans = KMeans(n_clusters=10, init='k-means++')

# # fitting K means to X_train
# kmeans.fit(X_train)
# X_train["k_means_label"] = (kmeans.labels_)
# X_train["k_means_label"] = X_train["k_means_label"].astype('str')

# # Checking column type of K_means_label
# X_train["k_means_label"].dtypes
# X_train.k_means_label[0:10]
# y_train[0:10]

# # fitting K means to X_final_test
# kmeans.fit(X_final_test)
# X_final_test["k_means_label"] = (kmeans.labels_)
# X_final_test["k_means_label"] = X_final_test["k_means_label"].astype('str')
# y_final_test[0:10]

# # Splitting train data into training and validation datasets
# x_train, x_test, y_train_v, y_test_v = train_test_split(X_train,y_train, test_size = 0.3, random_state = 2)

# # Hyper parameter tuning with new feature
# clf = GridSearchCV(rf, param_grid={'n_estimators':[100,200],'min_samples_leaf':[2,3]})
# model = clf.fit(x_train,y_train_v)

# y_pred_train = model.predict(x_train)
#     # predictions for test
# y_pred_test = model.predict(x_test)
#     # training metrics
# print("Training metrics:")
# print(sklearn.metrics.classification_report(y_true= y_train_v, y_pred= y_pred_train))
    
#     # test data metrics
# print("Test data metrics:")
# print(sklearn.metrics.classification_report(y_true= y_test_v, y_pred= y_pred_test))


# # Predictions on testset
# y_pred_test = model.predict(X_final_test)
#     # test data metrics
# print("Test data metrics:")
# print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))


In [None]:
# # =================== Using 5 Fold Cross Validation to check the consistency of the final model ====================
# sk_fold = StratifiedKFold(n_splits=5, shuffle=True)

# for train_index, test_index in sk_fold.split(x_train, y_train_v):
#     train = [x_train.iloc[i,:] for i in train_index]
#     y_trn_k = [y_train_v.iloc[i] for i in train_index]
#     test = [x_train.iloc[i,:] for i in test_index]
#     y_tst_k = [y_train_v.iloc[i] for i in test_index]
#     # predictions for train
#     model.fit(train, y_trn_k)
#     y_pred_train = model.predict(train)
#     # predictions for test
#     y_pred_test = model.predict(test)
#     # training metrics
#     print("Training metrics:")
#     print(sklearn.metrics.classification_report(y_true= y_trn_k, y_pred= y_pred_train))
    
#     # test data metrics
#     print("Test data metrics:")
#     print(sklearn.metrics.classification_report(y_true= y_tst_k, y_pred= y_pred_test))
    

# # predictions on train
# y_pred_train = model.predict(X_train)
#     # predictions for test
# y_pred_test = model.predict(X_final_test)
#     # training metrics
# print("Training metrics:")
# print(sklearn.metrics.classification_report(y_true= y_train, y_pred= y_pred_train))
    
#     # test data metrics
# print("Test data metrics:")
# print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))