In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
tf.config.run_functions_eagerly(True)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
import joblib
from sklearn.ensemble import RandomForestClassifier

In [2]:
def data_load():
    train_X = pd.read_csv('train.csv')
    test_X = pd.read_csv('test.csv' )
    train_X.drop(train_X.columns[train_X.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
    test_X.drop(test_X.columns[test_X.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

    train_result = pd.read_csv('train_result.csv')
    train_y_class = train_result['Class']

    X_train = train_X.to_numpy()
    X_test = test_X.to_numpy()
    train_y = train_y_class.to_numpy()


    return X_train, X_test, train_y

In [3]:
X_train, X_test, train_y = data_load()
print(train_y.shape)
print(X_train.shape)
print(X_test.shape)

(50000,)
(50000, 1568)
(10000, 1568)


In [4]:
X_train, X_val, train_y, val_y = train_test_split(X_train, train_y,  test_size=0.20, random_state=101)
print(X_train.shape)
print(X_val.shape)
print(train_y.shape)
print(val_y.shape)

(40000, 1568)
(10000, 1568)
(40000,)
(10000,)


In [5]:
def model_predictions(X, y, predictions):
    '''
    X  = Train or Valdation set
    y  = target values of train or validation set
    '''
    #predictions = rf_model.predict(X)
    cf_matrix = confusion_matrix(y,predictions)
    #print(cf_matrix)

    sum_preds = np.sum(cf_matrix)
    sum_correct = np.sum(np.diag(cf_matrix))
    misclassification_error = 1.0 - (float(sum_correct) / float(sum_preds))
    
    print("sum_preds               :", int(sum_preds)) 
    print("sum_correct_predictions :", int(sum_correct)) 
    print("sum_wrong_predictions   :", int(sum_preds - sum_correct))
    print("misclassification_error :", misclassification_error)
    print("Accuarcy                :", accuracy_score(y,predictions))

    # class-wise accuracy
    class_report = classification_report(y,predictions)
    print(class_report)
    return None

In [7]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, train_y)

RandomForestClassifier()

In [8]:
predictions = rf_model.predict(X_train)
model_predictions(X_train, train_y, predictions)

sum_preds               : 40000
sum_correct_predictions : 40000
sum_wrong_predictions   : 0
misclassification_error : 0.0
Accuarcy                : 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       420
           1       1.00      1.00      1.00       871
           2       1.00      1.00      1.00      1254
           3       1.00      1.00      1.00      1713
           4       1.00      1.00      1.00      2067
           5       1.00      1.00      1.00      2393
           6       1.00      1.00      1.00      2751
           7       1.00      1.00      1.00      3195
           8       1.00      1.00      1.00      3684
           9       1.00      1.00      1.00      3820
          10       1.00      1.00      1.00      3669
          11       1.00      1.00      1.00      3116
          12       1.00      1.00      1.00      2748
          13       1.00      1.00      1.00      2359
          14       1.00      1.00    

In [9]:
val_predictions = rf_model.predict(X_val)
model_predictions(X_val, val_y, val_predictions)

sum_preds               : 10000
sum_correct_predictions : 7129
sum_wrong_predictions   : 2871
misclassification_error : 0.2871
Accuarcy                : 0.7129
              precision    recall  f1-score   support

           0       0.89      0.86      0.88       102
           1       0.85      0.92      0.88       202
           2       0.90      0.88      0.89       343
           3       0.84      0.81      0.82       439
           4       0.81      0.80      0.80       537
           5       0.74      0.73      0.73       597
           6       0.73      0.73      0.73       692
           7       0.76      0.72      0.74       817
           8       0.68      0.77      0.72       917
           9       0.65      0.74      0.69       987
          10       0.66      0.70      0.68       938
          11       0.61      0.64      0.63       762
          12       0.62      0.70      0.65       706
          13       0.68      0.66      0.67       578
          14       0.81      

# Randomized Search for Hyper Parameters

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
#n_estimators 
n_estimators = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [10, 20, 30, 40, 50, 60,70, 80,  90, 100, 120, 150, 200]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [  2, 5, 10, 20, 30]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10, 20, 30]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Criterion
criterion=['gini', 'entropy']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}


# Due to memory issues with my laptop, I have performed Randomized grid search for parameters at a HPC cluster where I have access. The best param was loaded as a pickle file.

In [11]:
'''
# Use the random grid to search for best hyperparameters
# First create the base model to tune

rf_model = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, train_y)
'''

'\n# Use the random grid to search for best hyperparameters\n# First create the base model to tune\n\nrf_model = RandomForestClassifier()\n\n# Random search of parameters, using 3 fold cross validation, \n# search across 100 different combinations, and use all available cores\nrf_random = RandomizedSearchCV(estimator = rf_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)\n# Fit the random search model\nrf_random.fit(X_train, train_y)\n'

In [12]:
rf_best_params_saved_pickle = joblib.load("rf_random_best_params.pkl")
rf_best_params_saved_pickle

{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'criterion': 'entropy',
 'bootstrap': False}

In [13]:
rf_best_param_model = RandomForestClassifier(**rf_best_params_saved_pickle)
rf_best_param_model.fit(X_train, train_y)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=70,
                       max_features='sqrt', min_samples_split=10,
                       n_estimators=800)

In [14]:
predictions = rf_best_param_model.predict(X_train)
model_predictions(X_train, train_y, predictions)

sum_preds               : 40000
sum_correct_predictions : 40000
sum_wrong_predictions   : 0
misclassification_error : 0.0
Accuarcy                : 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       420
           1       1.00      1.00      1.00       871
           2       1.00      1.00      1.00      1254
           3       1.00      1.00      1.00      1713
           4       1.00      1.00      1.00      2067
           5       1.00      1.00      1.00      2393
           6       1.00      1.00      1.00      2751
           7       1.00      1.00      1.00      3195
           8       1.00      1.00      1.00      3684
           9       1.00      1.00      1.00      3820
          10       1.00      1.00      1.00      3669
          11       1.00      1.00      1.00      3116
          12       1.00      1.00      1.00      2748
          13       1.00      1.00      1.00      2359
          14       1.00      1.00    

In [15]:
val_predictions = rf_best_param_model.predict(X_val)
model_predictions(X_val, val_y, val_predictions)

sum_preds               : 10000
sum_correct_predictions : 7783
sum_wrong_predictions   : 2217
misclassification_error : 0.2217
Accuarcy                : 0.7783
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       102
           1       0.85      0.92      0.89       202
           2       0.90      0.90      0.90       343
           3       0.87      0.85      0.86       439
           4       0.84      0.80      0.82       537
           5       0.80      0.78      0.79       597
           6       0.80      0.79      0.80       692
           7       0.80      0.76      0.78       817
           8       0.76      0.81      0.78       917
           9       0.73      0.82      0.77       987
          10       0.76      0.76      0.76       938
          11       0.71      0.75      0.73       762
          12       0.69      0.79      0.74       706
          13       0.77      0.76      0.77       578
          14       0.86      

# Retrain using whole train dataset and predictions on test set

In [16]:
X_train, X_test, train_y = data_load()
print(train_y.shape)
print(X_train.shape)
print(X_test.shape)

(50000,)
(50000, 1568)
(10000, 1568)


In [17]:
rf_best_param_model = RandomForestClassifier(**rf_best_params_saved_pickle)
rf_best_param_model.fit(X_train, train_y)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=70,
                       max_features='sqrt', min_samples_split=10,
                       n_estimators=800)

In [18]:
print('\n')
print("Test Predictions")
test_predictions = rf_best_param_model.predict(X_test)
print(test_predictions)
print(len(test_predictions))
df = pd.DataFrame(test_predictions)
df.index.name= 'Index'
df.columns = ['Class']
file_name = 'test_predictions_random_forest_grid_searchbest_params_final.csv'
df.to_csv(file_name, index=True)
df



Test Predictions
[14  7 10 ... 10  4  6]
10000


Unnamed: 0_level_0,Class
Index,Unnamed: 1_level_1
0,14
1,7
2,10
3,7
4,5
...,...
9995,7
9996,12
9997,10
9998,4
