In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

import numpy as np

In [5]:
def runRandomForest(i, j):
    TRAIN_PATH = f'all_data/train_c{str(i)}_d{str(j)}.csv'
    TEST_PATH = f'all_data/test_c{str(i)}_d{str(j)}.csv'
    VAL_PATH = f'all_data/valid_c{str(i)}_d{str(j)}.csv'

    train_data = np.genfromtxt(TRAIN_PATH, delimiter = ',')
    val_data = np.genfromtxt(VAL_PATH, delimiter = ',')
    test_data = np.genfromtxt(TEST_PATH, delimiter = ',')

    X_train = train_data[:, :-1]
    y_train = train_data[:, -1]

    X_val = val_data[:, :-1]
    y_val = val_data[:, -1]

    X_test = test_data[:, :-1]
    y_test = test_data[:, -1]

    param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'n_estimators': [10, 20, 30],
    }

    clf = RandomForestClassifier()
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_val, y_val)

    print('Best Parameters found -', grid_search.best_params_)

    X_trainval = np.concatenate((X_train, X_val), axis = 0)
    y_trainval = np.concatenate((y_train, y_val), axis = 0)
    
    best_params = grid_search.best_params_
    clf = RandomForestClassifier(
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    n_estimators=best_params['n_estimators'],
    )

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('Test set accuracy =', accuracy)
    print('Test set f1 score =', f1)

In [None]:
runRandomForest(300,100)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 30}
Test set accuracy = 0.68
Test set f1 score = 0.6701030927835052


In [None]:
runRandomForest(300, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 30}
Test set accuracy = 0.8485
Test set f1 score = 0.8568729333963155


In [None]:
runRandomForest(300, 5000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.8683
Test set f1 score = 0.8769963575231157


In [None]:
runRandomForest(500, 100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 30}
Test set accuracy = 0.75
Test set f1 score = 0.7422680412371133


In [10]:
runRandomForest(500, 1000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 30}
Test set accuracy = 0.8575
Test set f1 score = 0.8558421851289832


In [5]:
runRandomForest(500, 5000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 30}
Test set accuracy = 0.9045
Test set f1 score = 0.9057906678504488


In [6]:
runRandomForest(1000, 100)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 30}
Test set accuracy = 0.91
Test set f1 score = 0.9134615384615385


In [7]:
runRandomForest(1000, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 30}
Test set accuracy = 0.97
Test set f1 score = 0.9703849950641659


In [6]:
runRandomForest(1000, 5000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 30}
Test set accuracy = 0.9798
Test set f1 score = 0.9798964968152867


In [7]:
runRandomForest(1500, 100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 30}
Test set accuracy = 0.985
Test set f1 score = 0.9849246231155778


In [8]:
runRandomForest(1500, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 30}
Test set accuracy = 0.997
Test set f1 score = 0.997


In [9]:
runRandomForest(1500, 5000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 30}
Test set accuracy = 0.9984
Test set f1 score = 0.9983996799359872


In [10]:
runRandomForest(1800, 100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.995
Test set f1 score = 0.9950248756218906


In [11]:
runRandomForest(1800, 1000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9995
Test set f1 score = 0.9995002498750626


In [12]:
runRandomForest(1800, 5000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 1.0
Test set f1 score = 1.0
