In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

import numpy as np

In [15]:
def runDecisionTree(i, j):
    TRAIN_PATH = f'all_data/train_c{str(i)}_d{str(j)}.csv'
    TEST_PATH = f'all_data/test_c{str(i)}_d{str(j)}.csv'
    VAL_PATH = f'all_data/valid_c{str(i)}_d{str(j)}.csv'

    train_data = np.genfromtxt(TRAIN_PATH, delimiter = ',')
    val_data = np.genfromtxt(VAL_PATH, delimiter = ',')
    test_data = np.genfromtxt(TEST_PATH, delimiter = ',')

    X_train = train_data[:, :-1]
    y_train = train_data[:, -1]

    X_val = val_data[:, :-1]
    y_val = val_data[:, -1]

    X_test = test_data[:, :-1]
    y_test = test_data[:, -1]

    param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': [None, 'sqrt', 'log2']
    }

    dt = DecisionTreeClassifier()

    grid_search = GridSearchCV(dt, param_grid, cv = 5)
    grid_search.fit(X_val, y_val)

    print('Best Parameters found -', grid_search.best_params_)

    best_dt = DecisionTreeClassifier(**grid_search.best_params_)
    X_trainval = np.concatenate((X_train, X_val), axis = 0)
    y_trainval = np.concatenate((y_train, y_val), axis = 0)
    
    best_dt.fit(X_trainval, y_trainval)
    
    y_pred = best_dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('Test set accuracy =', accuracy)
    print('Test set f1 score =', f1)

In [16]:
runDecisionTree(300,100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4}
Test set accuracy = 0.575
Test set f1 score = 0.5971563981042655


In [17]:
runDecisionTree(300, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 8}
Test set accuracy = 0.682
Test set f1 score = 0.7022471910112359


In [18]:
runDecisionTree(300, 5000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 4}
Test set accuracy = 0.7796
Test set f1 score = 0.7998547039593171


In [19]:
runDecisionTree(500, 100)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4}
Test set accuracy = 0.57
Test set f1 score = 0.5222222222222221


In [20]:
runDecisionTree(500, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 4}
Test set accuracy = 0.708
Test set f1 score = 0.7203065134099617


In [21]:
runDecisionTree(500, 5000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Test set accuracy = 0.7812
Test set f1 score = 0.8019909502262444


In [22]:
runDecisionTree(1000, 100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Test set accuracy = 0.715
Test set f1 score = 0.7106598984771574


In [23]:
runDecisionTree(1000, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Test set accuracy = 0.7955
Test set f1 score = 0.8023199613339776


In [24]:
runDecisionTree(1000, 5000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Test set accuracy = 0.861
Test set f1 score = 0.8660370084811102


In [25]:
runDecisionTree(1500, 100)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}
Test set accuracy = 0.825
Test set f1 score = 0.8356807511737089


In [26]:
runDecisionTree(1500, 1000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test set accuracy = 0.9115
Test set f1 score = 0.9140359397765906


In [27]:
runDecisionTree(1500, 5000)

Best Parameters found - {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Test set accuracy = 0.9528
Test set f1 score = 0.9533596837944663


In [29]:
runDecisionTree(1800, 100)

Best Parameters found - {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 4}
Test set accuracy = 0.94
Test set f1 score = 0.9417475728155339


In [30]:
runDecisionTree(1800, 1000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4}
Test set accuracy = 0.975
Test set f1 score = 0.9753208292201383


In [31]:
runDecisionTree(1800, 5000)

Best Parameters found - {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Test set accuracy = 0.9865
Test set f1 score = 0.9865765138709356
