In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

import numpy as np

In [4]:
def runGradientBoosting(i, j):
    TRAIN_PATH = f'all_data/train_c{str(i)}_d{str(j)}.csv'
    TEST_PATH = f'all_data/test_c{str(i)}_d{str(j)}.csv'
    VAL_PATH = f'all_data/valid_c{str(i)}_d{str(j)}.csv'

    train_data = np.genfromtxt(TRAIN_PATH, delimiter = ',')
    val_data = np.genfromtxt(VAL_PATH, delimiter = ',')
    test_data = np.genfromtxt(TEST_PATH, delimiter = ',')

    X_train = train_data[:, :-1]
    y_train = train_data[:, -1]

    X_val = val_data[:, :-1]
    y_val = val_data[:, -1]

    X_test = test_data[:, :-1]
    y_test = test_data[:, -1]

    param_grid = {
    'n_estimators': [20, 30],
    'max_depth': [4, 8],
    'min_samples_split': [2, 5]
    }

    clf = GradientBoostingClassifier()

    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_val, y_val)

    print('Best Parameters found -', grid_search.best_params_)

    X_trainval = np.concatenate((X_train, X_val), axis = 0)
    y_trainval = np.concatenate((y_train, y_val), axis = 0)

    best_params = grid_search.best_params_
    clf = GradientBoostingClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split']
    )

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print('Test set accuracy =', accuracy)
    print('Test set f1 score =', f1)


In [None]:
runGradientBoosting(300,100)

Best Parameters found - {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 200}
Test set accuracy = 0.755
Test set f1 score = 0.7512690355329948


In [None]:
runGradientBoosting(300, 1000)

Best Parameters found - {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 4, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 200}
Test set accuracy = 0.9765
Test set f1 score = 0.9769946157611357


In [5]:
runGradientBoosting(300, 5000)

Best Parameters found - {'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9736
Test set f1 score = 0.9742288168684107


In [6]:
runGradientBoosting(500,100)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.725
Test set f1 score = 0.729064039408867


In [7]:
runGradientBoosting(500, 1000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.919
Test set f1 score = 0.9199604743083004


In [8]:
runGradientBoosting(500, 5000)

Best Parameters found - {'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9759
Test set f1 score = 0.976286529568041


In [9]:
runGradientBoosting(1000, 100)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.855
Test set f1 score = 0.8557213930348259


In [10]:
runGradientBoosting(1000, 1000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 30}
Test set accuracy = 0.953
Test set f1 score = 0.9535113748763601


In [11]:
runGradientBoosting(1000, 5000)

Best Parameters found - {'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9858
Test set f1 score = 0.985836824256932


In [12]:
runGradientBoosting(1500, 100)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.945
Test set f1 score = 0.9458128078817734


In [13]:
runGradientBoosting(1500, 1000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.99
Test set f1 score = 0.98998998998999


In [14]:
runGradientBoosting(1500, 5000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9953
Test set f1 score = 0.9953061020673125


In [15]:
runGradientBoosting(1800, 100)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 20}
Test set accuracy = 0.875
Test set f1 score = 0.8847926267281105


In [16]:
runGradientBoosting(1800, 1000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 30}
Test set accuracy = 0.9955
Test set f1 score = 0.9955022488755622


In [17]:
runGradientBoosting(1800, 5000)

Best Parameters found - {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 30}
Test set accuracy = 0.9981
Test set f1 score = 0.9981009495252374
