In [33]:
# import deep learning libraries
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report
import glob
from tqdm import tqdm
from pathlib import Path
import pickle


In [9]:
SEED = 1706

In [13]:
numpy_file_path = 'data/processed/concatenated_features.npy'

if Path.exists(Path(numpy_file_path)):
    print('Loading numpy file...')
    data = np.load(numpy_file_path)
else:
    path = 'data/processed/pd_feature_array'
    all_files = glob.glob(path + "/*.parquet")

    df_li = []
    fail = []

    def check_shape(frame, filename):
        arr = np.array(frame)
        arr = arr.reshape(-1, 5596)
        target = arr[:, -1]
        set_target = set(target)
        for i in set_target:
            assert i in [0, 1], "Error in {}".format(filename)

    for filename in tqdm(all_files):
        frame = pd.read_parquet(filename)
        check_shape(frame, filename)
        df_li.append(frame)

    print("Concatenating dataframes")
    df = pd.concat(df_li)

    print("Deleting df_li")
    del(df_li)

    print(f'Filling {df[df.isnull().any(1)].size} NaNs')
    df.fillna(0, inplace=True)
    data = np.array(df).reshape(-1, 5596)
    np.save(numpy_file_path, )

Loading numpy file...


In [14]:
print("Delete df after getting array")
try:
    del(df)
except:
    pass

Delete df after getting array


In [15]:
# split into training and test set
x_train, x_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.2, random_state=SEED, shuffle=True)

# upsample minority class
x_minority = x_train[y_train == 0]
y_minority = y_train[y_train == 0]
x_majority = x_train[y_train == 1]
y_majority = y_train[y_train == 1]

x_majority_subsample, y_majority_subsample = resample(x_majority, y_majority, replace=False, n_samples=x_minority.shape[0], random_state=SEED)

x_train_balanced = np.concatenate((x_majority_subsample, x_minority))
y_train_balanced = np.concatenate((y_majority_subsample, y_minority))


In [17]:
model_search = RandomForestClassifier(random_state=SEED)
param_grid = {'n_estimators': [100, 200, 500, 1000],
              'max_depth': [6, 8, 10, 12, 14],
              'max_features': ['auto', 'sqrt', 'log2'],
              'criterion': ['gini', 'entropy']}

CV_model = GridSearchCV(estimator=model_search, param_grid=param_grid, cv=5, n_jobs=-1, verbose=100)
CV_model.fit(x_train_balanced, y_train_balanced)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 3/5; 2/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=200
[CV 3/5; 3/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=500
[CV 2/5; 10/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=200
[CV 5/5; 4/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=1000
[CV 3/5; 15/120] START criterion=gini, max_depth=8, max_features=auto, n_estimators=500
[CV 5/5; 35/120] START criterion=gini, max_depth=10, max_features=log2, n_estimators=500
[CV 3/5; 4/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=1000
[CV 1/5; 10/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=200
[CV 1/5; 11/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=500
[CV 2/5; 18/120] START criterion=gini, max_depth=8, max_features=sqrt, n_estimators=200
[CV 4/5; 1/120] START criterion=gini, max_depth=6, max_fea

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1706),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [6, 8, 10, 12, 14],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 500, 1000]},
             verbose=100)

In [21]:
pickle.dump(CV_model, open('models/RandomForest_GridSearch.pkl', 'wb'))

In [23]:
CV_model.best_params_

{'criterion': 'entropy',
 'max_depth': 14,
 'max_features': 'auto',
 'n_estimators': 500}

In [29]:
best_model = RandomForestClassifier(criterion="entropy", max_depth=14, max_features="auto", n_estimators=500, verbose=5, n_jobs=-1, random_state=SEED)
best_model.fit(x_train_balanced, y_train_balanced)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 256 concurrent workers.


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500bu

[Parallel(n_jobs=-1)]: Done  90 out of 500 | elapsed:    3.3s remaining:   15.1s


building tree 328 of 500
building tree 329 of 500



building tree 330 of 500

building tree 331 of 500building tree 332 of 500
building tree 333 of 500building tree 334 of 500

building tree 335 of 500
building tree 336 of 500

building tree 337 of 500
building tree 338 of 500

building tree 339 of 500building tree 340 of 500building tree 341 of 500


building tree 342 of 500building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500

building tree 348 of 500building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500

building tree 353 of 500
building tree 354 of 500
building tree 355 of 500building tree 356 of 500
building tree 357 of 500
building tree 358 of 500building tree 359 of 500
building tree 360 of 500building tree 361 of 500building tree 362 of 500



building tree 363 of 500

building tree 364 of 500building tree 365 of 500building tree 366 of 500


building tree 367 of

[Parallel(n_jobs=-1)]: Done 191 out of 500 | elapsed:    6.0s remaining:    9.7s



building tree 444 of 500building tree 445 of 500building tree 446 of 500

building tree 447 of 500

building tree 448 of 500
building tree 449 of 500building tree 450 of 500
building tree 451 of 500building tree 452 of 500
building tree 453 of 500building tree 454 of 500

building tree 455 of 500
building tree 456 of 500
building tree 457 of 500building tree 458 of 500

building tree 460 of 500
building tree 462 of 500
building tree 459 of 500building tree 463 of 500
building tree 464 of 500
building tree 465 of 500


building tree 466 of 500
building tree 467 of 500
building tree 461 of 500building tree 468 of 500building tree 469 of 500


building tree 470 of 500building tree 471 of 500building tree 472 of 500

building tree 473 of 500
building tree 474 of 500building tree 475 of 500
building tree 476 of 500



building tree 477 of 500
building tree 478 of 500building tree 479 of 500
building tree 480 of 500
building tree 481 of 500
building tree 482 of 500building tree 483 of 500bu

[Parallel(n_jobs=-1)]: Done 292 out of 500 | elapsed:    8.4s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done 393 out of 500 | elapsed:   10.4s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done 494 out of 500 | elapsed:   11.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   11.4s finished


RandomForestClassifier(criterion='entropy', max_depth=14, n_estimators=500,
                       n_jobs=-1, random_state=1706, verbose=5)

In [31]:
y_pred = best_model.predict(x_test)

[Parallel(n_jobs=256)]: Using backend ThreadingBackend with 256 concurrent workers.
[Parallel(n_jobs=256)]: Done  90 out of 500 | elapsed:    0.4s remaining:    1.7s
[Parallel(n_jobs=256)]: Done 191 out of 500 | elapsed:    0.5s remaining:    0.9s
[Parallel(n_jobs=256)]: Done 292 out of 500 | elapsed:    0.6s remaining:    0.4s
[Parallel(n_jobs=256)]: Done 393 out of 500 | elapsed:    0.6s remaining:    0.2s
[Parallel(n_jobs=256)]: Done 494 out of 500 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=256)]: Done 500 out of 500 | elapsed:    0.6s finished


In [43]:
print(f"Confusion Matrix \n {confusion_matrix(y_test, y_pred)}", end=('\n'*2))
print(f"ROC CURVE {roc_curve(y_test, y_pred)}", end=('\n'*2))
print(f"ROC AUC SCORE: {roc_auc_score(y_test, y_pred)}", end=('\n'*2))
print(classification_report(y_test,y_pred), )

Confusion Matrix 
 [[1471   69]
 [ 138 7977]]

ROC CURVE (array([0.        , 0.04480519, 1.        ]), array([0.        , 0.98299445, 1.        ]), array([2., 1., 0.]))

ROC AUC SCORE: 0.9690946299541494

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93      1540
         1.0       0.99      0.98      0.99      8115

    accuracy                           0.98      9655
   macro avg       0.95      0.97      0.96      9655
weighted avg       0.98      0.98      0.98      9655



In [46]:
y_test[y_test ==0].shape

(1540,)

In [48]:
x_test.shape

(9655, 5595)

In [50]:
x_train_balanced.shape

(12364, 5595)