In [19]:
# import deep learning libraries
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import glob
from tqdm import tqdm
from pathlib import Path
import pickle


In [9]:
SEED = 1706

In [13]:
numpy_file_path = 'data/processed/concatenated_features.npy'

if Path.exists(Path(numpy_file_path)):
    print('Loading numpy file...')
    data = np.load(numpy_file_path)
else:
    path = 'data/processed/pd_feature_array'
    all_files = glob.glob(path + "/*.parquet")

    df_li = []
    fail = []

    def check_shape(frame, filename):
        arr = np.array(frame)
        arr = arr.reshape(-1, 5596)
        target = arr[:, -1]
        set_target = set(target)
        for i in set_target:
            assert i in [0, 1], "Error in {}".format(filename)

    for filename in tqdm(all_files):
        frame = pd.read_parquet(filename)
        check_shape(frame, filename)
        df_li.append(frame)

    print("Concatenating dataframes")
    df = pd.concat(df_li)

    print("Deleting df_li")
    del(df_li)

    print(f'Filling {df[df.isnull().any(1)].size} NaNs')
    df.fillna(0, inplace=True)
    data = np.array(df).reshape(-1, 5596)
    np.save(numpy_file_path, )

Loading numpy file...


In [14]:
print("Delete df after getting array")
try:
    del(df)
except:
    pass

Delete df after getting array


In [15]:
# split into training and test set
x_train, x_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.2, random_state=SEED, shuffle=True)

# upsample minority class
x_minority = x_train[y_train == 0]
y_minority = y_train[y_train == 0]
x_majority = x_train[y_train == 1]
y_majority = y_train[y_train == 1]

x_majority_subsample, y_majority_subsample = resample(x_majority, y_majority, replace=False, n_samples=x_minority.shape[0], random_state=SEED)

x_train_balanced = np.concatenate((x_majority_subsample, x_minority))
y_train_balanced = np.concatenate((y_majority_subsample, y_minority))


In [17]:
model_search = RandomForestClassifier(random_state=SEED)
param_grid = {'n_estimators': [100, 200, 500, 1000],
              'max_depth': [6, 8, 10, 12, 14],
              'max_features': ['auto', 'sqrt', 'log2'],
              'criterion': ['gini', 'entropy']}

CV_model = GridSearchCV(estimator=model_search, param_grid=param_grid, cv=5, n_jobs=-1, verbose=100)
CV_model.fit(x_train_balanced, y_train_balanced)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 3/5; 2/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=200
[CV 3/5; 3/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=500
[CV 2/5; 10/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=200
[CV 5/5; 4/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=1000
[CV 3/5; 15/120] START criterion=gini, max_depth=8, max_features=auto, n_estimators=500
[CV 5/5; 35/120] START criterion=gini, max_depth=10, max_features=log2, n_estimators=500
[CV 3/5; 4/120] START criterion=gini, max_depth=6, max_features=auto, n_estimators=1000
[CV 1/5; 10/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=200
[CV 1/5; 11/120] START criterion=gini, max_depth=6, max_features=log2, n_estimators=500
[CV 2/5; 18/120] START criterion=gini, max_depth=8, max_features=sqrt, n_estimators=200
[CV 4/5; 1/120] START criterion=gini, max_depth=6, max_fea

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1706),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [6, 8, 10, 12, 14],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 500, 1000]},
             verbose=100)

In [21]:
pickle.dump(CV_model, open('models/RandomForest_GridSearch.pkl', 'wb'))

In [None]:
print(confusion_matrix(y_test, y_pred ))
print(roc_curve(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))