In [1]:
# import deep learning libraries
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
import glob
from tqdm import tqdm


In [2]:
SEED = 1706

In [3]:
path = 'data/processed/pd_feature_array'
all_files = glob.glob(path + "/*.parquet")

df_li = []
fail = []

def check_shape(frame, filename):
    arr = np.array(frame)
    arr = arr.reshape(-1, 5596)
    target = arr[:, -1]
    set_target = set(target)
    for i in set_target:
        assert i in [0, 1], "Error in {}".format(filename)

for filename in tqdm(all_files):
    frame = pd.read_parquet(filename)
    check_shape(frame, filename)
    df_li.append(frame)

print("Concatenating dataframes")
df = pd.concat(df_li)

print("Deleting df_li")
del(df_li)

print(f'Filling {df[df.isnull().any(1)].size} NaNs')
df.fillna(0, inplace=True)

100%|██████████| 225/225 [00:51<00:00,  4.41it/s]


Concatenating dataframes
Deleting df_li
Filling 9060 NaNs


In [4]:
arr = np.array(df).reshape(-1, 5596)

print("Delete df after getting array")
del(df)

Delete df after getting array


In [5]:
# split into training and test set
x_train, x_test, y_train, y_test = train_test_split(arr[:, :-1], arr[:, -1], test_size=0.2, random_state=SEED)

# upsample minority class
x_minority = x_train[y_train == 0]
y_minority = y_train[y_train == 0]
x_majority = x_train[y_train == 1]
y_majority = y_train[y_train == 1]

x_minority_oversampled, y_minority_oversampled = resample(x_minority, y_minority, replace=True, n_samples=len(x_majority), random_state=SEED)

x_train_balanced = np.concatenate((x_majority, x_minority_oversampled))
y_train_balanced = np.concatenate((y_majority, y_minority_oversampled))


In [6]:
model_search = RandomForestClassifier(random_state=SEED)
param_grid = {'n_estimators': [10, 50, 100, 200, 500, 1000],
              'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
              'max_features': ['auto', 'sqrt', 'log2'],
              'criterion': ['gini', 'entropy']}

CV_model = GridSearchCV(estimator=model_search, param_grid=param_grid, cv=5, n_jobs=-1, verbose=5)
CV_model.fit(x_train_balanced, y_train_balanced)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=10;, score=0.764 total time=  35.1s
[CV 5/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=10;, score=0.763 total time=  44.8s
[CV 4/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=10;, score=0.756 total time=  44.8s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=10;, score=0.763 total time=  44.8s
[CV 3/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=10;, score=0.765 total time=  44.8s
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=50;, score=0.766 total time= 1.4min
[CV 5/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=50;, score=0.772 total time= 1.5min
[CV 4/5] END criterion=gini, max_depth=2, max_features=auto, n_estimators=50;, score=0.766 total time= 1.5min
[CV 3/5] END criterion=gini, max_depth=2, max_features=a

KeyboardInterrupt: 