In [44]:
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.neural_network import MLPClassifier

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.linear_model import SGDClassifier

In [5]:
from lightgbm import LGBMClassifier

In [6]:
from xgboost import XGBClassifier

# Load Data

In [7]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential 
# from tensorflow.keras.layers import Dense, Activation, LeakyReLU

# scaler = StandardScaler()
# scaler.fit(x_tr)
# x_tr = scaler.transform(x_tr)
# x_val = scaler.transform(x_val)

# input_size = 1200
# hidden1 = 64
# hidden2 = 32
# classes = 1

# model = Sequential()     
# model.add(Dense(hidden1, input_dim=input_size))
# model.add(LeakyReLU(alpha=0.05))
# model.add(Dense(hidden2))
# model.add(LeakyReLU(alpha=0.05))
# model.add(Dense(classes, activation='sigmoid'))

# # Compilation
# model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
#               metrics=['accuracy'], 
#               optimizer='adam')

# model.summary()

# batch_size = 32
# epch = 20

# history = model.fit(x_data, y_train, 
#                     validation_data=(x_val, y_val),
#                     batch_size=batch_size,
#                     epochs=epch,
#                     verbose=1)

# pdf = pd.DataFrame.from_dict(history.history)

# plt.plot(pdf["loss"], label="train_loss")
# plt.plot(pdf["val_loss"], label="val_loss")
# plt.legend()
# plt.show()

# plt.plot(pdf["accuracy"], label="train_accuracy")
# plt.plot(pdf["val_accuracy"], label="val_accuracy")
# plt.legend()
# plt.show()

In [8]:
# Relative path to the .npz archive that holds both the train and test arrays.
fname = "./../data.npz"

In [9]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the archive is untrusted; only load files you trust.
data = np.load(fname, allow_pickle=True)

In [10]:
# Array stored under the "train" key of the archive.
train_data = data["train"]

In [11]:
# Array stored under the "test" key of the archive.
test_data = data["test"]

In [12]:
# Column 0 of the training array is used as the model inputs, column 1 as the labels.
x_train = train_data[:, 0]
y_train = train_data[:, 1]

In [13]:
# Flatten each training sample into a 1-D vector so the samples can later be
# stacked into one 2-D feature matrix.
x_data = [sample.flatten() for sample in x_train]

In [14]:
# Stack the flattened samples into a single array, one row per sample.
x_data = np.array(x_data)

In [15]:
x_data.shape

(5000, 1200)

In [16]:
# Cast the training features to float32.
x_data = np.asarray(x_data).astype('float32')

In [17]:
# Cast the training labels to float32.
y_train = np.asarray(y_train).astype('float32')

## Load Test Data

In [18]:
# Flatten each test sample into a 1-D vector, mirroring the training-data preparation.
x_test = [sample.flatten() for sample in test_data]
    


In [19]:
# Stack the flattened test samples into a single array, one row per sample.
x_test = np.array(x_test)

In [20]:
# Cast the test features to float32 so they match the training features.
x_test = np.asarray(x_test).astype('float32')

## Instantiate Models

In [21]:
# Neural-network base learner with one hidden layer of 512 units.
# early_stopping=True makes scikit-learn hold out part of each fit's training
# data as a validation set and stop when the validation score stops improving.
# NOTE(review): learning_rate="adaptive" is only used with solver="sgd"; the
# solver is left at its default here, so this setting likely has no effect — confirm.
mlp_model = MLPClassifier(hidden_layer_sizes=512, 
                          learning_rate="adaptive", 
                          early_stopping=True,
                          random_state=42)

In [22]:
# Random-forest base learner: 300 trees, each limited to depth 10, fixed seed.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)

In [23]:
# Linear base learner trained with SGD. The logistic loss is what makes
# predict_proba available, which cv_model relies on.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" and no
# longer accept "log" — confirm against the installed version.
sgd_model = SGDClassifier(loss="log", penalty="elasticnet", random_state=42)

In [24]:
# LightGBM base learner: 300 boosting rounds at a small learning rate, with
# L1 (reg_alpha) and L2 (reg_lambda) regularisation and a fixed seed.
lgb_model = LGBMClassifier(learning_rate=0.01, n_estimators=300, reg_alpha=1.0, reg_lambda=0.5, random_state=42)

In [25]:
# XGBoost base learner with library-default hyperparameters.
# NOTE(review): unlike the other base learners, no random_state is passed here.
xgb_model = XGBClassifier()

## Create Cross Validation Loop

In [47]:
def cv_model(model, x_train, y_train, x_test, kfolds=5):
    """Cross-validate ``model`` and build stacking features from it.

    The model is re-fitted in place on each training split of a shuffled,
    stratified K-fold (fixed seed 88). For each fold the held-out split is
    scored, and the fitted model also predicts ``x_test``.

    The training meta-feature is built from out-of-fold predictions: each row
    of ``x_train`` gets the probability produced by the one fold model that
    did NOT see that row during fitting. Predicting the full training set with
    every fold model would mostly return in-sample predictions and leak the
    target into the meta-learner's features.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the probabilities is taken as the positive-class score. It is
        mutated: after the call it holds the fit from the last fold.
    x_train : ndarray, 2-D
        Training features, indexed as ``x_train[idx, :]``.
    y_train : ndarray, 1-D
        Training labels, indexed as ``y_train[idx]``.
    x_test : ndarray, 2-D
        Test features to predict with each fold model.
    kfolds : int, default 5
        Number of stratified folds.

    Returns
    -------
    meta_train_data : ndarray of shape (n_train, 1)
        Out-of-fold positive-class probabilities for the training rows.
    meta_test_data : ndarray of shape (n_test, 1)
        Test probabilities averaged over the fold models.
    avg_acc, avg_f1, avg_roc : float
        Validation accuracy, F1 and ROC-AUC averaged over folds. Accuracy and
        F1 use a 0.5 probability threshold.
    elapsed : float
        Wall-clock seconds spent in this call.
    """
    t1 = time.time()
    accuracy = []
    f1score = []
    roc_auc = []

    # One slot per training row; each row is written exactly once, by the fold
    # in which it was held out.
    oof_pred = np.empty(x_train.shape[0])
    # One column of test predictions per fold model, averaged at the end.
    test_mat = np.empty((x_test.shape[0], kfolds))

    kf = StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=88)
    for fold, (train_idx, val_idx) in enumerate(kf.split(x_train, y_train)):
        # get fold data
        xt, xv = x_train[train_idx, :], x_train[val_idx, :]
        yt, yv = y_train[train_idx], y_train[val_idx]

        # train model
        model.fit(xt, yt)

        # evaluate model on the held-out split
        y_pred = model.predict_proba(xv)[:, 1]
        y_out = np.where(y_pred > 0.5, 1, 0)

        roc_auc.append(roc_auc_score(yv, y_pred))
        f1score.append(f1_score(yv, y_out))
        accuracy.append(accuracy_score(yv, y_out))

        # create meta learner data: held-out predictions only for training
        # rows, plus this fold model's predictions for the test set
        oof_pred[val_idx] = y_pred
        test_mat[:, fold] = model.predict_proba(x_test)[:, 1]

    meta_train_data = oof_pred.reshape(-1, 1)
    meta_test_data = np.mean(test_mat, axis=1, keepdims=True)
    avg_acc = np.mean(accuracy)
    avg_f1 = np.mean(f1score)
    avg_roc = np.mean(roc_auc)

    t2 = time.time()

    return meta_train_data, meta_test_data, avg_acc, avg_f1, avg_roc, t2 - t1

In [39]:
# Base learners keyed by a short name. Insertion order matters: the loop that
# fills the meta matrices assigns columns in this iteration order.
models = {
    "mlp": mlp_model,
    "rf": rf_model,
    "sgd": sgd_model,
    "lgb": lgb_model,
    "xgb": xgb_model
}

In [40]:
# Meta-learner training features: one row per training sample, one column per
# base model. np.empty leaves the contents uninitialised until they are filled.
meta_train_data = np.empty((x_data.shape[0], len(models)))

In [41]:
# Meta-learner test features: one row per test sample, one column per base
# model. np.empty leaves the contents uninitialised until they are filled.
meta_test_data = np.empty((x_test.shape[0], len(models)))

In [51]:
# Cross-validate every base model; column `col` of each meta matrix receives
# that model's predictions, in the iteration order of `models`.
for col, (model_name, model) in enumerate(models.items()):
    mtr, mts, acc, f1, roc, dt = cv_model(model, x_data, y_train, x_test)

    meta_train_data[:, col] = mtr[:, 0]
    meta_test_data[:, col] = mts[:, 0]

    # Per-model report of the fold-averaged validation metrics and run time.
    print("Model Name: ", model_name)
    print("Accuracy: ", acc)
    print("F1-Score: ", f1)
    print("ROC-AUC: ", roc)
    print("Time Taken: ", dt)
    print("--"*40)

Model Name:  mlp
Accuracy:  0.9818
F1-Score:  0.9693646906033042
ROC-AUC:  0.98866
Time Taken:  13.126729965209961
--------------------------------------------------------------------------------
Model Name:  rf
Accuracy:  0.9846
F1-Score:  0.9739492686283591
ROC-AUC:  0.9953238095238095
Time Taken:  10.770376920700073
--------------------------------------------------------------------------------
Model Name:  sgd
Accuracy:  0.9763999999999999
F1-Score:  0.9595633209178972
ROC-AUC:  0.9700014285714286
Time Taken:  3.7644460201263428
--------------------------------------------------------------------------------
Model Name:  lgb
Accuracy:  0.982
F1-Score:  0.9695451761334475
ROC-AUC:  0.993535238095238
Time Taken:  10.30993103981018
--------------------------------------------------------------------------------
















Model Name:  xgb
Accuracy:  0.9832000000000001
F1-Score:  0.9716348139846023
ROC-AUC:  0.9945304761904762
Time Taken:  12.838800191879272
--------------------------------------------------------------------------------


## Rough model testing

In [72]:
# dummy_x = x_data[:10, :]
# dummy_y = y_train[:5]

In [73]:
# dummy_y.shape

In [74]:
# mlp_model.fit(x_data, y_train)

In [75]:
# mlp_model.predict_proba(x_test[:5])

In [76]:
# lgb_model.predict(x_test[:5])

## Meta Learner Model

In [52]:
# Second-level (meta) learner, using XGBoost with default hyperparameters.
meta_model = XGBClassifier()

In [53]:
# Train the meta learner on the base models' predictions (one column per model).
meta_model.fit(meta_train_data, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [54]:
# Class predictions from the meta learner for the test set.
y_preds = meta_model.predict(meta_test_data)

In [60]:
# ensemble solution: plain average of the base models' test probabilities.
# NOTE(review): this reassigns y_preds, so the meta_model predictions computed
# in the preceding cell are discarded and never reach the submission.
y_preds = np.mean(meta_test_data, axis=1)

In [61]:
# Convert scores to hard 0/1 labels: values strictly above 0.5 become 1.
y_pred_out = np.where(y_preds > 0.5, 1, 0)

In [62]:
# Load the sample submission to reuse its layout for the real submission.
sdf = pd.read_csv("./../sample_submission.csv")

In [63]:
sdf.head()

Unnamed: 0,label
0,1
1,0
2,0
3,0
4,1


In [64]:
# Overwrite the placeholder labels with the predictions; the lengths must match.
sdf["label"] = y_pred_out

In [65]:
# Write the submission without the DataFrame index; the "Unnamed: 0" column read
# from the sample file is kept as an ordinary column.
# NOTE(review): to_csv does not create missing directories — ./assets must exist.
sdf.to_csv("./assets/submission.csv", index=False)