In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [6]:
naive_bayes_predictions = np.loadtxt("naive_bayes_predictions.txt") 
lightGBM_predictions = np.loadtxt("lightGBM_predictions.txt") 
y_test = pd.read_csv("y_test.csv", header=None, names = ['target'])

# Models Blending:

In [23]:
# Models blending: LightGBM + Gaussian Naive Bayes 
# Calculating the weights

auc = {}
for weight in [x/100 for x in range(0,101)]:
    combined_preds = []
    for i in range(y_test.shape[0]):
        combined_pred = naive_bayes_predictions[i] * weight + lightGBM_predictions[i] * (1-weight)
        combined_preds.append(combined_pred)
    auc[weight] = roc_auc_score(y_test['target'], combined_preds)

In [24]:
optimal_weight = max(auc, key=auc.get)

In [26]:
final_combined_pred = naive_bayes_predictions*optimal_weight + lightGBM_predictions*(1-optimal_weight)

In [27]:
print("CV score: {:<8.5f}".format(roc_auc_score(y_test['target'], final_combined_pred)))

CV score: 0.90084 


In [95]:
np.savetxt('blending_result.txt', final_combined_pred)

# Models Stacking:

In [33]:
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn import model_selection
import numpy as np
from mlxtend.classifier import StackingCVClassifier
from lightgbm import LGBMClassifier

In [58]:
# Use the data with Frequency Count FE.

x_train = pd.read_csv("x_train_FE.csv")
y_train = pd.read_csv("y_train_FE.csv")
x_test = pd.read_csv("x_test_FE.csv")
y_test = pd.read_csv("y_test_FE.csv")

print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

x_train shape:  (100000, 400)
y_train shape:  (100000, 1)
x_test shape:  (100000, 400)
y_test shape:  (100000, 1)


In [59]:
# Scale data
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(x_train)
x_train = pd.DataFrame(scaled_df)
scaled_df = scaler.fit_transform(x_test)
x_test = pd.DataFrame(scaled_df)

In [77]:
# Set up level 1 models
RANDOM_SEED = 16
lgbm = LGBMClassifier(application='binary',
                      boosting='gbdt',
                      learning_rate= 0.01,
                      num_leaves=73,
                      tree_learner='serial',
                      num_threads=0,                      
                      max_depth=23,
                      min_data_in_leaf=5,
                      min_sum_hessian_in_leaf=35.74237455449936,
                      bagging_fraction=0.9476322568516703,
                      bagging_freq=0,
                      feature_fraction=1.0,
                      lambda_l1=2.9721007219556626,
                      lambda_l2=2.1415434246484737,
                      boost_from_average=True,                      
                      metric='auc'
                     )
nb = GaussianNB()

# set up the meta classifier (level 2 model)
from mlxtend.classifier import StackingCVClassifier

np.random.seed(RANDOM_SEED)
lr = LogisticRegression(max_iter=1000, class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
sclf = StackingCVClassifier(classifiers=[nb, lgbm], 
                            use_probas=True,
                            use_features_in_secondary=True,
                            meta_classifier=lr,
                            cv=6)

# Set up K-Fold cross validation and predictions
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

num_folds = 6
folds = KFold(n_splits=num_folds, random_state=16)
test_result = np.zeros(len(y_test))
auc_score = 0


for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("Fold: ", fold_ + 1)
    
    X_train, Y_train = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_valid, Y_valid = x_train.iloc[val_idx], y_train.iloc[val_idx]
    
    sclf.fit(X_train.values, Y_train['0'].values)
    
    Y_pred = sclf.predict_proba(X_valid)
    auc = roc_auc_score(Y_valid, Y_pred[:, 1])
    print(auc)
    auc_score += auc

    preds = sclf.predict_proba(x_test)
    test_result += preds[:, 1]

Fold:  1
0.8796017858649708
Fold:  2
0.8821009840981637
Fold:  3
0.8740916499870713
Fold:  4
0.8774067419095065
Fold:  5
0.8767693248527761
Fold:  6
0.8732832182569688


In [78]:
# print the average AUC across the folds and compute the final results on the test data
auc_score = auc_score / folds.n_splits
print("AUC score: ", auc_score)
test_result = test_result / folds.n_splits

AUC score:  0.8772089508282429


In [80]:
print("CV score: {:<8.5f}".format(roc_auc_score(y_test, test_result)))

CV score: 0.53393 


In [86]:
# Use data without feature engineering FE

In [87]:
data = pd.read_csv("whole_data.csv")

x_train = pd.read_csv("x_train.csv")
y_train = pd.read_csv("y_train.csv", header=None, names = ['target'])
x_test = pd.read_csv("x_test.csv")
y_test = pd.read_csv("y_test.csv", header=None, names = ['target'])

print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

x_train shape:  (100000, 200)
y_train shape:  (100000, 1)
x_test shape:  (100000, 200)
y_test shape:  (100000, 1)


In [88]:
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(x_train)
x_train = pd.DataFrame(scaled_df)
scaled_df = scaler.fit_transform(x_test)
x_test = pd.DataFrame(scaled_df)

In [90]:
# Set up level 1 models.
RANDOM_SEED = 16
lgbm = LGBMClassifier(application='binary',
                      boosting='gbdt',
                      learning_rate= 0.01,
                      num_leaves=73,
                      tree_learner='serial',
                      num_threads=0,                      
                      max_depth=23,
                      min_data_in_leaf=5,
                      min_sum_hessian_in_leaf=35.74237455449936,
                      bagging_fraction=0.9476322568516703,
                      bagging_freq=0,
                      feature_fraction=1.0,
                      lambda_l1=2.9721007219556626,
                      lambda_l2=2.1415434246484737,
                      boost_from_average=True,                      
                      metric='auc'
                     )
nb = GaussianNB()

# set up the meta classifier (level 2 model)
from mlxtend.classifier import StackingCVClassifier

np.random.seed(RANDOM_SEED)
lr = LogisticRegression(max_iter=1000, class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
sclf = StackingCVClassifier(classifiers=[nb, lgbm], 
                            use_probas=True,
                            use_features_in_secondary=True,
                            meta_classifier=lr,
                            cv=6)

# Set up K-Fold cross validation and predictions
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

num_folds = 6
folds = KFold(n_splits=num_folds, random_state=16)
test_result = np.zeros(len(y_test))
auc_score = 0


for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("Fold: ", fold_ + 1)
    
    X_train, Y_train = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_valid, Y_valid = x_train.iloc[val_idx], y_train.iloc[val_idx]
    
    sclf.fit(X_train.values, Y_train['target'].values)
    
    Y_pred = sclf.predict_proba(X_valid)
    auc = roc_auc_score(Y_valid, Y_pred[:, 1])
    print(auc)
    auc_score += auc

    preds = sclf.predict_proba(x_test)
    test_result += preds[:, 1]

Fold:  1
0.8846742376077751
Fold:  2
0.8849005074548123
Fold:  3
0.8781716354303336
Fold:  4
0.882001322964492
Fold:  5
0.8826309227676468
Fold:  6
0.8793455362360649


In [91]:
# print the average AUC across the folds and compute the final results on the test data
auc_score = auc_score / folds.n_splits
print("AUC score: ", auc_score)
test_result = test_result / folds.n_splits

AUC score:  0.8819540270768541


In [92]:
print("CV score: {:<8.5f}".format(roc_auc_score(y_test, test_result)))

CV score: 0.88264 


In [94]:
np.savetxt('stacking_result.txt', test_result)