In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np 
import pandas as pd 
from sklearn.svm import SVC
from time import time  # measure running time
from sklearn.neural_network import MLPClassifier  # neural network
from sklearn.model_selection import train_test_split  # split train test size
from sklearn.naive_bayes import BernoulliNB, GaussianNB  # naive bayesian
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix  # metrics
from sklearn.ensemble import RandomForestClassifier, VotingClassifier  # ensemble methods

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
d_train, d_test = (
    pd.read_csv('../input/ppd-for-xuecheng/PPDtrain-en.csv'),
    pd.read_csv('../input/ppd-for-xuecheng/PPDtest-en.csv'),
)

In [None]:
# Preview the first five rows of the training frame.
d_train.head(n=5)

In [None]:
# feature engineering
# NOTE(review): no features are actually engineered in this cell. Only the
# last expression of a cell is rendered, so the bare `d_train` below is
# evaluated and discarded; just `d_test` is displayed.
d_train
d_test

In [None]:
# drop non-numerical vars
# Collect every column of d_train whose dtype is not numeric and drop those
# columns from both frames.
# `select_dtypes` replaces the comparison against `np.float` / `np.int`:
# those aliases were removed from NumPy (1.24) and raise AttributeError.
# `drop(columns=...)` replaces the positional axis argument, which pandas 2.x
# no longer accepts.
cate_vars = d_train.select_dtypes(exclude='number').columns.tolist()
print(cate_vars)
train = d_train.drop(columns=cate_vars)
test = d_test.drop(columns=cate_vars)

In [None]:
# fillna
# Impute missing values with the per-column median of the same frame
# (train columns use train medians, test columns use test medians).
# The frames are reassigned instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary
# column object, which pandas warns about and which does not write back to
# the frame when Copy-on-Write is enabled.
# The overall share of missing cells in `train` is printed before and after.
print(train.isna().mean().mean())
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))
print(train.isna().mean().mean())

In [None]:
# Separate the identifier and the label from the feature frames.
# DataFrame.pop removes the column from the frame in place and returns it.
Y = 'target'  # name of the label column
id_train, id_test = train.pop('Idx'), test.pop('Idx')
train_Y, test_Y = train.pop(Y), test.pop(Y)

In [None]:
# train valid split
# Hold out 30% of the training rows as a validation set.
# No `random_state` is passed to train_test_split, so reproducibility of the
# split relies on the global NumPy seed set on the next line.
np.random.seed(123)
# NOTE: `train_Y` is rebound here from the full label column to the training
# part of the split, while `train` still holds all rows. Re-running this cell
# on its own therefore pairs the full `train` with the already-split labels.
train_X, valid_X, train_Y, valid_Y = train_test_split(train, train_Y, test_size=0.3)
# Last expression of the cell: renders the validation features.
valid_X

In [None]:
# benchmark
# Baseline that predicts 0 for every row; its confusion matrices are the
# reference point for the models trained further down.
valid_bench = np.zeros(valid_Y.shape)
valid_conf = confusion_matrix(valid_Y, valid_bench)
print(valid_conf)
test_bench = np.zeros(test_Y.shape)
test_conf = confusion_matrix(test_Y, test_bench)
# Print the test matrix here (previously the validation matrix was printed
# a second time and the test matrix was never shown).
print(test_conf)

In [None]:
def roi(conf, ratio=(0.1, -1)):
    """Score a confusion matrix by weighting its first column.

    Takes column 0 of ``conf``, multiplies it element-wise by ``ratio`` and
    returns the mean of the weighted entries (i.e. their sum divided by the
    number of rows, not by the number of samples).

    In this notebook ``conf`` comes from ``sklearn.metrics.confusion_matrix``,
    where column 0 counts the samples predicted as the first class.
    NOTE(review): the default weights presumably encode a gain of 0.1 per
    correct prediction and a loss of 1 per wrong one in that column -- confirm.

    Parameters
    ----------
    conf : array-like, 2-D
        Confusion matrix; only its first column is used.
    ratio : sequence of numbers, optional
        One weight per row of ``conf``. The default is an immutable tuple so
        it cannot be modified between calls.

    Returns
    -------
    numpy.float64
        Mean of ``conf[:, 0] * ratio``.
    """
    # np.asarray lets callers pass plain nested lists as well as ndarrays.
    first_column = np.asarray(conf)[:, 0]
    weights = np.asarray(ratio)
    return np.mean(first_column * weights)
# Report the score of the validation and test confusion matrices, one per line.
print(roi(valid_conf), roi(test_conf), sep='\n')

In [None]:
# Naive Bayes (Bernoulli variant)
# Fit on the training split; `fit` returns the estimator itself.
m = BernoulliNB().fit(train_X, train_Y)

# Evaluate on the validation split: confusion matrix, then its roi score.
valid_pred = m.predict(valid_X)
valid_conf = confusion_matrix(y_true=valid_Y, y_pred=valid_pred)
print(valid_conf)
print(roi(valid_conf))

In [None]:
# voting
# Base estimators combined by the ensemble.
m1 = BernoulliNB()
m2 = GaussianNB()

# init voting class [('label', method)]
# Each label names the estimator it is paired with; the GaussianNB model was
# previously registered under the misleading label 'RF'.
# NOTE(review): with only two voters under voting='hard', a 1-1 split is a
# tie; scikit-learn documents ties as resolved by ascending class-label
# order -- confirm this is acceptable or add a third estimator.
m = VotingClassifier(estimators=[('BernoulliNB', m1),
                                 ('GaussianNB', m2)], voting='hard')

# train and valid
m.fit(train_X, train_Y)
valid_pred = m.predict(valid_X)
valid_conf = confusion_matrix(valid_Y, valid_pred)
print(valid_conf)
print(roi(valid_conf))

# try voting with three and more methods, and find the best combination

In [None]:
# NN: multilayer perceptron with hidden layers of 128, 64 and 32 units
# (try different hidden_layer_sizes).
# Fit on the training split; `fit` returns the estimator itself.
m = MLPClassifier(hidden_layer_sizes=(128, 64, 32)).fit(train_X, train_Y)

# Evaluate on the validation split: confusion matrix, then its roi score.
valid_pred = m.predict(valid_X)
valid_conf = confusion_matrix(y_true=valid_Y, y_pred=valid_pred)
print(valid_conf)
print(roi(valid_conf))

# try voting with three and more methods, and find the best combination

In [None]:
# Resample the training split with SMOTE (imblearn.over_sampling) and keep
# the result under new names so the original split stays available.
# NOTE(review): ADASYN is imported but not used in this cell.
from imblearn.over_sampling import SMOTE, ADASYN
m_resample = SMOTE()
train2_X, train2_Y = m_resample.fit_resample(train_X, train_Y)
# Mean of the resampled labels -- presumably a class-balance check; confirm
# the labels are 0/1.
print(train2_Y.mean())
# Last expression of the cell: renders the resampled feature frame.
train2_X

In [None]:
# NN trained on the resampled data (train2_X / train2_Y): hidden layers of
# 128, 64 and 32 units (try different hidden_layer_sizes).
# `fit` returns the estimator itself.
m = MLPClassifier(hidden_layer_sizes=(128, 64, 32)).fit(train2_X, train2_Y)

# Evaluate on the validation split: confusion matrix, then its roi score.
valid_pred = m.predict(valid_X)
valid_conf = confusion_matrix(y_true=valid_Y, y_pred=valid_pred)
print(valid_conf)
print(roi(valid_conf))

In [None]:
# SVM: support vector classifier with its default hyper-parameters.
# Fit on the training split; `fit` returns the estimator itself.
m = SVC().fit(train_X, train_Y)

# Evaluate on the validation split: confusion matrix, then its roi score.
valid_pred = m.predict(valid_X)
valid_conf = confusion_matrix(y_true=valid_Y, y_pred=valid_pred)
print(valid_conf)
print(roi(valid_conf))

# try voting with three and more methods, and find the best combination