In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score, f1_score, make_scorer,confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression



# Load the cleaned training table.
data = pd.read_csv('clean/train.csv')
# NOTE(review): filling NaN with NaN looks like a no-op; kept so behavior is unchanged.
data = data.fillna(np.nan)
# Replace every non-alphanumeric character in each column name with "_".
data.columns = [
    "".join(ch if ch.isalnum() else "_" for ch in str(name))
    for name in data.columns
]

In [None]:
# Treat +/- infinity as missing so they are counted and imputed like NaN.
data = data.replace([np.inf, -np.inf], np.nan)

In [None]:
data.shape

In [None]:
data["TARGET"].value_counts()

### Handle missing values

In [None]:
# Fraction of missing entries per column (mean of the boolean null mask).
missing = data.isnull().mean(axis=0)

In [None]:
# Columns whose share of missing values exceeds 20% are dropped.
# (Author's note: 163 columns were dropped on the original data.)
drop_col = missing.index[missing > 0.2].tolist()

In [None]:
# New frame without the high-missingness columns; `data` itself is left untouched.
data_drop = data.drop(columns=drop_col)

In [None]:
# Recompute the per-column missing fraction on the reduced frame and
# list the columns that still contain at least one missing value.
missing = data_drop.isnull().mean(axis=0)
missing_col = missing.index[missing > 0].tolist()

In [None]:
# Split the columns that still have missing values into 0/1 flags and everything else.
binary_col = []
continuous_col = []
for col in missing_col:
    # Every column in missing_col contains NaN, so NaN has to be excluded before
    # comparing the observed values with {0, 1}; otherwise no column can ever match.
    observed_values = set(data_drop[col].dropna().unique())
    if observed_values == {0, 1}:
        binary_col.append(col)
    else:
        continuous_col.append(col)

# Binary flags are imputed with their most frequent value so they stay in {0, 1}
# (a median could produce 0.5). Only continuous_col is handled by the median step.
for col in binary_col:
    data_drop[col] = data_drop[col].fillna(data_drop[col].mode().iloc[0])

In [None]:
len(binary_col)

In [None]:
# Impute every column listed in continuous_col with that column's own median.
# NOTE(review): the medians are computed over all rows, i.e. before the train/test split.
def fill_na(col):
    """Return `col` with its missing entries replaced by the column median."""
    return col.fillna(col.median())

data_drop[continuous_col] = data_drop[continuous_col].apply(fill_na, axis=0)

In [None]:
# Release the full raw frame and force a garbage-collection pass;
# the cells shown below only work with data_drop.
del data
gc.collect()

In [None]:
data_drop.shape

### Prepare features

In [None]:
# Label vector: the TARGET column of the cleaned frame.
Y = data_drop['TARGET']

### 1. Use all features we have

In [None]:
# Feature matrix: every column from position 2 onwards.
# NOTE(review): assumes the first two columns are non-features (presumably an id
# and TARGET) — confirm the column order, otherwise the label leaks into X.
X_origin = data_drop[list(data_drop.columns[2:])]

In [None]:
# 80/20 hold-out split with a fixed seed (no stratification on the label is requested).
x_train_origin, x_test_origin, y_train_origin, y_test_origin = train_test_split(X_origin,Y,test_size=0.2,random_state=0)

### 2. Dimension reduction: PCA

In [None]:
# Min-max scale each column using ranges learned from the training rows only,
# so statistics of the held-out rows do not leak into preprocessing.
# Training rows land in [0, 1]; held-out rows may fall slightly outside that range.
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
scale.fit(x_train_origin)
# Every row is transformed so the later PCA split can be taken from one matrix.
X_scale = scale.transform(X_origin)

In [None]:
from sklearn.decomposition import PCA

# Learn the principal axes from the training rows only (scaled with the fitted
# `scale`), so the held-out rows do not influence the projection.
# x_train_origin comes from train_test_split(..., test_size=0.2, random_state=0),
# the same arguments used later to split the PCA features.
pca = PCA()
pca.fit(scale.transform(x_train_origin))

In [None]:
var = pca.explained_variance_ratio_
# Smallest number of leading components whose cumulative explained-variance
# ratio reaches 99%. cumulative_var[k-1] is the ratio explained by the first k
# components, hence the +1 after locating the first entry >= 0.99.
cumulative_var = np.cumsum(var)
n_components = int(np.searchsorted(cumulative_var, 0.99, side='left')) + 1
# Clamp to the number of available components: covers the case where all of them
# are needed (or rounding keeps the cumulative sum marginally below the threshold).
n_components = min(n_components, len(var))
print(n_components)

In [None]:
np.sum(np.sqrt(pca.explained_variance_))

In [None]:
# Project every scaled row onto the fitted principal components.
X_pca = pca.transform(X_scale)

In [None]:
# Keep only the first n_components columns of the projection.
X_pca = X_pca[:,:n_components]

In [None]:
# Hold-out split of the PCA features, using the same Y, test_size and random_state
# as the split of X_origin.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca,Y,test_size=0.2,random_state=0)

### 3. Oversampling: SMOTE with all features

In [None]:
from imblearn.over_sampling import SMOTE
# Target class ratio after resampling: (# minority) / (# majority) = 4/6.
# Only the training split is resampled; the test split is not passed to SMOTE.
oversample = SMOTE(sampling_strategy=4/6, k_neighbors=5, random_state=0)
x_train_origin_os, y_train_origin_os = oversample.fit_resample(x_train_origin, y_train_origin)

### 4. Oversampling: SMOTE with PCA features

In [None]:
# Same SMOTE settings as for the original features, applied to the PCA training split.
oversample = SMOTE(sampling_strategy=4/6, k_neighbors=5, random_state=0)
x_train_pca_os, y_train_pca_os = oversample.fit_resample(x_train_pca, y_train_pca)

### Modeling  

Four versions of (X,Y):  
1) x_train_origin, y_train_origin  
2) x_train_pca, y_train_pca  
3) x_train_origin_os, y_train_origin_os  
4) x_train_pca_os, y_train_pca_os  

Use 3-fold CV (GridSearchCV) to tune hyperparameters  

Metrics: AUC score

In [None]:
### Hyper Parameter Tuning Codes
# Hyper parameter tuning, input model, output the fitted grid search
def tuning(x1_data, y, model_name, cv=3, scoring='roc_auc'):
    """Grid-search one of the supported classifiers and return the fitted search.

    Parameters
    ----------
    x1_data
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y
        Training labels, passed unchanged to ``GridSearchCV.fit``.
    model_name : str
        One of 'Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic'.
    cv : int, default 3
        Number of cross-validation folds, used for every model.
    scoring : str or callable, default 'roc_auc'
        Metric used to rank candidates, used for every model. The built-in
        'roc_auc' scorer is used instead of
        ``make_scorer(roc_auc_score, needs_proba=True)`` because the
        ``needs_proba`` argument is deprecated in recent scikit-learn releases.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_estimator_``, ``cv_results_``,
        ``predict`` and ``predict_proba`` are available on it).

    Raises
    ------
    AssertionError
        If ``model_name`` is not one of the supported names.
    """
    assert model_name in ['Random Forest','Decision Tree','Naive Bayes','Logistic'], \
        "unsupported model_name: %r" % (model_name,)

    # NOTE(review): class_weight dict keys are class labels. {2:1} names a label 2;
    # confirm TARGET really contains it — if the labels are only 0/1 those
    # candidates cannot be fitted and just produce NaN scores in the grid.
    if model_name == "Decision Tree":
        estimator = DecisionTreeClassifier(random_state=0)
        hyperparameters = dict(
            max_depth=[3,5,10],
            min_samples_split=[2,4,6],
            min_samples_leaf=[1,2,4],
            class_weight=[{1:1}, {1:6}, {2:1}],
        )
    elif model_name == 'Random Forest':
        estimator = RandomForestClassifier(random_state=0)
        hyperparameters = dict(
            n_estimators=[10,50,100],
            max_depth=[3,5,20],
            min_samples_split=[2, 4, 6],
            min_samples_leaf=[1, 2, 4],
            class_weight=[{1:1}, {1:6}, {2:1}],
        )
    elif model_name == "Naive Bayes":
        # Nothing to tune: the empty grid yields a single candidate, which is
        # cross-validated with the same folds and metric as the other models.
        estimator = GaussianNB()
        hyperparameters = dict()
    else:
        estimator = LogisticRegression(solver='liblinear',random_state=0)
        hyperparameters = dict(
            C=np.linspace(0.01, 10, 5),
            class_weight=[{2:1}, {1:1}, {1:4}, {1:6}],
            penalty=['l1','l2'],
        )

    # One shared search configuration keeps the four model families comparable.
    grid = GridSearchCV(estimator, hyperparameters, scoring=scoring, cv=cv, n_jobs=-1, verbose=10)
    return grid.fit(x1_data, y)

                          
                          
                          

In [None]:
#Testing scores, confusion matrix
def Testing(y_test,y_pred,y_pred_pr):
    """Print F1 score, confusion matrix and ROC AUC for one set of predictions.

    y_test is the ground truth, y_pred the hard class predictions (used for F1 and
    the confusion matrix) and y_pred_pr the scores passed to roc_auc_score.
    Nothing is returned; the three results are printed in that order.
    """
    # Metrics computed from hard labels.
    for label_metric in (f1_score, confusion_matrix):
        print(label_metric(y_test, y_pred))
    # Ranking metric computed from the predicted scores.
    print(roc_auc_score(y_test, y_pred_pr))
    
#print result, draw bar graph for feature importance for non-pca method
def print_result(model,x_test):
    """Print a fitted search object, plot its top feature importances, return its CV table.

    model must expose best_estimator_.feature_importances_ and cv_results_;
    x_test must expose .columns, which label the importances. The 15 largest
    importances are drawn as a horizontal bar chart. Returns cv_results_ as a
    DataFrame.
    """
    print(model)
    importances = model.best_estimator_.feature_importances_
    top_features = pd.Series(importances, index=x_test.columns).nlargest(15)
    top_features.plot(kind='barh')
    cv_table = pd.DataFrame(model.cv_results_)
    return cv_table

## Decision Tree

#### Decision Tree 1

In [None]:
# Variant 1 (all original features): tune on the training split, then print F1,
# confusion matrix and AUC on the held-out split and plot the top importances.
dt=tuning(x_train_origin,y_train_origin,"Decision Tree")
y_dt_1_pr=dt.predict_proba(x_test_origin)[:,1]
y_dt_1=dt.predict(x_test_origin)
Testing(y_test_origin,y_dt_1,y_dt_1_pr)
cv_dt=print_result(dt,x_test_origin)

#### Decision Tree 2

In [None]:
# Variant 2 (PCA features): same evaluation; only the CV table is kept, no
# feature-importance plot is drawn for the PCA inputs.
dt2=tuning(x_train_pca,y_train_pca,"Decision Tree")
y_dt_2_pr=dt2.predict_proba(x_test_pca)[:,1]
y_dt_2=dt2.predict(x_test_pca)
Testing(y_test_pca,y_dt_2,y_dt_2_pr)
cv_dt2=pd.DataFrame(dt2.cv_results_)

#### Decision Tree 3

In [None]:
# Variant 3 (SMOTE on all original features): trained on the oversampled split,
# evaluated on the untouched original test split.
dt3=tuning(x_train_origin_os,y_train_origin_os,"Decision Tree")
y_dt_3_pr=dt3.predict_proba(x_test_origin)[:,1]
y_dt_3=dt3.predict(x_test_origin)
Testing(y_test_origin,y_dt_3,y_dt_3_pr)
cv_dt3=print_result(dt3,x_test_origin)

#### Decision Tree 4

In [None]:
# Variant 4 (SMOTE on PCA features): trained on the oversampled PCA split,
# evaluated on the untouched PCA test split.
dt4=tuning(x_train_pca_os,y_train_pca_os,"Decision Tree")
y_dt_4_pr=dt4.predict_proba(x_test_pca)[:,1]
y_dt_4=dt4.predict(x_test_pca)
Testing(y_test_pca,y_dt_4,y_dt_4_pr)
cv_dt4=pd.DataFrame(dt4.cv_results_)

## Naive Bayes

#### Naive Bayes 1

In [None]:
# Variant 1 (all original features): fit Gaussian Naive Bayes through the shared
# tuning helper and report F1, confusion matrix and AUC on the held-out split.
nb=tuning(x_train_origin,y_train_origin,"Naive Bayes")
y_nb_1_pr=nb.predict_proba(x_test_origin)[:,1]
y_nb_1=nb.predict(x_test_origin)
Testing(y_test_origin,y_nb_1,y_nb_1_pr)
cv_nb=pd.DataFrame(nb.cv_results_)

#### Naive Bayes 2

In [None]:
# Variant 2 (PCA features).
nb2=tuning(x_train_pca,y_train_pca,"Naive Bayes")
y_nb_2_pr=nb2.predict_proba(x_test_pca)[:,1]
y_nb_2=nb2.predict(x_test_pca)
Testing(y_test_pca,y_nb_2,y_nb_2_pr)
cv_nb2=pd.DataFrame(nb2.cv_results_)

#### Naive Bayes 3

In [None]:
# Variant 3 (SMOTE on all original features); evaluated on the original test split.
nb3=tuning(x_train_origin_os,y_train_origin_os,"Naive Bayes")
y_nb_3_pr=nb3.predict_proba(x_test_origin)[:,1]
y_nb_3=nb3.predict(x_test_origin)
Testing(y_test_origin,y_nb_3,y_nb_3_pr)
cv_nb3=pd.DataFrame(nb3.cv_results_)

#### Naive Bayes 4

In [None]:
# Variant 4 (SMOTE on PCA features); evaluated on the PCA test split.
nb4=tuning(x_train_pca_os,y_train_pca_os,"Naive Bayes")
y_nb_4_pr=nb4.predict_proba(x_test_pca)[:,1]
y_nb_4=nb4.predict(x_test_pca)
Testing(y_test_pca,y_nb_4,y_nb_4_pr)
cv_nb4=pd.DataFrame(nb4.cv_results_)

## Random Forest

#### Random Forest 1

In [None]:
# Variant 1 (all original features): tune the forest, report F1, confusion matrix
# and AUC on the held-out split, and plot the top feature importances.
rf=tuning(x_train_origin,y_train_origin,"Random Forest")
y_rf_1_pr=rf.predict_proba(x_test_origin)[:,1]
y_rf_1=rf.predict(x_test_origin)
Testing(y_test_origin,y_rf_1,y_rf_1_pr)
cv_rf=print_result(rf,x_test_origin)

#### Random Forest 2

In [None]:
# Variant 2 (PCA features); only the CV table is kept.
rf2=tuning(x_train_pca,y_train_pca,"Random Forest")
y_rf_2_pr=rf2.predict_proba(x_test_pca)[:,1]
y_rf_2=rf2.predict(x_test_pca)
Testing(y_test_pca,y_rf_2,y_rf_2_pr)
cv_rf2=pd.DataFrame(rf2.cv_results_)

#### Random Forest 3

In [None]:
# Variant 3 (SMOTE on all original features).
# The search has to run in this cell: rf3 is not defined anywhere else, so with
# the call commented out the cell fails with NameError on a fresh kernel.
rf3=tuning(x_train_origin_os,y_train_origin_os,"Random Forest")
y_rf_3_pr=rf3.predict_proba(x_test_origin)[:,1]
y_rf_3=rf3.predict(x_test_origin)
Testing(y_test_origin,y_rf_3,y_rf_3_pr)
cv_rf3=print_result(rf3,x_test_origin)

#### Random Forest 4

In [None]:

# Variant 4 (SMOTE on PCA features); evaluated on the PCA test split.
rf4=tuning(x_train_pca_os,y_train_pca_os,"Random Forest")
y_rf_4_pr=rf4.predict_proba(x_test_pca)[:,1]
y_rf_4=rf4.predict(x_test_pca)
Testing(y_test_pca,y_rf_4,y_rf_4_pr)
cv_rf4=pd.DataFrame(rf4.cv_results_)

## Logistic Regression

In [None]:
# Scale Data for Logistic Regression
# The min-max ranges are learned from the original training split only and then
# reused for the test split and for the SMOTE-oversampled training split.
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(x_train_origin)

x_train_1, x_test_1, x_train_2, x_test_2 = (
    scaler.transform(features)
    for features in (x_train_origin, x_test_origin, x_train_origin_os, x_test_origin)
)

#### Logistic Regression 1

In [None]:
# Variant 1 (all original features, min-max scaled).
# Training, predict and predict_proba must all see identically scaled inputs:
# fit on x_train_1 and score on x_test_1, otherwise the reported AUC comes from
# feeding scaled rows to a model trained on unscaled ones.
clf_logit_1 = tuning(x_train_1, y_train_origin, 'Logistic')
display(clf_logit_1)
pred_logit_1 = clf_logit_1.predict(x_test_1)
pred_logit_1_pr=clf_logit_1.predict_proba(x_test_1)
Testing(y_test_origin, pred_logit_1, pred_logit_1_pr[:,1])

#### Logistic Regression 2

In [None]:
# Variant 2 (PCA features): tune, show the fitted search, then report F1,
# confusion matrix and AUC (positive-class column of predict_proba).
clf_logit_2 = tuning(x_train_pca, y_train_pca, 'Logistic')
display(clf_logit_2)
pred_logit_2 = clf_logit_2.predict(x_test_pca)
pred_logit_2_pr=clf_logit_2.predict_proba(x_test_pca)
Testing(y_test_pca, pred_logit_2, pred_logit_2_pr[:,1])

#### Logistic Regression 3

In [None]:
# Variant 3 (SMOTE on all original features, min-max scaled): trained on
# x_train_2, evaluated on the scaled original test split x_test_2.
clf_logit_3 = tuning(x_train_2, y_train_origin_os, 'Logistic')
display(clf_logit_3)
pred_logit_3 = clf_logit_3.predict(x_test_2)
pred_logit_3_pr=clf_logit_3.predict_proba(x_test_2)
Testing(y_test_origin, pred_logit_3, pred_logit_3_pr[:,1])

#### Logistic Regression 4

In [None]:
# Variant 4 (SMOTE on PCA features); evaluated on the PCA test split.
clf_logit_4 = tuning(x_train_pca_os, y_train_pca_os, 'Logistic')
display(clf_logit_4)
pred_logit_4 = clf_logit_4.predict(x_test_pca)
pred_logit_4_pr=clf_logit_4.predict_proba(x_test_pca)
Testing(y_test_pca, pred_logit_4, pred_logit_4_pr[:,1])

## Neural network model

In [None]:
# Fit a small multi-layer perceptron (hidden layers of 10, 3 and 3 units) on the
# PCA training split; no hyperparameter search is run for this model.
neural = MLPClassifier(
    hidden_layer_sizes=(10,3,3),
    solver='adam',
    max_iter=100,
    random_state=0,
    verbose=10,
)
nu = neural.fit(x_train_pca, y_train_pca)

# Hard labels and positive-class probabilities on the PCA test split.
y_pred_neural = nu.predict(x_test_pca)
y_pred_neural_pr = nu.predict_proba(x_test_pca)[:,1]

# Prints F1, confusion matrix and ROC AUC, in that order.
Testing(y_test_pca, y_pred_neural, y_pred_neural_pr)