In [84]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


np.random.seed(0);

X = np.genfromtxt("data/X_train.txt",delimiter = None)
#X = X[:, [8,9,10,11]]
Y = np.genfromtxt("data/Y_train.txt",delimiter = None)
test_data = np.genfromtxt("data/X_test.txt",delimiter = None)
# feature extraction
model = LogisticRegression(class_weight = 'balanced')
rfe = RFE(model, 4)
fit = rfe.fit(X, Y)
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)

4
[False False False False False False False False  True  True  True  True
 False False]
[ 2  7  6  4 11 10  9  3  1  1  1  1  5  8]


In [85]:
sc = StandardScaler()
sc1 = StandardScaler()
sc2 = StandardScaler()

X = X[:, [0,8,9,10,11]]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0, stratify=Y)
sc.fit(X_train)
X_train = sc.transform(X_train)
#fit_train = PCA(n_components = 4).fit(X_train)
#X_train = fit_train.transform(X_train)
sc1.fit(X_test)
X_test = sc1.transform(X_test)
#fit_test1 = PCA(n_components = 4).fit(X_test)
#X_test = fit_test1.transform(X_test)
sc2.fit(test_data)
test_data = sc2.transform(test_data)
#fit_test = PCA(n_components = 4).fit(test_data)
#test_data = fit_test.transform(test_data)

In [87]:
from imblearn.combine import SMOTEENN

rus = SMOTEENN(random_state=0)

x_rus1,y_rus1 = rus.fit_sample(X_train,y_train)
mlp1 = MLPClassifier(activation='logistic',solver='lbfgs',hidden_layer_sizes=(350,350),batch_size=8)
mlp1.fit(x_rus1,y_rus1)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size=8, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(350, 350), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [88]:
predictions = mlp1.predict(X_test)
print(confusion_matrix(y_test,predictions))

[[20076 12779]
 [ 6303 10842]]


In [89]:
print(classification_report(y_test,predictions))
print(roc_auc_score(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.76      0.61      0.68     32855
        1.0       0.46      0.63      0.53     17145

avg / total       0.66      0.62      0.63     50000

0.621709750138


In [90]:
predictions1 = mlp1.predict_proba(test_data[:,[0,8,9,10,11]])
# Create the data for submission by taking the P(Y=1) column from probs and just add a running index as the first column.
Y_sub1 = np.vstack([np.arange(test_data.shape[0]), predictions1[:, 1]]).T

# We specify the header (ID, Prob1) and also specify the comments as '' so the header won't be commented out with
# the # sign.
np.savetxt('data/Y_sub1.txt', Y_sub1, '%d, %.5f', header='ID,Prob1', comments='', delimiter=',')

In [91]:
Y_train = y_train
Y_train[y_train<1] = -1;
x_rus2,y_rus2 = rus.fit_sample(X_train,Y_train)
mlp2 = MLPClassifier(activation='tanh',solver='lbfgs',hidden_layer_sizes=(350,350),batch_size=8)
mlp2.fit(x_rus2,y_rus2)

MLPClassifier(activation='tanh', alpha=0.0001, batch_size=8, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(350, 350), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [92]:
Y_test = y_test
Y_test[y_test<1] = -1;
predictions = mlp2.predict(X_test)
print(confusion_matrix(Y_test,predictions))

[[20217 12638]
 [ 6138 11007]]


In [93]:
print(classification_report(Y_test,predictions))
print(roc_auc_score(Y_test,predictions))

             precision    recall  f1-score   support

       -1.0       0.77      0.62      0.68     32855
        1.0       0.47      0.64      0.54     17145

avg / total       0.66      0.62      0.63     50000

0.628667440767


In [94]:
predictions2 = mlp2.predict_proba(test_data[:,[0,8,9,10,11]])
# Create the data for submission by taking the P(Y=1) column from probs and just add a running index as the first column.
Y_sub2 = np.vstack([np.arange(test_data.shape[0]), predictions2[:, 1]]).T

# We specify the header (ID, Prob1) and also specify the comments as '' so the header won't be commented out with
# the # sign.
np.savetxt('data/Y_sub2.txt', Y_sub2, '%d, %.5f', header='ID,Prob1', comments='', delimiter=',')

In [95]:
mlp3 = MLPClassifier(activation='relu',solver='lbfgs',hidden_layer_sizes=(350,350),batch_size=8)
mlp3.fit(x_rus1,y_rus1)

MLPClassifier(activation='relu', alpha=0.0001, batch_size=8, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(350, 350), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [96]:
predictions = mlp3.predict(X_test)
y_test[y_test<1] = 0
print(confusion_matrix(y_test,predictions))

[[    0 20235 12620]
 [    0     0     0]
 [    0  6060 11085]]


In [97]:
print(classification_report(y_test,predictions))
print(roc_auc_score(y_test,predictions))

             precision    recall  f1-score   support

       -1.0       0.00      0.00      0.00     32855
        0.0       0.00      0.00      0.00         0
        1.0       0.47      0.65      0.54     17145

avg / total       0.16      0.22      0.19     50000

0.631216087336


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [98]:
predictions3 = mlp3.predict_proba(test_data[:,[0,8,9,10,11]])
# Create the data for submission by taking the P(Y=1) column from probs and just add a running index as the first column.
Y_sub3 = np.vstack([np.arange(test_data.shape[0]), predictions3[:, 1]]).T

# We specify the header (ID, Prob1) and also specify the comments as '' so the header won't be commented out with
# the # sign.
np.savetxt('data/Y_sub3.txt', Y_sub3, '%d, %.5f', header='ID,Prob1', comments='', delimiter=',')

In [99]:
#Ensembled version of these three models to be done tomorrow