In [1]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from imblearn.combine import SMOTETomek
#from sklearn.decomposition import PCA

np.random.seed(0);
X = np.genfromtxt("data/X_train.txt",delimiter = None)
#X = X[:, [8,9,10,11]]
Y = np.genfromtxt("data/Y_train.txt",delimiter = None)
test_data = np.genfromtxt("data/X_test.txt",delimiter = None)

In [2]:
sc1 = StandardScaler()
XValues, xVa, YValues, yVa = train_test_split(X,Y,shuffle=True,random_state = 12,test_size = 0.1,stratify = Y)
sc1.fit(XValues)
XValues = sc1.transform(XValues)
xVa = sc1.transform(xVa)
smotetomek = SMOTETomek(random_state = 0)
XValues, YValues = smotetomek.fit_sample(XValues,YValues)

In [3]:
model = LogisticRegression()
model.fit(XValues,YValues)
predictions = model.predict(xVa)
print(roc_auc_score(yVa,predictions))
print(confusion_matrix(yVa,predictions))

0.615180253523
[[8623 4519]
 [2920 3938]]


In [4]:
##POWER 2 TRANSFORMATION
XtR1 = ml.transforms.fpoly(XValues,2)
XvR1 = ml.transforms.fpoly(xVa,2)
#ipca = IncrementalPCA(n_components = 14)
model = LogisticRegression()
model.fit(XtR1,YValues)
predictions = model.predict(XvR1)
print(roc_auc_score(yVa,predictions))
print(confusion_matrix(yVa,predictions))
print(XtR1.shape)

0.632878659153
[[9184 3958]
 [2970 3888]]
(224980, 120)


In [5]:
##STUMP TRANSFORMATION PERFORMANCE

XtR2 = ml.transforms.fkitchensink(XValues,119,typ='stump')
XvR2 = ml.transforms.fkitchensink(xVa,119,typ='stump')
#ipca = IncrementalPCA(n_components = 14)
XtR2 = np.dot(XtR2[0],XtR2[1].T)
XvR2 = np.dot(XvR2[0],XvR2[1].T)
model2 = LogisticRegression()
model2.fit(XtR2,YValues)
predictions2 = model2.predict(XvR2)
print(roc_auc_score(yVa,predictions2))
print(confusion_matrix(yVa,predictions2))
print(XtR2.shape)

0.501868767824
[[  235 12907]
 [   97  6761]]
(224980, 2)


In [6]:
##SIGMOID TRANSFORMATION PERFORMANCE

XtR3 = ml.transforms.fkitchensink(XValues,119,typ='sigmoid')
XvR3 = ml.transforms.fkitchensink(xVa,119,typ='sigmoid')
XtR3 = XtR3[0]
XvR3 = XvR3[0]
#ipca = IncrementalPCA(n_components = 14)
model3 = LogisticRegression()
model3.fit(XtR3,YValues)
predictions3 = model3.predict(XvR3)
print(roc_auc_score(yVa,predictions3))
print(confusion_matrix(yVa,predictions3))
print(XtR3.shape)

0.573915621363
[[7891 5251]
 [3104 3754]]
(224980, 119)


In [8]:
##SUM TRANSFORMATION PERFORMANCE

XtR5 = np.zeros(shape=(XValues.shape[0],105))
XvR5 = np.zeros(shape=(xVa.shape[0],105))

j = 0
k = 0
for i in range(0,105):
    XtR5[:,i] = XValues[:,j] + XValues[:,k]
    XvR5[:,i] = xVa[:,j] + xVa[:,k]
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1
model5 = LogisticRegression()
model5.fit(XtR5,YValues)
predictions5 = model5.predict(XvR5)
print(roc_auc_score(yVa,predictions5))
print(confusion_matrix(yVa,predictions5))
print(XtR5.shape)

0.615180253523
[[8623 4519]
 [2920 3938]]
(224980, 105)


In [9]:
##DIVISION TRANSFORMATION PEFORMANCE

XtR7 = np.zeros(shape=(XValues.shape[0],91))
XvR7 = np.zeros(shape=(xVa.shape[0],91))

j = 0
k = 1
for i in range(0,91):
    XtR7[:,i] = XValues[:,j]/XValues[:,k]
    XvR7[:,i] = xVa[:,j]/xVa[:,k]
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1
        
model7 = LogisticRegression()
model7.fit(XtR7,YValues)
predictions7 = model7.predict(XvR7)
print(roc_auc_score(yVa,predictions7))
print(confusion_matrix(yVa,predictions7))
print(XtR7.shape)

0.550433996884
[[8460 4682]
 [3723 3135]]
(224980, 91)


In [10]:
##SUBTRACTION TRANSFORMATION PERFORMANCE

XtR8 = np.zeros(shape=(XValues.shape[0],91))
XvR8 = np.zeros(shape=(xVa.shape[0],91))

j = 0
k = 1
for i in range(0,91):
    XtR8[:,i] = XValues[:,j]-XValues[:,k]
    XvR8[:,i] = xVa[:,j]-xVa[:,k]
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1
model8 = LogisticRegression()
model8.fit(XtR8,YValues)
predictions8 = model8.predict(XvR8)
print(roc_auc_score(yVa,predictions8))
print(confusion_matrix(yVa,predictions8))
print(XtR8.shape)

0.613840256855
[[8743 4399]
 [3001 3857]]
(224980, 91)


In [11]:
##ZSCORE TRANSFORMATION PERFORMANCE

from scipy import stats

XtR9 = np.zeros(shape=(XValues.shape[0],406))
XtR9[:,:14] = stats.zscore(XValues)
XvR9 = np.zeros(shape=(xVa.shape[0],406))
XvR9[:,:14] = stats.zscore(xVa)

j = 0
k = 0
for i in range(14,119):
    XtR9[:,i] = stats.zscore(XValues[:,j]*XValues[:,k])
    XvR9[:,i] = stats.zscore(xVa[:,j]*xVa[:,k])
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1
        
j = 0
k = 0
for i in range(119,224):
    XtR9[:,i] = stats.zscore(XValues[:,j]+XValues[:,k])
    XvR9[:,i] = stats.zscore(xVa[:,j]+xVa[:,k])
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(224,315):
    XtR9[:,i] = stats.zscore(XValues[:,j]/XValues[:,k])
    XvR9[:,i] = stats.zscore(xVa[:,j]/xVa[:,k])
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(315,406):
    XtR9[:,i] = stats.zscore(XValues[:,j]-XValues[:,k])
    XvR9[:,i] = stats.zscore(xVa[:,j]-xVa[:,k])
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1
        
model9 = LogisticRegression()
model9.fit(XtR9,YValues)
predictions9 = model9.predict(XvR9)
print(roc_auc_score(yVa,predictions9))
print(confusion_matrix(yVa,predictions9))
print(XtR9.shape)

0.622266044422
[[8334 4808]
 [2672 4186]]
(224980, 406)


In [14]:
##PROJECTION TRANSFORMATION PERFORMANCE

XtR10 = ml.transforms.fkitchensink(XValues,200,typ='linear')
XvR10 = ml.transforms.fkitchensink(xVa,200,typ='linear')
XtR10 = np.dot(XtR10[0],XtR10[1].T)
XvR10 = np.dot(XvR10[0],XvR10[1].T)
#ipca = IncrementalPCA(n_components = 14)
model10 = LogisticRegression()
model10.fit(XtR10,YValues)
predictions10 = model10.predict(XvR10)
print(roc_auc_score(yVa,predictions10))
print(confusion_matrix(yVa,predictions10))
print(XtR10.shape)

0.602447760978
[[8319 4823]
 [2936 3922]]
(224980, 14)


In [15]:
##LOG TRANSFORMATION PERFORMANCE

XtR11 = np.zeros(shape=(XValues.shape[0],406))
XtR11[:,:14] = np.log1p(np.absolute(XValues))
XvR11 = np.zeros(shape=(xVa.shape[0],406))
XvR11[:,:14] = np.log1p(np.absolute(xVa))

j = 0
k = 0
for i in range(14,119):
    XtR11[:,i] = np.log1p(np.absolute(XValues[:,j]*XValues[:,k]))
    XvR11[:,i] = np.log1p(np.absolute(xVa[:,j]*xVa[:,k]))
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1
        
j = 0
k = 0
for i in range(119,224):
    XtR11[:,i] = np.log1p(np.absolute(XValues[:,j]+XValues[:,k]))
    XvR11[:,i] = np.log1p(np.absolute(xVa[:,j]+xVa[:,k]))
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(224,315):
    XtR11[:,i] = np.log1p(np.absolute(XValues[:,j]/XValues[:,k]))
    XvR11[:,i] = np.log1p(np.absolute(xVa[:,j]/xVa[:,k]))
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(315,406):
    XtR11[:,i] = np.log1p(np.absolute(XValues[:,j]-XValues[:,k]))
    XvR11[:,i] = np.log1p(np.absolute(xVa[:,j]-xVa[:,k]))
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1
        
model11 = LogisticRegression()
model11.fit(XtR11,YValues)
predictions11 = model11.predict(XvR11)
print(roc_auc_score(yVa,predictions11))
print(confusion_matrix(yVa,predictions11))
print(XtR11.shape)

0.637678485923
[[9268 3874]
 [2948 3910]]
(224980, 406)


In [22]:
##BOX-COX TRANSFORMATION PERFORMANCE

XtR12 = np.zeros(shape=(XValues.shape[0],406))
for i in range(0,14):
    XtR12[:,i],_ = stats.boxcox(np.absolute(XValues[:,i]))
XvR12 = np.zeros(shape=(xVa.shape[0],406))
for i in range(0,14):
    XvR12[:,i],_ = stats.boxcox(np.absolute(xVa[:,i]))

j = 0
k = 0
for i in range(14,119):
    XtR12[:,i],_ = stats.boxcox(np.absolute(XValues[:,j]*XValues[:,k]))
    XvR12[:,i],_ = stats.boxcox(np.absolute(xVa[:,j]*xVa[:,k]))
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1

j = 0
k = 0
for i in range(119,224):
    XtR12[:,i],_ = stats.boxcox(np.absolute(XValues[:,j]+XValues[:,k]))
    XvR12[:,i],_ = stats.boxcox(np.absolute(xVa[:,j]+xVa[:,k]))
    if k+1 > 13:
        k = j + 1
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(224,315):
    XtR12[:,i],_ = stats.boxcox(np.absolute(XValues[:,j]/XValues[:,k]))
    XvR12[:,i],_ = stats.boxcox(np.absolute(xVa[:,j]/xVa[:,k]))
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1

j = 0
k = 1
for i in range(315,406):
    XtR12[:,i],_ = stats.boxcox(np.absolute(XValues[:,j]-XValues[:,k]))
    XvR12[:,i],_ = stats.boxcox(np.absolute(xVa[:,j]-xVa[:,k]))
    if k+1 > 13:
        k = j + 2
        j = j + 1
    else:
        k = k+1        

model12 = LogisticRegression()
model12.fit(XtR12,YValues)
predictions12 = model12.predict(XvR12)
print(roc_auc_score(yVa,predictions12))
print(confusion_matrix(yVa,predictions12))
print(XtR12.shape)

0.620611294828
[[8325 4817]
 [2690 4168]]
(224980, 406)


In [23]:
#to_test = sc1.transform(test_data)
#np.savetxt('data/transformed_test.txt',to_test)
#np.savetxt('data/transformed_train.txt',XValues)
#np.savetxt('data/transformed_validation.txt',xVa)
#np.savetxt('data/train_Y.txt',YValues)
#np.savetxt('data/validation_Y.txt',yVa)

In [29]:
#Store for Power2
#np.savetxt('data/train_power_2.txt',XtR1)
#np.savetxt('data/validation_power_2.txt',XvR1)

In [30]:
#Store for Stump
#np.savetxt('data/train_stump.txt',XtR2)
#np.savetxt('data/validation_stump.txt',XvR2)

In [33]:
#Store for Sigmoid
#np.savetxt('data/train_sigmoid.txt',XtR3)
#np.savetxt('data/validation_sigmoid.txt',XvR3)

In [34]:
#Store for Sum
#np.savetxt('data/train_sum.txt',XtR5)
#np.savetxt('data/validation_sum.txt',XvR5)

In [35]:
#Store for Division
#np.savetxt('data/train_division.txt',XtR7)
#np.savetxt('data/validation_division.txt',XvR7)

In [36]:
#Store for Subtraction
#np.savetxt('data/train_subtraction.txt',XtR8)
#np.savetxt('data/validation_subtraction.txt',XvR8)

In [37]:
#Store for ZScore
#np.savetxt('data/train_zscore.txt',XtR9)
#np.savetxt('data/validation_zscore.txt',XvR9)

In [38]:
#Store for Projection
#np.savetxt('data/train_projection.txt',XtR10)
#np.savetxt('data/validation_projection.txt',XvR10)

In [39]:
#Store for Log 
#np.savetxt('data/train_log.txt',XtR11)
#np.savetxt('data/validation_log.txt',XvR11)

In [40]:
#Store for Box-Cox
#np.savetxt('data/train_box-cox.txt',XtR12)
#np.savetxt('data/validation_box-cox.txt',XvR12)

In [41]:
#positive_data = X[Y==1]
#negative_data = X[Y==0]
#positive_data = np.random.permutation(positive_data)
#negative_data = np.random.permutation(negative_data)

In [42]:
#positive_subsets = np.array_split(positive_data,4)
#positive_subsets

In [43]:
#negative_subsets = np.array_split(negative_data,7)
#negative_subsets

In [44]:
#print(np.shape(positive_subsets[0]))
#print(np.shape(negative_subsets[0]))

#for i in range(0,len(positive_subsets)):
#    for j in range(0,len(negative_subsets)):
#        shape_positive = np.shape(positive_subsets[i])
#        positive_data = np.hstack((positive_subsets[i],np.ones(shape=(shape_positive[0],1))))
#        negative_subsets[j] = negative_subsets[j][:shape_positive[0]+1,:]
#        shape_negative = np.shape(negative_subsets[j])
#        negative_data = np.hstack((negative_subsets[j],np.zeros(shape=(shape_negative[0],1))))
#        final_data = np.vstack([positive_data,negative_data])
#        final_data = np.random.permutation(final_data)
#        file_no = 7*i + j
#        file_name = 'data_'+str(file_no)+'.txt'
#        np.savetxt("data/"+file_name,final_data)

In [45]:
#print(np.shape(positive_subsets[0]))
#print(np.shape(negative_subsets[0]))

In [46]:
#model = MLPClassifier(activation='relu',solver='lbfgs',hidden_layer_sizes=(350,350),early_stopping=True,batch_size=1)
#sc = StandardScaler()
#data_file_no = "data_"+str(0)+".txt"
#data = np.genfromtxt("data/"+data_file_no,delimiter = None)
#X_train = data[:,:-1]
#y_train = data[:,-1]
#sc.fit(X_train)
#X_train = sc.transform(X_train)
#X_test = sc.transform(X)
#model.fit(X_train,y_train)
#predictions = model.predict(X_test)
#np.savetxt("data/prelim_confi_predictions.txt",predictions)
#print(roc_auc_score(Y,predictions))

In [47]:
#print(classification_report(Y,predictions))
#print(confusion_matrix(Y,predictions))