In [None]:
## This file implements neural networks before and after lasso selection for p11kpresabsSTCC_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for each model.

In [1]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [2]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/p11kpresabsSTCC_qual.csv')
df.shape

(253, 824)

In [3]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [4]:
df['pheno']

0      2
1      1
2      2
3      2
4      2
      ..
248    2
249    1
250    2
251    2
252    2
Name: pheno, Length: 253, dtype: int64

In [5]:
df.head()

Unnamed: 0,id,TTTTTTGTAATTTT,TTTTTTGTAATTTTT,TTTTTTATTTTGGAT,TTTTTTATTTTGGATAA,TTTTTTATTTTGGATAAAAGGAG,TTTTTTAGTCGTTTTT,TTTTTATCGTTTACT,TTTTTAGTCGTTTTT,TTTTTAGTCGTTTTTT,...,AATCACCCCTT,AAGGGGTGATTT,AAGGGGTGATTTT,AAGATGATTTATCCAACTTT,AACTTTCTAGGTT,AACCTAGAAAGTTT,AACATCTTTTATTT,ST,CC,pheno
0,107,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,5,2
1,109,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,8,8,1
2,115,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2
3,120335,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2
4,120337,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2


In [6]:
df['pheno'].value_counts()

2    181
1     47
0     25
Name: pheno, dtype: int64

In [7]:
df_clean = df.drop(columns=['id'])

In [8]:
df_clean.shape

(253, 823)

In [9]:
df_clean.head()

Unnamed: 0,TTTTTTGTAATTTT,TTTTTTGTAATTTTT,TTTTTTATTTTGGAT,TTTTTTATTTTGGATAA,TTTTTTATTTTGGATAAAAGGAG,TTTTTTAGTCGTTTTT,TTTTTATCGTTTACT,TTTTTAGTCGTTTTT,TTTTTAGTCGTTTTTT,TTTTTAGGTAAGG,...,AATCACCCCTT,AAGGGGTGATTT,AAGGGGTGATTTT,AAGATGATTTATCCAACTTT,AACTTTCTAGGTT,AACCTAGAAAGTTT,AACATCTTTTATTT,ST,CC,pheno
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,5,2
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,8,8,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,5,5,2


In [10]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 823) (253,)


In [11]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

Using TensorFlow backend.


[(0, 181), (1, 181), (2, 181)]




In [12]:
############# Fully-Connected Neural Network ################

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [14]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [15]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [16]:
dat

Unnamed: 0,0,test
0,SR1129,2
1,NRS185,2
2,NRS243,1
3,BCH-SA-04,0
4,504,1
...,...,...
158,CFBREBSa131,2
159,CFBREBSa133,1
160,NRS256,2
161,GA48963,1


In [17]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.callbacks.History at 0x1a33e8f128>

In [53]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 82.82%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([0, 2, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 2, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 2,
       0, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 0, 0, 2, 1, 2, 1, 1, 0, 0, 1,
       1, 2, 0, 0, 1, 0, 1, 2, 2, 0, 2, 1, 0, 0, 0, 1, 2, 1, 1, 0, 1, 2,
       2, 1, 1, 1, 2, 0, 0, 1, 1, 1, 0, 2, 1, 2, 0, 2, 1, 0, 2, 0, 0, 1,
       2, 1, 1, 1, 0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 2, 2, 2, 1, 1])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,SR1129,2,0
1,NRS185,2,2
2,NRS243,1,1
3,BCH-SA-04,0,0
4,504,1,1
...,...,...,...
158,CFBREBSa131,2,2
159,CFBREBSa133,1,2
160,NRS256,2,2
161,GA48963,1,1


In [23]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [24]:
dat_proba1

Unnamed: 0,0,1,2
0,0.953901,0.017629,0.028470
1,0.000158,0.061263,0.938579
2,0.000603,0.858406,0.140991
3,0.988969,0.008119,0.002912
4,0.062080,0.865571,0.072349
...,...,...,...
158,0.479931,0.014191,0.505878
159,0.000769,0.488876,0.510355
160,0.000399,0.068585,0.931016
161,0.001275,0.866945,0.131780


In [25]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [26]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1p11ST.csv", index = False,
         header=None)

In [57]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [58]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 91.99%


In [19]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [20]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [21]:
y_prob = df_proba[df_proba['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[9.53901300e-01, 1.76289320e-02, 2.84697230e-02],
       [1.58122540e-04, 6.12626600e-02, 9.38579200e-01],
       [6.03212860e-04, 8.58405650e-01, 1.40991050e-01],
       [9.88968600e-01, 8.11905100e-03, 2.91239380e-03],
       [6.20801450e-02, 8.65571000e-01, 7.23488550e-02],
       [5.02775850e-01, 3.38603700e-01, 1.58620450e-01],
       [9.86663800e-01, 8.72669650e-03, 4.60948700e-03],
       [6.92549500e-01, 1.37520300e-02, 2.93698460e-01],
       [9.80278200e-02, 8.30269600e-01, 7.17025250e-02],
       [2.09496920e-02, 9.38645200e-01, 4.04051950e-02],
       [2.09350330e-02, 6.03786000e-01, 3.75278950e-01],
       [8.83258100e-01, 1.03461710e-01, 1.32801550e-02],
       [9.68945550e-02, 5.36189560e-01, 3.66915940e-01],
       [2.56062000e-01, 3.86030850e-01, 3.57907180e-01],
       [8.83258100e-01, 1.03461710e-01, 1.32801550e-02],
       [9.98986300e-01, 9.80773200e-04, 3.28470100e-05],
       [5.12586600e-02, 4.81528850e-01, 4.67212470e-01],
       [7.62314950e-02, 1.24863

In [14]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [23]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9356207333271552

In [15]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [25]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9356207333271552

In [26]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [27]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [28]:
dat2

Unnamed: 0,0,test
0,NRS027,0
1,CFBRSa07,0
2,CFBRSa27,1
3,504,1
4,CFBREBSa129,0
...,...,...
158,SR3569,2
159,NRS243,1
160,GA48963,1
161,504,1


In [29]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [31]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [32]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a343df0f0>

In [74]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 87.12%


In [34]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 1, 0,
       1, 0, 2, 1, 1, 0, 1, 2, 0, 0, 0, 2, 1, 0, 2, 1, 0, 0, 0, 0, 2, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 2, 1,
       0, 2, 1, 1, 2, 0, 1, 2, 0, 1, 2, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 0,
       2, 0, 2, 0, 1, 0, 0, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 0,
       1, 1, 0, 0, 2, 2, 2, 0, 1, 0, 2, 1, 2, 2, 1, 0, 1, 0, 0, 2, 0, 0,
       2, 0, 1, 2, 1, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 2,
       1, 1, 2, 2, 2, 1, 1, 1, 0])

In [35]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,NRS027,0,0
1,CFBRSa07,0,0
2,CFBRSa27,1,2
3,504,1,1
4,CFBREBSa129,0,0
...,...,...,...
158,SR3569,2,2
159,NRS243,1,1
160,GA48963,1,1
161,504,1,1


In [36]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [37]:
dat_proba2

Unnamed: 0,0,1,2
0,0.994396,0.001330,0.004274
1,0.961208,0.000881,0.037912
2,0.053490,0.266687,0.679823
3,0.085465,0.727735,0.186799
4,0.998393,0.000543,0.001063
...,...,...,...
158,0.006205,0.370404,0.623390
159,0.000759,0.846746,0.152496
160,0.003221,0.879930,0.116850
161,0.085465,0.727735,0.186799


In [38]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [39]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2p11ST.csv", index = False,
         header=None)

In [79]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [80]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 92.68%


In [30]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [31]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [32]:
y_prob2 = df_proba2[df_proba2['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[9.94396000e-01, 1.32980950e-03, 4.27422160e-03],
       [9.61207750e-01, 8.80568640e-04, 3.79116500e-02],
       [5.34895360e-02, 2.66687100e-01, 6.79823340e-01],
       [8.54651000e-02, 7.27735460e-01, 1.86799440e-01],
       [9.98393360e-01, 5.43411650e-04, 1.06315510e-03],
       [7.70516750e-01, 3.76311040e-03, 2.25720140e-01],
       [9.98393360e-01, 5.43411650e-04, 1.06315510e-03],
       [9.30541600e-03, 8.23767800e-01, 1.66926770e-01],
       [6.01886660e-02, 7.81871300e-01, 1.57940090e-01],
       [1.23160430e-02, 9.86002100e-01, 1.68187890e-03],
       [5.67502200e-03, 1.92466120e-02, 9.75078340e-01],
       [9.99978400e-01, 2.46232960e-06, 1.90811720e-05],
       [9.29017500e-02, 6.85308100e-01, 2.21790210e-01],
       [4.35464200e-02, 8.62436650e-01, 9.40169900e-02],
       [1.13634600e-05, 9.97874000e-01, 2.11451970e-03],
       [1.96913160e-03, 7.52602600e-03, 9.90504800e-01],
       [7.43332840e-04, 9.15225860e-01, 8.40307700e-02],
       [2.34066000e-04, 7.14771

In [33]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9582851965114961

In [34]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9582851965114961

In [35]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [36]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [37]:
dat3

Unnamed: 0,0,test
0,NRS149,0
1,EUH13,0
2,NRS106,2
3,NRS214,1
4,CFBREBSa129,0
...,...,...
158,NRS027,0
159,CFBRSa70,2
160,CFBREBSa130,0
161,NRS214,1


In [38]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [39]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [45]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a349f0438>

In [108]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 84.66%


In [47]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([0, 0, 2, 1, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 2, 0, 0, 2, 2, 2,
       0, 2, 2, 1, 2, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 1, 2, 0, 1, 1, 1,
       1, 0, 2, 1, 0, 0, 2, 0, 0, 0, 2, 0, 2, 1, 2, 1, 2, 1, 0, 1, 1, 0,
       0, 2, 0, 1, 0, 0, 2, 0, 0, 1, 0, 2, 0, 0, 1, 1, 2, 2, 2, 0, 0, 1,
       2, 0, 0, 0, 0, 0, 1, 2, 1, 2, 0, 2, 2, 0, 1, 1, 1, 0, 2, 1, 1, 1,
       0, 2, 0, 0, 2, 2, 2, 1, 2, 2, 2, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0,
       1, 1, 0, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 2, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1])

In [48]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,NRS149,0,0
1,EUH13,0,0
2,NRS106,2,2
3,NRS214,1,1
4,CFBREBSa129,0,0
...,...,...,...
158,NRS027,0,0
159,CFBRSa70,2,1
160,CFBREBSa130,0,0
161,NRS214,1,1


In [49]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [50]:
dat_proba3

Unnamed: 0,0,1,2
0,0.945460,0.019924,0.034616
1,0.923442,0.076492,0.000066
2,0.426062,0.041316,0.532622
3,0.035017,0.934156,0.030827
4,0.993218,0.003219,0.003564
...,...,...,...
158,0.988671,0.006116,0.005213
159,0.080087,0.609810,0.310103
160,0.747739,0.001925,0.250336
161,0.035017,0.934156,0.030827


In [51]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [52]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3p11ST.csv", index = False,
         header=None)

In [112]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [113]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 93.67%


In [40]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [41]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [42]:
y_prob3 = df_proba3[df_proba3['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[9.45459900e-01, 1.99238710e-02, 3.46162500e-02],
       [9.23442070e-01, 7.64918850e-02, 6.61021840e-05],
       [4.26061870e-01, 4.13158900e-02, 5.32622300e-01],
       [3.50171700e-02, 9.34155940e-01, 3.08269100e-02],
       [9.93217770e-01, 3.21877580e-03, 3.56351580e-03],
       [3.36265560e-02, 4.51625850e-01, 5.14747600e-01],
       [9.88671200e-01, 6.11592130e-03, 5.21287800e-03],
       [4.46328220e-01, 3.20847060e-01, 2.32824700e-01],
       [1.43827280e-03, 3.35688440e-01, 6.62873300e-01],
       [8.24092270e-01, 8.68281050e-02, 8.90796200e-02],
       [9.02212100e-03, 9.43538500e-01, 4.74393550e-02],
       [1.02018565e-02, 3.00128830e-02, 9.59785300e-01],
       [4.24063400e-04, 9.65135340e-01, 3.44405960e-02],
       [7.82782850e-01, 6.99917600e-03, 2.10217970e-01],
       [9.76262300e-01, 1.52440990e-02, 8.49358550e-03],
       [3.37084800e-04, 8.58221350e-01, 1.41441550e-01],
       [4.07931360e-02, 4.16685340e-01, 5.42521540e-01],
       [4.46328220e-01, 3.20847

In [43]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.9531312101648494

In [44]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.9531312101648494

In [45]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [46]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [47]:
dat4

Unnamed: 0,0,test
0,SR2852,2
1,CFBREBSa138,0
2,BCH-SA-12,0
3,EUH13,0
4,EUH13,0
...,...,...
158,NRS036,1
159,CA105,1
160,CFBRSa51,1
161,NRS102,1


In [48]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [57]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [58]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [59]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a351f5048>

In [129]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 84.66%


In [60]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([2, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2,
       0, 1, 1, 2, 1, 0, 2, 0, 0, 2, 1, 1, 2, 0, 0, 2, 0, 0, 0, 1, 1, 1,
       1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 2, 0, 0, 1, 0, 0, 1, 2, 1, 1, 0, 2, 0, 0, 2, 1, 0, 1, 0, 2,
       0, 0, 0, 1, 1, 2, 1, 1, 0, 1, 1, 1, 0, 0, 2, 1, 2, 2, 1, 2, 1, 0,
       2, 0, 1, 1, 0, 0, 0, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 0, 1, 0,
       1, 1, 2, 1, 0, 2, 1, 1, 0, 0, 2, 0, 2, 0, 1, 1, 1, 0, 0, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2])

In [61]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,SR2852,2,2
1,CFBREBSa138,0,0
2,BCH-SA-12,0,0
3,EUH13,0,0
4,EUH13,0,0
...,...,...,...
158,NRS036,1,1
159,CA105,1,1
160,CFBRSa51,1,1
161,NRS102,1,1


In [62]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [63]:
dat_proba4

Unnamed: 0,0,1,2
0,0.028310,0.266584,0.705106
1,0.998137,0.000130,0.001734
2,0.996181,0.000866,0.002953
3,0.999984,0.000015,0.000001
4,0.999984,0.000015,0.000001
...,...,...,...
158,0.015832,0.963925,0.020242
159,0.001470,0.996410,0.002120
160,0.002234,0.980447,0.017319
161,0.002602,0.994918,0.002480


In [64]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [65]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4p11ST.csv", index = False,
         header=None)

In [133]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [134]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 94.96%


In [49]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [50]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [51]:
y_prob4 = df_proba4[df_proba4['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[2.83096380e-02, 2.66584280e-01, 7.05106140e-01],
       [9.98136500e-01, 1.29698850e-04, 1.73373710e-03],
       [9.96180800e-01, 8.66016140e-04, 2.95317940e-03],
       [9.99984150e-01, 1.45838620e-05, 1.30405950e-06],
       [9.99984150e-01, 1.45838620e-05, 1.30405950e-06],
       [2.38913190e-02, 9.45177800e-01, 3.09309740e-02],
       [8.25692000e-01, 1.57026050e-01, 1.72819480e-02],
       [3.62487980e-03, 5.95922000e-03, 9.90415930e-01],
       [9.20543550e-01, 5.36754130e-02, 2.57810260e-02],
       [9.97803750e-01, 2.49861480e-04, 1.94638180e-03],
       [1.12001800e-01, 8.10517670e-01, 7.74806100e-02],
       [4.17404680e-01, 3.62671550e-01, 2.19923700e-01],
       [1.47601500e-01, 3.97865150e-01, 4.54533370e-01],
       [9.20543550e-01, 5.36754130e-02, 2.57810260e-02],
       [9.98682700e-03, 4.00817500e-02, 9.49931440e-01],
       [9.42491700e-01, 1.40537550e-03, 5.61030060e-02],
       [9.20543550e-01, 5.36754130e-02, 2.57810260e-02],
       [9.99516100e-01, 3.55040

In [52]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.9406575953212037

In [53]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.9406575953212037

In [54]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.9469236838311761

In [55]:
np.std(ovos)

0.009146555483889455

In [56]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.9469236838311761

In [57]:
np.std(ovrs)

0.009146555483889455

In [135]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [136]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 84.82%


In [137]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.015260545241225637


In [138]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [139]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 93.33%


In [140]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 0.011173564


In [46]:
############ Feature selection using lasso ##########

In [16]:
## Retrieved from https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import numpy as np

In [17]:
selection = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
selection.fit(X_over[:,1:], y_over)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                             fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [18]:
names = np.array(df_clean.columns).tolist()
names.remove('pheno')

In [19]:
X_train_features_over = np.vstack((names, X_over[:,1:]))
X_train_features_over = pd.DataFrame(X_train_features_over)

In [20]:
sel_features = X_train_features_over.columns[(selection.get_support())]
print('total features: {}'.format((X_train_features_over.shape[1])))
print('selected features: {}'.format(len(sel_features)))

total features: 822
selected features: 184


In [21]:
cols = sel_features.values
cols.reshape((1, -1))

array([[  0,   1,   3,   4,  12,  13,  14,  20,  21,  27,  29,  36,  39,
         74,  75,  87,  94, 100, 101, 102, 104, 116, 125, 135, 138, 140,
        146, 156, 161, 164, 166, 168, 177, 194, 196, 198, 199, 202, 204,
        206, 208, 213, 220, 229, 231, 232, 233, 241, 242, 243, 250, 265,
        266, 270, 277, 280, 288, 296, 303, 307, 308, 310, 316, 318, 319,
        323, 325, 334, 338, 344, 345, 352, 357, 359, 363, 367, 369, 374,
        380, 386, 389, 390, 395, 396, 399, 410, 417, 428, 433, 442, 444,
        445, 447, 455, 457, 463, 466, 467, 468, 470, 471, 476, 477, 481,
        488, 490, 497, 503, 506, 515, 517, 518, 519, 521, 534, 538, 541,
        544, 549, 553, 558, 559, 560, 561, 562, 563, 564, 571, 572, 573,
        581, 585, 586, 588, 589, 593, 597, 602, 603, 604, 605, 606, 608,
        614, 617, 619, 628, 634, 639, 651, 652, 670, 673, 677, 678, 685,
        702, 703, 704, 712, 713, 716, 722, 737, 738, 742, 746, 750, 751,
        757, 759, 766, 768, 772, 779, 781, 784, 788

In [22]:
names_arr = np.array(names)
names_arr[cols]

array(['TTTTTTGTAATTTT', 'TTTTTTGTAATTTTT', 'TTTTTTATTTTGGATAA',
       'TTTTTTATTTTGGATAAAAGGAG', 'TTTTCTTTTCGT', 'TTTTCTTCTAATC',
       'TTTTCTATTGTC', 'TTTTATGGAAGGTAATTTTAAAAATGTAAAGAAGCTTAT',
       'TTTTATGGAAGGTAATTTTAAAAATGTAAAGAAGCTTATTTACGAAG',
       'TTTTAATAGCTAGCACTTAATTGTGTTGGCTATTTTTTATGTCCAAAACGTGCTGATGACATAAAAAGCACGCATGGAAAAACAGTCGACAGACTATAAA',
       'TTTGCCAGTATC', 'TTTCGCAAACTA', 'TTTCAGCGACT', 'TTGGTTTTAAATTT',
       'TTGGTTTTAAATTTTT', 'TTGATAAAGTTTA', 'TTCTTTACATTTTTA',
       'TTCTCTTCCATC', 'TTCTCTTCCATCCCTCATC', 'TTCTCTTCCATCCCTCATCCTCCTC',
       'TTCTATAAAAAGT', 'TTCATCGTCGA', 'TTCAATCTAGAT', 'TTATTAGGTTCAAC',
       'TTATCATCAAATG',
       'TTATAGTCTGTCGACTGTTTTTCCATGCGTGCTTTTTATGTCATCAGCACGTTTTGGACATAAAAAATAGCCAACACAATTAAGTGCTAGCTATTAAAAG',
       'TTAGGCGAAGAT', 'TTACGCAATAGTTTAGATGTAGA', 'TTACCTAAAAATAAAT',
       'TTAATTGAATAACGGGAAGTAGCTCAGCTTGGTAGAGCACTTGGTTTGGGACCAAGGGGTCGCAGGTTCGAATCCTGTCTTCCCGATTACTTCTTAAATT',
       'TTAACGAATAC', 'TTAAATTTTGC

In [23]:
###### keep selected variables as a new dataframe
df_sel = df_clean.loc[:,names_arr[cols]].copy()
df_sel['pheno'] = df_clean['pheno']

In [24]:
df_sel['strain'] = X.iloc[:,0]

In [25]:
df_sel

Unnamed: 0,TTTTTTGTAATTTT,TTTTTTGTAATTTTT,TTTTTTATTTTGGATAA,TTTTTTATTTTGGATAAAAGGAG,TTTTCTTTTCGT,TTTTCTTCTAATC,TTTTCTATTGTC,TTTTATGGAAGGTAATTTTAAAAATGTAAAGAAGCTTAT,TTTTATGGAAGGTAATTTTAAAAATGTAAAGAAGCTTATTTACGAAG,TTTTAATAGCTAGCACTTAATTGTGTTGGCTATTTTTTATGTCCAAAACGTGCTGATGACATAAAAAGCACGCATGGAAAAACAGTCGACAGACTATAAA,...,AGGCTAACTT,AGCATCTACTTTT,ACTGCGTTAGT,ACTAAATTCGT,ACGCAATAGTT,AACCTAGAAAGTTT,ST,CC,pheno,strain
0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,5,5,2,107
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,8,8,1,109
2,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,5,5,2,115
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,5,5,2,120335
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,5,5,2,120337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,5,5,2,SR4152
249,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,3812,5,1,SR4153
250,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,5,5,2,SR4155
251,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,5,5,2,SR4156


In [26]:
X_sel = df_sel.loc[:, df_sel.columns != 'pheno']
y_sel = df_sel['pheno']
print(X_sel.shape, y_sel.shape, df_sel.shape)

(253, 185) (253,) (253, 186)


In [27]:
df_sel['pheno'].value_counts()

2    181
1     47
0     25
Name: pheno, dtype: int64

In [28]:
# over-sampling
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_sel_over, y_sel_over = overS.fit_resample(X_sel, y_sel)
print(sorted(Counter(y_sel_over).items()))

[(0, 181), (1, 181), (2, 181)]




In [29]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_sel_train_over, X_sel_test_over, y_sel_train_over, y_sel_test_over = train_test_split(X_sel_over, y_sel_over,
                                                    test_size = 0.3,
                                                    random_state=567,
                                                    stratify=y_sel_over)

In [30]:
dat5 = pd.DataFrame(X_sel_test_over[:,-1])
dat5['test'] = y_sel_test_over

In [31]:
dat5

Unnamed: 0,0,test
0,CFBRSa49,1
1,NRS108,2
2,MN105,2
3,CFBRSa03,2
4,BCH-SA-01,0
...,...,...
158,NRS027,0
159,BCH-SA-04,0
160,SR3585,2
161,504,1


In [32]:
X_sel_train_over = X_sel_train_over[:,:-1]
X_sel_test_over = X_sel_test_over[:,:-1]

In [83]:
#### neural network on over-sampling data
model2_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_sel_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [84]:
model2_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [85]:
model2_over.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36728128>

In [175]:
acc_test2_over = model2_over.evaluate(X_sel_test_over, y_sel_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test2_over*100))

over-sampling test accuracy: 87.12%


In [86]:
pred5 = model2_over.predict_classes(X_sel_test_over)
pred5

array([1, 2, 2, 2, 0, 1, 2, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 2, 1, 0,
       0, 2, 0, 0, 2, 1, 0, 1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 1,
       0, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 2, 0, 0, 0, 0, 1, 1, 2, 2, 1, 2,
       0, 0, 2, 1, 2, 0, 2, 0, 0, 1, 2, 2, 0, 0, 0, 0, 2, 1, 1, 2, 1, 0,
       0, 1, 2, 1, 1, 0, 2, 2, 0, 1, 0, 0, 1, 0, 1, 0, 2, 1, 0, 2, 1, 2,
       1, 0, 2, 1, 2, 0, 0, 1, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 2, 1, 1, 2,
       2, 0, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 1, 0, 2, 0, 1, 0, 0, 2, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0])

In [87]:
dat5['pred'] = pred5
dat5

Unnamed: 0,0,test,pred
0,CFBRSa49,1,1
1,NRS108,2,2
2,MN105,2,2
3,CFBRSa03,2,2
4,BCH-SA-01,0,0
...,...,...,...
158,NRS027,0,0
159,BCH-SA-04,0,0
160,SR3585,2,1
161,504,1,1


In [88]:
proba5 = model2_over.predict_proba(X_sel_test_over)
dat_proba5 = pd.DataFrame(proba5)

In [89]:
dat_proba5

Unnamed: 0,0,1,2
0,0.420049,0.430507,0.149443
1,0.000494,0.020135,0.979372
2,0.006240,0.311664,0.682096
3,0.000198,0.001413,0.998389
4,0.971427,0.000023,0.028550
...,...,...,...
158,0.999550,0.000049,0.000401
159,0.997324,0.000231,0.002445
160,0.072240,0.672646,0.255114
161,0.010667,0.975452,0.013880


In [90]:
dat_proba5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba5.csv", index = False,
         header=None)

In [91]:
dat5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/5p11ST.csv", index = False,
         header=None)

In [179]:
hist2_over = model2_over.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [180]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist2_over.history['accuracy'])*100))

over-sampling train accuracy: 94.91%


In [33]:
df_proba5 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_lasso_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [34]:
df_proba5

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p0006kpresabs_qual,NRS245,1,2,1.345807e-02,2.164788e-01,7.700630e-01
1,p0006kpresabs_qual,NY439,2,2,2.674153e-02,9.294230e-04,9.723290e-01
2,p0006kpresabs_qual,CA544,1,0,4.147484e-01,3.626331e-01,2.226184e-01
3,p0006kpresabs_qual,CA541,2,0,4.147484e-01,3.626331e-01,2.226184e-01
4,p0006kpresabs_qual,EUH15,1,0,4.147484e-01,3.626331e-01,2.226184e-01
...,...,...,...,...,...,...,...
984,p0017Skpresabs_qual,CA541,1,1,3.723218e-01,6.276781e-01,1.945911e-08
985,p0017Skpresabs_qual,SR4152,1,0,7.372800e-01,2.627200e-01,4.197748e-08
986,p0017Skpresabs_qual,NRS110,2,2,4.194510e-08,7.508231e-09,1.000000e+00
987,p0017Skpresabs_qual,CFBRSa70,0,0,7.372800e-01,2.627200e-01,4.197748e-08


In [35]:
y_prob5 = df_proba5[df_proba5['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob5 = y_prob5.to_numpy()
y_prob5

array([[4.20049160e-01, 4.30507400e-01, 1.49443400e-01],
       [4.93855500e-04, 2.01345530e-02, 9.79371670e-01],
       [6.23981750e-03, 3.11664460e-01, 6.82095770e-01],
       [1.97649230e-04, 1.41288110e-03, 9.98389500e-01],
       [9.71426960e-01, 2.32201380e-05, 2.85497400e-02],
       [7.35996800e-03, 8.10768660e-01, 1.81871340e-01],
       [2.23812120e-04, 3.21989150e-01, 6.77787000e-01],
       [1.01898300e-05, 2.37906870e-02, 9.76199150e-01],
       [9.89897700e-01, 3.38650870e-03, 6.71580530e-03],
       [3.88977230e-01, 7.66556860e-02, 5.34367000e-01],
       [1.31853385e-05, 8.71106000e-01, 1.28880770e-01],
       [3.46173380e-07, 9.99766900e-01, 2.32738440e-04],
       [2.62814160e-01, 5.87796500e-01, 1.49389360e-01],
       [9.32949500e-01, 4.66466730e-02, 2.04038080e-02],
       [9.83665170e-01, 4.02055540e-03, 1.23142380e-02],
       [7.27915500e-01, 1.49583000e-01, 1.22501460e-01],
       [6.56433900e-02, 8.57159440e-01, 7.71971600e-02],
       [2.38698750e-06, 2.57115

In [36]:
ovo5 = rocauc_ovo(y_sel_test_over, y_prob5, average="macro", multi_class="ovo")
ovo5

0.9579078244215861

In [37]:
ovr5 = rocauc_ovr(y_sel_test_over, y_prob5, average="macro", multi_class="ovr")
ovr5

0.9579078244215861

In [38]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_sel_train_over, X_sel_test_over, y_sel_train_over, y_sel_test_over = train_test_split(X_sel_over, y_sel_over,
                                                    test_size = 0.3,
                                                    random_state=678,
                                                    stratify=y_sel_over)

In [39]:
dat6 = pd.DataFrame(X_sel_test_over[:,-1])
dat6['test'] = y_sel_test_over

In [40]:
dat6

Unnamed: 0,0,test
0,GA48963,1
1,SR4187,2
2,NRS182,2
3,CFBREBSa125,2
4,NRS188,1
...,...,...
158,BCH-SA-05,0
159,NRS027,0
160,CFBREBSa123,0
161,NRS199,2


In [41]:
X_sel_train_over = X_sel_train_over[:,:-1]
X_sel_test_over = X_sel_test_over[:,:-1]

In [42]:
model2_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_sel_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [97]:
model2_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
model2_over2.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36dc90f0>

In [208]:
acc_test2_over2 = model2_over2.evaluate(X_sel_test_over, y_sel_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test2_over2*100))

over-sampling test accuracy: 83.44%


In [99]:
pred6 = model2_over2.predict_classes(X_sel_test_over)
pred6

array([1, 0, 2, 1, 0, 0, 1, 1, 2, 1, 0, 2, 0, 0, 1, 0, 2, 2, 2, 1, 2, 1,
       0, 1, 1, 2, 0, 2, 0, 2, 0, 0, 0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 1, 0,
       2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 0, 0, 1, 2, 0, 0, 1, 1, 2, 1, 0, 1,
       2, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 2, 0, 0, 1, 2, 1, 0, 1,
       0, 1, 2, 2, 2, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 2, 0, 0,
       2, 1, 0, 0, 0, 0, 0, 0, 0])

In [100]:
dat6['pred'] = pred6
dat6

Unnamed: 0,0,test,pred
0,GA48963,1,1
1,SR4187,2,0
2,NRS182,2,2
3,CFBREBSa125,2,1
4,NRS188,1,0
...,...,...,...
158,BCH-SA-05,0,0
159,NRS027,0,0
160,CFBREBSa123,0,0
161,NRS199,2,0


In [101]:
proba6 = model2_over2.predict_proba(X_sel_test_over)
dat_proba6 = pd.DataFrame(proba6)

In [102]:
dat_proba6

Unnamed: 0,0,1,2
0,0.000472,9.793897e-01,2.013863e-02
1,1.000000,4.588559e-09,3.666089e-07
2,0.012329,3.610478e-03,9.840609e-01
3,0.344447,5.116983e-01,1.438548e-01
4,0.731556,1.455345e-01,1.229093e-01
...,...,...,...
158,0.999211,1.880544e-04,6.010687e-04
159,0.999506,2.402587e-04,2.541915e-04
160,0.996627,2.101521e-04,3.162904e-03
161,0.583259,9.557659e-02,3.211643e-01


In [103]:
dat_proba6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba6.csv", index = False,
         header=None)

In [104]:
dat6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/6p11ST.csv", index = False,
         header=None)

In [212]:
hist2_over2 = model2_over2.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [213]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist2_over2.history['accuracy'])*100))

over-sampling train accuracy: 94.31%


In [43]:
df_proba6 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_lasso_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [44]:
df_proba6

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p0006kpresabs_qual,NRS249,2,1,1.888869e-01,5.108038e-01,3.003094e-01
1,p0006kpresabs_qual,NRS188,1,1,1.888869e-01,5.108038e-01,3.003094e-01
2,p0006kpresabs_qual,NRS232,2,2,4.222906e-01,7.029924e-02,5.074101e-01
3,p0006kpresabs_qual,NY439,2,2,3.558408e-04,2.976018e-04,9.993465e-01
4,p0006kpresabs_qual,GA27,2,1,3.940971e-01,4.184215e-01,1.874814e-01
...,...,...,...,...,...,...,...
984,p0017Skpresabs_qual,NRS252,0,0,7.239556e-01,2.760444e-01,1.176030e-09
985,p0017Skpresabs_qual,SR2852,1,1,1.052276e-07,9.999999e-01,1.101559e-28
986,p0017Skpresabs_qual,NRS108,1,1,1.540350e-17,1.000000e+00,9.011977e-16
987,p0017Skpresabs_qual,NRS202,0,0,6.888959e-01,3.111042e-01,2.228958e-09


In [45]:
y_prob6 = df_proba6[df_proba6['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob6 = y_prob6.to_numpy()
y_prob6

array([[4.71694540e-04, 9.79389700e-01, 2.01386330e-02],
       [9.99999640e-01, 4.58855900e-09, 3.66608930e-07],
       [1.23285720e-02, 3.61047780e-03, 9.84060940e-01],
       [3.44446800e-01, 5.11698300e-01, 1.43854830e-01],
       [7.31556300e-01, 1.45534460e-01, 1.22909285e-01],
       [9.99352400e-01, 1.81110990e-05, 6.29490650e-04],
       [1.92075320e-03, 9.87569400e-01, 1.05098240e-02],
       [1.90296410e-01, 4.87845700e-01, 3.21857960e-01],
       [1.12405255e-01, 2.58055800e-01, 6.29538950e-01],
       [1.80047610e-03, 7.55556460e-01, 2.42643000e-01],
       [6.89505200e-01, 1.87554820e-01, 1.22939980e-01],
       [2.06933800e-03, 3.26454130e-01, 6.71476540e-01],
       [9.28176460e-01, 5.24530220e-02, 1.93705170e-02],
       [9.90178350e-01, 6.37063500e-03, 3.45102260e-03],
       [1.53595830e-02, 9.81426540e-01, 3.21382540e-03],
       [9.99479000e-01, 7.68373500e-06, 5.13433600e-04],
       [3.83327220e-04, 7.64060700e-03, 9.91976100e-01],
       [4.55404070e-02, 3.42202

In [46]:
ovo6 = rocauc_ovo(y_sel_test_over, y_prob6, average="macro", multi_class="ovo")
ovo6

0.9378939754322017

In [47]:
ovr6 = rocauc_ovr(y_sel_test_over, y_prob6, average="macro", multi_class="ovr")
ovr6

0.9378939754322017

In [48]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_sel_train_over, X_sel_test_over, y_sel_train_over, y_sel_test_over = train_test_split(X_sel_over, y_sel_over,
                                                    test_size = 0.3,
                                                    random_state=789,
                                                    stratify=y_sel_over)

In [49]:
dat7 = pd.DataFrame(X_sel_test_over[:,-1])
dat7['test'] = y_sel_test_over

In [50]:
dat7

Unnamed: 0,0,test
0,CFBREBSa127,1
1,NRS145,0
2,CFBRSa66B,1
3,NRS204,1
4,BCH-SA-13,2
...,...,...
158,NRS233,2
159,NRS204,1
160,CFBRSa07,0
161,CFBREBSa117,1


In [51]:
X_sel_train_over = X_sel_train_over[:,:-1]
X_sel_test_over = X_sel_test_over[:,:-1]

In [109]:
model2_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_sel_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [110]:
model2_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
model2_over3.fit(X_sel_train_over, y_sel_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a373eb0b8>

In [246]:
acc_test2_over3 = model2_over3.evaluate(X_sel_test_over, y_sel_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test2_over3*100))

over-sampling test accuracy: 83.44%


In [112]:
pred7 = model2_over3.predict_classes(X_sel_test_over)
pred7

array([1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 2, 2, 1, 0, 0, 2, 2, 2, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 2, 1, 1, 0, 1, 1, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 0,
       0, 0, 1, 1, 2, 2, 2, 2, 1, 0, 2, 0, 0, 2, 1, 2, 1, 2, 1, 1, 1, 2,
       0, 0, 0, 1, 2, 0, 0, 1, 0, 2, 2, 2, 0, 1, 1, 1, 1, 2, 0, 2, 1, 0,
       2, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 1, 0, 1,
       2, 0, 0, 2, 2, 1, 0, 0, 0])

In [113]:
dat7['pred'] = pred7
dat7

Unnamed: 0,0,test,pred
0,CFBREBSa127,1,1
1,NRS145,0,0
2,CFBRSa66B,1,1
3,NRS204,1,1
4,BCH-SA-13,2,1
...,...,...,...
158,NRS233,2,2
159,NRS204,1,1
160,CFBRSa07,0,0
161,CFBREBSa117,1,0


In [114]:
proba7 = model2_over3.predict_proba(X_sel_test_over)
dat_proba7 = pd.DataFrame(proba7)

In [115]:
dat_proba7

Unnamed: 0,0,1,2
0,0.002984,0.993895,0.003121
1,0.984843,0.013513,0.001644
2,0.057898,0.705047,0.237055
3,0.011230,0.899893,0.088877
4,0.049694,0.910956,0.039350
...,...,...,...
158,0.000002,0.000338,0.999660
159,0.011230,0.899893,0.088877
160,0.993775,0.000064,0.006161
161,0.805753,0.148299,0.045948


In [116]:
dat_proba7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba7.csv", index = False,
         header=None)

In [117]:
dat7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/7p11ST.csv", index = False,
         header=None)

In [250]:
hist2_over3 = model2_over3.fit(X_sel_train_over, y_sel_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [251]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist2_over3.history['accuracy'])*100))

over-sampling train accuracy: 92.63%


In [52]:
df_proba7 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_lasso_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [53]:
df_proba7

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p0006kpresabs_qual,NRS210,0,0,6.132076e-01,2.812180e-01,1.055744e-01
1,p0006kpresabs_qual,NRS205,2,2,1.993202e-04,6.834937e-07,9.998000e-01
2,p0006kpresabs_qual,312,2,1,3.589463e-01,3.982787e-01,2.427750e-01
3,p0006kpresabs_qual,GA15,2,1,3.589463e-01,3.982787e-01,2.427750e-01
4,p0006kpresabs_qual,SR4035,0,1,3.589463e-01,3.982787e-01,2.427750e-01
...,...,...,...,...,...,...,...
984,p0017Skpresabs_qual,NRS383,1,0,5.477194e-01,4.522807e-01,1.761374e-08
985,p0017Skpresabs_qual,NRS218,1,1,6.953657e-05,9.999305e-01,3.132419e-10
986,p0017Skpresabs_qual,NRS209,2,2,2.713214e-09,6.656316e-09,1.000000e+00
987,p0017Skpresabs_qual,SR2852,1,1,9.956684e-12,1.000000e+00,7.441288e-26


In [54]:
y_prob7 = df_proba7[df_proba7['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob7 = y_prob7.to_numpy()
y_prob7

array([[2.98438450e-03, 9.93894500e-01, 3.12100700e-03],
       [9.84843130e-01, 1.35127930e-02, 1.64416230e-03],
       [5.78978600e-02, 7.05047100e-01, 2.37055060e-01],
       [1.12300610e-02, 8.99893100e-01, 8.88768800e-02],
       [4.96935430e-02, 9.10956260e-01, 3.93501370e-02],
       [9.80025650e-01, 4.50999480e-03, 1.54642640e-02],
       [9.99996400e-01, 3.38867200e-06, 2.77013640e-07],
       [9.99754250e-01, 5.15126700e-05, 1.94318390e-04],
       [9.16462700e-02, 5.82356450e-01, 3.25997350e-01],
       [8.05752900e-01, 1.48298830e-01, 4.59483340e-02],
       [1.85752570e-01, 5.78999040e-01, 2.35248360e-01],
       [2.98438450e-03, 9.93894500e-01, 3.12100700e-03],
       [6.24211200e-02, 5.78912260e-01, 3.58666660e-01],
       [1.72049780e-04, 1.63745750e-04, 9.99664200e-01],
       [2.21175800e-01, 5.84191500e-01, 1.94632660e-01],
       [7.39271640e-02, 4.86829340e-01, 4.39243470e-01],
       [9.95143530e-01, 7.18895750e-04, 4.13758500e-03],
       [9.23220700e-02, 4.88149

In [55]:
ovo7 = rocauc_ovo(y_sel_test_over, y_prob7, average="macro", multi_class="ovo")
ovo7

0.9534383076432006

In [56]:
ovr7 = rocauc_ovr(y_sel_test_over, y_prob7, average="macro", multi_class="ovr")
ovr7

0.9534383076432006

In [57]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_sel_train_over, X_sel_test_over, y_sel_train_over, y_sel_test_over = train_test_split(X_sel_over, y_sel_over,
                                                    test_size = 0.3,
                                                    random_state=890,
                                                    stratify=y_sel_over)

In [58]:
dat8 = pd.DataFrame(X_sel_test_over[:,-1])
dat8['test'] = y_sel_test_over

In [59]:
dat8

Unnamed: 0,0,test
0,SR2852,2
1,NRS054,1
2,NRS157,2
3,NY224,1
4,NRS070,1
...,...,...
158,NY417,2
159,NRS051,1
160,NRS226,1
161,EUH13,0


In [60]:
X_sel_train_over = X_sel_train_over[:,:-1]
X_sel_test_over = X_sel_test_over[:,:-1]

In [102]:
model2_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_sel_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [123]:
model2_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [124]:
model2_over4.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a357a75f8>

In [279]:
acc_test2_over4 = model2_over4.evaluate(X_sel_test_over, y_sel_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test2_over4*100))

over-sampling test accuracy: 81.60%


In [125]:
pred8 = model2_over4.predict_classes(X_sel_test_over)
pred8

array([2, 1, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 1, 1, 0, 2, 1, 2, 0, 0, 1,
       2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 1, 0, 0, 2, 1, 0, 2, 1, 1, 0,
       0, 2, 2, 0, 1, 2, 1, 0, 1, 1, 1, 1, 0, 1, 2, 1, 0, 2, 2, 0, 0, 0,
       0, 1, 1, 0, 2, 0, 1, 2, 1, 1, 1, 2, 1, 0, 0, 0, 2, 2, 2, 1, 2, 2,
       0, 2, 0, 2, 0, 2, 2, 1, 2, 0, 2, 1, 1, 2, 0, 1, 1, 2, 2, 0, 2, 2,
       1, 1, 1, 0, 0, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0,
       2, 2, 2, 2, 2, 1, 1, 2, 0])

In [126]:
dat8['pred'] = pred8
dat8

Unnamed: 0,0,test,pred
0,SR2852,2,2
1,NRS054,1,1
2,NRS157,2,2
3,NY224,1,2
4,NRS070,1,1
...,...,...,...
158,NY417,2,2
159,NRS051,1,1
160,NRS226,1,1
161,EUH13,0,2


In [127]:
proba8 = model2_over4.predict_proba(X_sel_test_over)
dat_proba8 = pd.DataFrame(proba8)

In [128]:
dat_proba8

Unnamed: 0,0,1,2
0,5.758889e-03,4.763594e-01,0.517882
1,9.463946e-04,7.362460e-01,0.262808
2,1.785490e-03,2.730639e-02,0.970908
3,3.112390e-02,7.180278e-02,0.897073
4,2.501458e-06,9.983050e-01,0.001693
...,...,...,...
158,3.112390e-02,7.180278e-02,0.897073
159,1.700311e-02,9.685482e-01,0.014449
160,4.715491e-05,9.585645e-01,0.041388
161,4.814243e-07,6.112870e-10,1.000000


In [129]:
dat_proba8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba8.csv", index = False,
         header=None)

In [130]:
dat8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/8p11ST.csv", index = False,
         header=None)

In [283]:
hist2_over4 = model2_over4.fit(X_sel_train_over, y_sel_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_sel_test_over, y_sel_test_over))

Train on 380 samples, validate on 163 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [284]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist2_over4.history['accuracy'])*100))

over-sampling train accuracy: 91.59%


In [61]:
df_proba8 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_lasso_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [62]:
df_proba8

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p0006kpresabs_qual,NRS236,1,2,1.321970e-02,2.446264e-01,7.421539e-01
1,p0006kpresabs_qual,NRS113,2,2,3.478230e-02,2.806685e-01,6.845492e-01
2,p0006kpresabs_qual,CFBRSa23,0,0,4.090251e-01,3.405008e-01,2.504741e-01
3,p0006kpresabs_qual,NRS249,2,1,1.987907e-01,5.331044e-01,2.681049e-01
4,p0006kpresabs_qual,107,1,0,4.090251e-01,3.405008e-01,2.504741e-01
...,...,...,...,...,...,...,...
984,p0017Skpresabs_qual,CFBRSa30,0,0,7.207667e-01,2.792331e-01,2.571588e-07
985,p0017Skpresabs_qual,NRS383,1,0,6.129044e-01,3.870795e-01,1.601290e-05
986,p0017Skpresabs_qual,NRS110,2,2,3.260306e-07,7.910664e-07,9.999989e-01
987,p0017Skpresabs_qual,NRS209,2,2,3.604249e-12,2.698129e-07,9.999998e-01


In [63]:
y_prob8 = df_proba8[df_proba8['phage']=='p11kpresabsSTCC_qual'].iloc[:,-3:]
y_prob8 = y_prob8.to_numpy()
y_prob8

array([[5.75888860e-03, 4.76359430e-01, 5.17881750e-01],
       [9.46394600e-04, 7.36246050e-01, 2.62807550e-01],
       [1.78548960e-03, 2.73063930e-02, 9.70908100e-01],
       [3.11238950e-02, 7.18027800e-02, 8.97073300e-01],
       [2.50145800e-06, 9.98304960e-01, 1.69252790e-03],
       [2.07022240e-04, 9.94086500e-01, 5.70646800e-03],
       [1.34257070e-02, 1.39981250e-01, 8.46592960e-01],
       [8.06272400e-01, 1.56548770e-02, 1.78072770e-01],
       [2.40216720e-05, 2.15479170e-02, 9.78428070e-01],
       [9.78429800e-01, 7.14531400e-03, 1.44248900e-02],
       [5.60953430e-02, 3.99860350e-01, 5.44044300e-01],
       [2.99292150e-04, 1.10775860e-03, 9.98592900e-01],
       [4.99543800e-02, 3.52227630e-01, 5.97818100e-01],
       [9.46394600e-04, 7.36246050e-01, 2.62807550e-01],
       [5.50834760e-02, 5.23399400e-01, 4.21517100e-01],
       [9.84205300e-01, 6.10456000e-04, 1.51843260e-02],
       [1.58854690e-02, 1.61717610e-03, 9.82497400e-01],
       [2.53534400e-03, 6.31727

In [64]:
ovo8 = rocauc_ovo(y_sel_test_over, y_prob8, average="macro", multi_class="ovo")
ovo8

0.8725504278256572

In [65]:
ovr8 = rocauc_ovr(y_sel_test_over, y_prob8, average="macro", multi_class="ovr")
ovr8

0.8725504278256572

In [66]:
ovos2 = [ovo5, ovo6, ovo7, ovo8]
np.mean(ovos2)

0.9304476338306613

In [67]:
np.std(ovos2)

0.03424239921529013

In [68]:
ovrs2 = [ovr5, ovr6, ovr7, ovr8]
np.mean(ovrs2)

0.9304476338306613

In [69]:
np.std(ovrs2)

0.03424239921529013

In [285]:
accs_l_over = [acc_test2_over, acc_test2_over2, acc_test2_over3, acc_test2_over4]

In [286]:
mean_l_over = np.mean(accs_l_over)
print('over-sampling test accuracy mean after lasso: %.2f%%' % (mean_l_over*100))

over-sampling test accuracy mean after lasso: 83.90%


In [287]:
std_l_over = np.std(accs_l_over)
print('over-sampling test accuracy standard deviation after lasso:', std_l_over)

over-sampling test accuracy standard deviation after lasso: 0.020056275911380646


In [288]:
accs_train_l_over = [np.mean(hist2_over.history['accuracy']), np.mean(hist2_over2.history['accuracy']), np.mean(hist2_over3.history['accuracy']),
             np.mean(hist2_over4.history['accuracy'])]

In [289]:
mean_train_l_over = np.mean(accs_train_l_over)
print('over-sampling train accuracy mean after lasso: %.2f%%' % (mean_train_l_over*100))

over-sampling train accuracy mean after lasso: 93.36%


In [290]:
std_train_l_over = np.std(accs_train_l_over)
print('over-sampling train accuracy standard deviation after lasso:', std_train_l_over)

over-sampling train accuracy standard deviation after lasso: 0.013198962
