In [None]:
## This file implements neural networks with/without dropout and regularizer for p003ppresabs_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for each model.

In [1]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [2]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/p003ppresabs_qual.csv')
df.shape

(253, 1091)

In [3]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [4]:
df['pheno']

0      0
1      0
2      1
3      0
4      0
      ..
248    0
249    0
250    0
251    0
252    0
Name: pheno, Length: 253, dtype: int64

In [5]:
df.head()

Unnamed: 0,id,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATG,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGA,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAA,TTTTTTCTTTTCATAACTGTGTTGGAAATGAATTAAATTAACAGCTCTTTGTGCTTTACGGTGTGTTGC,TTTTTTCAGCATTGTCTACATTACTTAACATTCGTGTTTGTAAGTAATATTGACCGCCAATATTTAGACACTTTATAAGTATGCCATTCATCATTTTTAA,TTTTTTATCTCACCAATTTTGTAATACATCGTTCTCGTCCTCCTTGTCTTCTTCGTCCTCCTCGTTATCTTCTTCGTTTTGTAATTCATAAATTTTGTTT,TTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAAA,TTTTTTAGGTACC,TTTTTGCATTCA,...,group_8042,group_8177,group_8643,group_8644,group_8645,group_8646,group_8815,group_8892,group_9489,pheno
0,107,1,1,1,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,109,1,1,1,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,115,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,120335,1,1,1,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,120337,1,1,1,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df['pheno'].value_counts()

0    224
1     26
2      3
Name: pheno, dtype: int64

In [7]:
df_clean = df.drop(columns=['id'])

In [8]:
df_clean.shape

(253, 1090)

In [9]:
df_clean.head()

Unnamed: 0,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATG,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGA,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAA,TTTTTTCTTTTCATAACTGTGTTGGAAATGAATTAAATTAACAGCTCTTTGTGCTTTACGGTGTGTTGC,TTTTTTCAGCATTGTCTACATTACTTAACATTCGTGTTTGTAAGTAATATTGACCGCCAATATTTAGACACTTTATAAGTATGCCATTCATCATTTTTAA,TTTTTTATCTCACCAATTTTGTAATACATCGTTCTCGTCCTCCTTGTCTTCTTCGTCCTCCTCGTTATCTTCTTCGTTTTGTAATTCATAAATTTTGTTT,TTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAAA,TTTTTTAGGTACC,TTTTTGCATTCA,TTTTTGAAAATAATCATTAGCTTGCTCACTATATAATTTGATGAATATATTTCGTGAAAGTGGGTATTTATTTAATGATTATTCTATATATGATAGTATA,...,group_8042,group_8177,group_8643,group_8644,group_8645,group_8646,group_8815,group_8892,group_9489,pheno
0,1,1,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,1,1,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 1090) (253,)


In [11]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

Using TensorFlow backend.


[(0, 224), (1, 224), (2, 224)]




In [12]:
############# Fully-Connected Neural Network ################

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [14]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [15]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [16]:
dat

Unnamed: 0,0,test
0,NRS265,1
1,GA984,0
2,NRS119,0
3,NRS249,1
4,NRS255,2
...,...,...
197,NRS035,1
198,NRS387,1
199,NRS222,0
200,BCH-SA-11,1


In [17]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36b914e0>

In [27]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 96.04%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([1, 0, 0, 1, 2, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 2, 0, 0, 2, 2, 2, 0,
       0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 1, 2, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 1, 0,
       0, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 1, 2, 2, 0, 2, 1, 2, 2, 1, 1, 0,
       2, 1, 2, 2, 0, 0, 2, 2, 1, 1, 0, 0, 1, 2, 0, 0, 2, 1, 2, 2, 1, 1,
       1, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 2, 2, 1, 1, 2, 0, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 0, 2,
       1, 0, 2, 1, 0, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 1, 1, 2, 1, 1, 2, 0,
       1, 2, 2, 1, 0, 2, 2, 2, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1,
       1, 0, 1, 2])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,NRS265,1,1
1,GA984,0,0
2,NRS119,0,0
3,NRS249,1,1
4,NRS255,2,2
...,...,...,...
197,NRS035,1,1
198,NRS387,1,1
199,NRS222,0,0
200,BCH-SA-11,1,1


In [23]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [24]:
dat_proba1

Unnamed: 0,0,1,2
0,5.285770e-04,0.999036,4.354483e-04
1,9.999979e-01,0.000002,2.910657e-10
2,9.995348e-01,0.000465,4.638188e-13
3,5.523895e-03,0.994476,9.884415e-09
4,3.970241e-06,0.000202,9.997938e-01
...,...,...,...
197,1.956496e-03,0.997029,1.015072e-03
198,5.211424e-03,0.994770,1.875819e-05
199,9.999911e-01,0.000009,1.052898e-09
200,1.581237e-02,0.984188,3.957302e-10


In [25]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [26]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1p003pp.csv", index = False,
         header=None)

In [31]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [32]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [18]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [19]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [20]:
y_prob = df_proba[df_proba['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[5.28576960e-04, 9.99036100e-01, 4.35448280e-04],
       [9.99997850e-01, 2.13410570e-06, 2.91065700e-10],
       [9.99534850e-01, 4.65188260e-04, 4.63818800e-13],
       [5.52389500e-03, 9.94476140e-01, 9.88441500e-09],
       [3.97024100e-06, 2.02186860e-04, 9.99793800e-01],
       [4.92807200e-04, 9.99507200e-01, 1.81465720e-08],
       [5.64873600e-04, 9.83086350e-01, 1.63487620e-02],
       [1.38570110e-02, 9.86120200e-01, 2.27580700e-05],
       [3.97024100e-06, 2.02186860e-04, 9.99793800e-01],
       [9.99372540e-01, 6.27344000e-04, 1.44910360e-07],
       [9.87894200e-03, 9.90121100e-01, 1.14227430e-09],
       [9.99950500e-01, 4.94118030e-05, 1.90908990e-13],
       [5.21142800e-03, 9.94769900e-01, 1.87581700e-05],
       [9.99994900e-01, 5.08587530e-06, 3.79110880e-09],
       [9.95335900e-04, 9.98410000e-01, 5.94596150e-04],
       [3.97024100e-06, 2.02186860e-04, 9.99793800e-01],
       [9.93308540e-01, 6.69149450e-03, 1.96822040e-11],
       [9.99961730e-01, 3.79842

In [21]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [22]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9915975677169707

In [23]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [24]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9915975677169707

In [25]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [26]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [27]:
dat2

Unnamed: 0,0,test
0,CFBRSa05,0
1,NRS114,0
2,NRS168,1
3,NRS255,2
4,NRS209,2
...,...,...
197,NRS196,0
198,NRS255,2
199,NRS249,1
200,NRS209,2


In [28]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [31]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [32]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a371b9320>

In [40]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 94.55%


In [34]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([0, 0, 1, 2, 2, 2, 2, 1, 2, 2, 0, 1, 0, 0, 2, 1, 0, 2, 1, 2, 2, 2,
       0, 2, 0, 1, 2, 2, 2, 2, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 0,
       2, 0, 1, 2, 2, 2, 1, 0, 0, 2, 2, 1, 1, 2, 0, 2, 0, 0, 1, 0, 1, 1,
       1, 2, 1, 2, 0, 1, 2, 1, 0, 0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 0, 0, 1,
       1, 2, 0, 1, 2, 1, 0, 1, 0, 2, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 1, 0,
       2, 0, 1, 1, 1, 2, 0, 2, 1, 1, 0, 2, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1,
       1, 0, 1, 1, 1, 1, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 1, 0, 1, 2, 2, 2,
       0, 2, 1, 1, 0, 2, 2, 2, 1, 1, 0, 2, 1, 2, 0, 0, 2, 0, 2, 2, 2, 1,
       1, 1, 2, 1, 2, 0, 1, 0, 1, 2, 1, 2, 1, 1, 0, 0, 1, 1, 2, 2, 1, 0,
       2, 1, 2, 0])

In [35]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,CFBRSa05,0,0
1,NRS114,0,0
2,NRS168,1,1
3,NRS255,2,2
4,NRS209,2,2
...,...,...,...
197,NRS196,0,0
198,NRS255,2,2
199,NRS249,1,1
200,NRS209,2,2


In [36]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [37]:
dat_proba2

Unnamed: 0,0,1,2
0,1.000000,1.643428e-07,5.775328e-08
1,0.999949,5.072287e-05,8.993385e-09
2,0.071051,9.283946e-01,5.545159e-04
3,0.000097,2.764877e-05,9.998754e-01
4,0.000002,7.668140e-05,9.999214e-01
...,...,...,...
197,0.999568,4.305906e-04,1.676442e-06
198,0.000097,2.764872e-05,9.998754e-01
199,0.042857,9.571427e-01,6.215257e-07
200,0.000002,7.668147e-05,9.999214e-01


In [38]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [39]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2p003pp.csv", index = False,
         header=None)

In [44]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [45]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 99.88%


In [29]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [30]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [31]:
y_prob2 = df_proba2[df_proba2['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[9.99999900e-01, 1.64342810e-07, 5.77532760e-08],
       [9.99949200e-01, 5.07228700e-05, 8.99338500e-09],
       [7.10509050e-02, 9.28394600e-01, 5.54515900e-04],
       [9.69614340e-05, 2.76487680e-05, 9.99875400e-01],
       [1.82985890e-06, 7.66814000e-05, 9.99921440e-01],
       [9.69614340e-05, 2.76487680e-05, 9.99875400e-01],
       [1.82985890e-06, 7.66814000e-05, 9.99921440e-01],
       [9.57064100e-04, 9.98357100e-01, 6.85767560e-04],
       [1.82985890e-06, 7.66814000e-05, 9.99921440e-01],
       [1.75247420e-05, 7.76682630e-04, 9.99205770e-01],
       [9.99999640e-01, 2.41265700e-07, 1.03096234e-07],
       [1.74968410e-03, 9.98245360e-01, 5.04387600e-06],
       [9.99997600e-01, 2.23030360e-06, 1.45069320e-07],
       [9.99962700e-01, 1.59491570e-05, 2.13460150e-05],
       [9.69614340e-05, 2.76487680e-05, 9.99875400e-01],
       [7.10509050e-02, 9.28394600e-01, 5.54515900e-04],
       [9.99961850e-01, 3.76090900e-05, 6.27830050e-07],
       [9.69614340e-05, 2.76487

In [32]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9874050228162022

In [33]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9874050228162022

In [34]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [35]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [36]:
dat3

Unnamed: 0,0,test
0,NRS148,2
1,NRS209,2
2,NRS187,1
3,CFBREBSa116,0
4,NRS187,1
...,...,...
197,GA12,0
198,NRS209,2
199,NRS265,1
200,NRS253,1


In [37]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [44]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [45]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a374e32b0>

In [53]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 97.52%


In [47]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([2, 2, 1, 0, 1, 2, 2, 1, 2, 1, 1, 0, 2, 1, 0, 1, 0, 1, 2, 1, 0, 0,
       0, 0, 1, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 0, 0, 1, 0, 1, 2,
       0, 1, 1, 0, 1, 0, 0, 2, 2, 1, 0, 1, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2,
       2, 2, 1, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1, 1, 0, 1, 2, 1, 1, 1, 1,
       2, 2, 2, 1, 0, 1, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 0, 1, 0, 0,
       0, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 1, 2, 0, 2, 0, 1,
       0, 0, 2, 2, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 0, 2, 1, 2, 0,
       0, 2, 1, 1, 1, 2, 1, 0, 0, 2, 2, 0, 1, 2, 2, 0, 2, 1, 1, 0, 2, 0,
       1, 2, 0, 1, 0, 1, 1, 2, 1, 2, 0, 2, 2, 2, 0, 0, 1, 2, 0, 1, 1, 0,
       2, 1, 1, 0])

In [48]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,NRS148,2,2
1,NRS209,2,2
2,NRS187,1,1
3,CFBREBSa116,0,0
4,NRS187,1,1
...,...,...,...
197,GA12,0,0
198,NRS209,2,2
199,NRS265,1,1
200,NRS253,1,1


In [49]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [50]:
dat_proba3

Unnamed: 0,0,1,2
0,2.889818e-06,0.001785,9.982117e-01
1,4.162348e-08,0.000037,9.999627e-01
2,5.155065e-03,0.994831,1.384165e-05
3,9.999961e-01,0.000004,8.616147e-09
4,5.155065e-03,0.994831,1.384165e-05
...,...,...,...
197,9.968407e-01,0.003159,1.855058e-12
198,4.162348e-08,0.000037,9.999627e-01
199,9.772972e-05,0.999632,2.701283e-04
200,8.003599e-06,0.999955,3.659078e-05


In [51]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [52]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3p003pp.csv", index = False,
         header=None)

In [57]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [58]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 99.94%


In [38]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [39]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [40]:
y_prob3 = df_proba3[df_proba3['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[2.88981780e-06, 1.78543220e-03, 9.98211740e-01],
       [4.16234830e-08, 3.72347860e-05, 9.99962700e-01],
       [5.15506460e-03, 9.94831000e-01, 1.38416530e-05],
       [9.99996070e-01, 3.96543150e-06, 8.61614700e-09],
       [5.15506460e-03, 9.94831000e-01, 1.38416530e-05],
       [4.16234830e-08, 3.72347860e-05, 9.99962700e-01],
       [3.16537630e-05, 5.86198700e-05, 9.99909760e-01],
       [5.48832900e-03, 9.94511600e-01, 1.48546230e-08],
       [2.88981780e-06, 1.78543220e-03, 9.98211740e-01],
       [9.77297240e-05, 9.99632100e-01, 2.70128280e-04],
       [3.61525380e-03, 9.96367100e-01, 1.76307350e-05],
       [9.99970560e-01, 2.94614130e-05, 1.06401220e-08],
       [4.16234830e-08, 3.72347860e-05, 9.99962700e-01],
       [9.06229100e-03, 9.90733500e-01, 2.04258700e-04],
       [9.99993800e-01, 6.24100800e-06, 1.11336890e-12],
       [5.48832900e-03, 9.94511600e-01, 1.48546230e-08],
       [9.99999050e-01, 9.19617300e-07, 8.25180400e-09],
       [1.95766200e-04, 9.99113

In [41]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.999632827149655

In [42]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.999632827149655

In [43]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [44]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [45]:
dat4

Unnamed: 0,0,test
0,CFBRSa04,0
1,NRS021,0
2,NRS073,0
3,NRS049,0
4,CA541,1
...,...,...
197,NRS387,1
198,SR1746,0
199,NRS148,2
200,NRS255,2


In [46]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [57]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [58]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [59]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a37bff898>

In [66]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 99.01%


In [60]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 1, 0, 2, 1, 2, 0,
       0, 1, 0, 0, 0, 2, 1, 1, 0, 0, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2, 0, 0,
       0, 1, 2, 1, 1, 0, 1, 2, 0, 1, 2, 1, 1, 1, 2, 1, 0, 2, 2, 2, 0, 0,
       2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 0, 1, 1, 0, 2, 2, 0, 2,
       2, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 1, 1, 2, 0, 2, 1, 1, 1, 2, 0, 0,
       0, 1, 2, 1, 1, 2, 0, 1, 2, 0, 0, 2, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 2, 1, 0, 1, 1, 1, 2, 2, 2, 2,
       1, 2, 1, 2, 0, 2, 0, 0, 2, 1, 1, 0, 1, 0, 2, 0, 0, 0, 2, 0, 2, 1,
       2, 2, 1, 2, 1, 2, 2, 1, 0, 2, 2, 0, 0, 2, 1, 0, 0, 2, 2, 2, 0, 1,
       0, 2, 2, 1])

In [61]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,CFBRSa04,0,0
1,NRS021,0,0
2,NRS073,0,0
3,NRS049,0,0
4,CA541,1,1
...,...,...,...
197,NRS387,1,1
198,SR1746,0,0
199,NRS148,2,2
200,NRS255,2,2


In [62]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [63]:
dat_proba4

Unnamed: 0,0,1,2
0,0.999899,0.000101,2.991748e-08
1,0.999117,0.000883,3.567869e-11
2,0.997437,0.002562,8.573055e-11
3,0.659272,0.340320,4.075804e-04
4,0.002930,0.997069,4.911396e-07
...,...,...,...
197,0.002469,0.997528,2.131637e-06
198,0.983707,0.016245,4.803907e-05
199,0.000003,0.001290,9.987069e-01
200,0.000053,0.000050,9.998962e-01


In [64]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [65]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4p003pp.csv", index = False,
         header=None)

In [70]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [71]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 99.96%


In [47]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [48]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [49]:
y_prob4 = df_proba4[df_proba4['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[9.99899400e-01, 1.00561505e-04, 2.99174800e-08],
       [9.99116840e-01, 8.83180500e-04, 3.56786930e-11],
       [9.97437500e-01, 2.56248630e-03, 8.57305500e-11],
       [6.59272300e-01, 3.40320100e-01, 4.07580400e-04],
       [2.93028100e-03, 9.97069300e-01, 4.91139600e-07],
       [1.02697750e-04, 9.99897240e-01, 1.44443820e-10],
       [4.52524700e-05, 9.99878500e-01, 7.61285700e-05],
       [1.36875260e-03, 9.98611800e-01, 1.94395600e-05],
       [9.99500300e-01, 4.99646200e-04, 1.37017920e-07],
       [9.99695200e-01, 3.04887300e-04, 4.75835620e-08],
       [9.99886040e-01, 1.13362010e-04, 6.37574400e-07],
       [9.67339200e-01, 3.26603650e-02, 4.07020100e-07],
       [5.34271530e-05, 5.04083300e-05, 9.99896170e-01],
       [9.35940700e-06, 1.96173080e-05, 9.99971030e-01],
       [5.34271530e-05, 5.04083300e-05, 9.99896170e-01],
       [9.35940700e-06, 1.96173080e-05, 9.99971030e-01],
       [6.15032500e-07, 9.99993560e-01, 5.86841000e-06],
       [9.83794300e-01, 1.62057

In [50]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.9987890070345441

In [51]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.9987890070345441

In [52]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.994356106179343

In [53]:
np.std(ovos)

0.005084817807952712

In [54]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.994356106179343

In [55]:
np.std(ovrs)

0.005084817807952712

In [72]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [73]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 96.78%


In [74]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.016604466894944863


In [75]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [76]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 99.94%


In [77]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 0.00042312706


In [56]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=567,
                                                    stratify=y_over)

In [57]:
dat5 = pd.DataFrame(X_test_over[:,0])
dat5['test'] = y_test_over

In [58]:
dat5

Unnamed: 0,0,test
0,NRS255,2
1,NRS148,2
2,NRS209,2
3,NRS386,1
4,NRS230,0
...,...,...
197,NRS209,2
198,NRS209,2
199,SR4153,0
200,NRS255,2


In [59]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [70]:
#### add regularizer and dropout
model1_over5 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [71]:
model1_over5.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [72]:
model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a388b36d8>

In [93]:
acc_test_over5 = model1_over5.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over5*100))

over-sampling test accuracy: 97.52%


In [73]:
pred5 = model1_over5.predict_classes(X_test_over)
pred5

array([2, 2, 2, 1, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 2, 0, 1, 1, 2,
       2, 2, 0, 0, 0, 1, 2, 1, 2, 1, 1, 2, 1, 2, 0, 2, 0, 1, 1, 2, 2, 0,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 0,
       1, 2, 1, 0, 2, 0, 1, 2, 0, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 0, 1,
       2, 1, 0, 1, 0, 0, 1, 2, 0, 2, 1, 2, 0, 0, 0, 0, 1, 2, 0, 0, 2, 0,
       0, 2, 1, 0, 1, 2, 0, 2, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 1, 0, 0, 1,
       2, 2, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 0, 2,
       0, 0, 2, 1, 2, 0, 2, 1, 0, 2, 1, 1, 0, 1, 0, 2, 0, 0, 0, 0, 2, 1,
       1, 2, 1, 0, 2, 0, 2, 0, 1, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2,
       2, 0, 2, 2])

In [74]:
dat5['pred'] = pred5
dat5

Unnamed: 0,0,test,pred
0,NRS255,2,2
1,NRS148,2,2
2,NRS209,2,2
3,NRS386,1,1
4,NRS230,0,0
...,...,...,...
197,NRS209,2,2
198,NRS209,2,2
199,SR4153,0,0
200,NRS255,2,2


In [75]:
proba5 = model1_over5.predict_proba(X_test_over)
dat_proba5 = pd.DataFrame(proba5)

In [76]:
dat_proba5

Unnamed: 0,0,1,2
0,4.710171e-08,7.975358e-08,9.999999e-01
1,4.710171e-08,7.975358e-08,9.999999e-01
2,4.710171e-08,7.975358e-08,9.999999e-01
3,8.994908e-08,9.999999e-01,5.308003e-08
4,1.000000e+00,7.034752e-16,3.696870e-17
...,...,...,...
197,4.710171e-08,7.975358e-08,9.999999e-01
198,4.710171e-08,7.975358e-08,9.999999e-01
199,1.000000e+00,1.068874e-13,2.000986e-14
200,4.710171e-08,7.975358e-08,9.999999e-01


In [77]:
dat_proba5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba5.csv", index = False,
         header=None)

In [78]:
dat5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/5p003pp.csv", index = False,
         header=None)

In [97]:
hist1_over5 = model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [98]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over5.history['accuracy'])*100))

over-sampling train accuracy: 80.07%


In [60]:
df_proba5 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [61]:
df_proba5

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS241,1,1,1.342914e-03,9.986569e-01,2.348628e-07
1,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
2,p002ykpresabsSTCC_qual,NRS255,1,1,1.780311e-07,9.999999e-01,2.544841e-12
3,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,2.203547e-10,5.688883e-15
4,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,BCH-SA-12,0,0,1.000000e+00,1.152503e-09,1.898730e-09
1978,pyopresabsSTCC_qual,NRS049,0,1,8.401357e-11,1.000000e+00,3.209735e-13
1979,pyopresabsSTCC_qual,NRS022,0,0,1.000000e+00,4.755084e-10,1.974275e-10
1980,pyopresabsSTCC_qual,NRS236,1,1,1.357345e-08,1.000000e+00,1.293117e-10


In [62]:
y_prob5 = df_proba5[df_proba5['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob5 = y_prob5.to_numpy()
y_prob5

array([[4.71017070e-08, 7.97535800e-08, 9.99999900e-01],
       [4.71017070e-08, 7.97535800e-08, 9.99999900e-01],
       [4.71017070e-08, 7.97535800e-08, 9.99999900e-01],
       [8.99490800e-08, 9.99999900e-01, 5.30800330e-08],
       [1.00000000e+00, 7.03475160e-16, 3.69686970e-17],
       [1.00000000e+00, 5.92669660e-18, 3.55473970e-19],
       [5.75826350e-02, 2.67106100e-01, 6.75311270e-01],
       [9.65128000e-01, 3.48315100e-02, 4.05364630e-05],
       [4.71017070e-08, 7.97535800e-08, 9.99999900e-01],
       [1.00000000e+00, 3.55641340e-16, 1.04817090e-15],
       [9.65128000e-01, 3.48315100e-02, 4.05364630e-05],
       [1.00000000e+00, 4.26041030e-19, 1.48830050e-18],
       [4.71017070e-08, 7.97535800e-08, 9.99999900e-01],
       [1.00000000e+00, 6.81523950e-18, 2.44143200e-17],
       [5.74440400e-15, 1.00000000e+00, 2.23779920e-14],
       [1.00000000e+00, 2.13858530e-20, 4.76377420e-20],
       [1.00000000e+00, 2.86235920e-17, 1.88004120e-16],
       [4.71017070e-08, 7.97535

In [63]:
ovo5 = rocauc_ovo(y_test_over, y_prob5, average="macro", multi_class="ovo")
ovo5

0.9979454578957067

In [64]:
ovr5 = rocauc_ovr(y_test_over, y_prob5, average="macro", multi_class="ovr")
ovr5

0.9979454578957067

In [65]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=678,
                                                    stratify=y_over)

In [66]:
dat6 = pd.DataFrame(X_test_over[:,0])
dat6['test'] = y_test_over

In [67]:
dat6

Unnamed: 0,0,test
0,115,1
1,NRS209,2
2,GA984,0
3,NRS187,1
4,NRS148,2
...,...,...
197,NRS253,1
198,EUH15,0
199,NRS180,1
200,NRS266,1


In [68]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [83]:
model1_over6 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
   
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [84]:
model1_over6.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [85]:
model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3927e128>

In [110]:
acc_test_over6 = model1_over6.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over6*100))

over-sampling test accuracy: 96.04%


In [86]:
pred6 = model1_over6.predict_classes(X_test_over)
pred6

array([1, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 1, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0,
       0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 2, 0, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1,
       2, 2, 2, 0, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 2, 0,
       1, 1, 2, 2, 0, 2, 2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 2, 1, 0, 1, 1,
       0, 2, 2, 2, 2, 1, 0, 0, 2, 0, 2, 1, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2,
       0, 0, 0, 2, 1, 0, 1, 1, 2, 0, 2, 2, 1, 1, 0, 0, 2, 1, 1, 1, 1, 2,
       1, 0, 2, 2, 1, 2, 0, 1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1,
       1, 2, 0, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 2, 1, 2, 1, 0, 2, 0, 2, 2,
       1, 0, 2, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 0, 2, 1,
       0, 1, 1, 1])

In [87]:
dat6['pred'] = pred6
dat6

Unnamed: 0,0,test,pred
0,115,1,1
1,NRS209,2,2
2,GA984,0,0
3,NRS187,1,1
4,NRS148,2,2
...,...,...,...
197,NRS253,1,1
198,EUH15,0,0
199,NRS180,1,1
200,NRS266,1,1


In [88]:
proba6 = model1_over6.predict_proba(X_test_over)
dat_proba6 = pd.DataFrame(proba6)

In [89]:
dat_proba6

Unnamed: 0,0,1,2
0,1.848713e-07,9.999998e-01,4.958978e-08
1,5.694917e-08,8.750234e-08,9.999999e-01
2,1.000000e+00,4.182892e-09,1.750376e-09
3,3.422572e-08,1.000000e+00,1.106527e-09
4,8.769664e-08,1.260017e-07,9.999998e-01
...,...,...,...
197,1.208738e-12,1.000000e+00,5.136406e-13
198,1.000000e+00,8.836975e-13,6.095818e-13
199,5.182315e-08,1.000000e+00,2.894522e-08
200,1.586869e-08,1.000000e+00,1.713035e-08


In [90]:
dat_proba6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba6.csv", index = False,
         header=None)

In [91]:
dat6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/6p003pp.csv", index = False,
         header=None)

In [114]:
hist1_over6 = model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [115]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over6.history['accuracy'])*100))

over-sampling train accuracy: 78.73%


In [69]:
df_proba6 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [70]:
df_proba6

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,1.790400e-08,4.141849e-08,1.000000e+00
1,p002ykpresabsSTCC_qual,NRS386,1,1,5.739934e-04,9.994259e-01,6.773014e-08
2,p002ykpresabsSTCC_qual,NRS148,2,2,5.286934e-09,1.269109e-08,1.000000e+00
3,p002ykpresabsSTCC_qual,NRS178,0,1,6.494936e-12,1.000000e+00,2.537080e-25
4,p002ykpresabsSTCC_qual,NRS237,0,1,5.701098e-02,9.399204e-01,3.068583e-03
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS272,0,0,9.999607e-01,3.367024e-05,5.776848e-06
1978,pyopresabsSTCC_qual,NRS112,1,1,8.275442e-08,9.999999e-01,3.739556e-09
1979,pyopresabsSTCC_qual,NRS064,1,1,2.168245e-08,1.000000e+00,9.603962e-09
1980,pyopresabsSTCC_qual,BCH-SA-04,0,0,1.000000e+00,1.026408e-15,1.630406e-14


In [71]:
y_prob6 = df_proba6[df_proba6['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob6 = y_prob6.to_numpy()
y_prob6

array([[1.84871270e-07, 9.99999760e-01, 4.95897800e-08],
       [5.69491720e-08, 8.75023360e-08, 9.99999900e-01],
       [1.00000000e+00, 4.18289200e-09, 1.75037640e-09],
       [3.42257170e-08, 1.00000000e+00, 1.10652680e-09],
       [8.76966400e-08, 1.26001670e-07, 9.99999760e-01],
       [5.69491720e-08, 8.75023360e-08, 9.99999900e-01],
       [1.94755770e-08, 1.00000000e+00, 4.58318140e-09],
       [2.50032200e-05, 9.99974500e-01, 5.32822900e-07],
       [1.33338320e-08, 1.00000000e+00, 7.80653700e-09],
       [1.00000000e+00, 7.63802800e-09, 5.94583630e-09],
       [5.69491720e-08, 8.75023360e-08, 9.99999900e-01],
       [2.72812630e-10, 1.00000000e+00, 3.48323400e-11],
       [5.69491720e-08, 8.75023360e-08, 9.99999900e-01],
       [2.50032200e-05, 9.99974500e-01, 5.32822900e-07],
       [1.00000000e+00, 7.84103400e-10, 6.80646000e-10],
       [1.00000000e+00, 1.71795240e-09, 1.02533550e-09],
       [9.88913240e-01, 1.10144940e-02, 7.22992550e-05],
       [5.69491720e-08, 8.75023

In [72]:
ovo6 = rocauc_ovo(y_test_over, y_prob6, average="macro", multi_class="ovo")
ovo6

0.9929590880022546

In [73]:
ovr6 = rocauc_ovr(y_test_over, y_prob6, average="macro", multi_class="ovr")
ovr6

0.9929590880022546

In [74]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=789,
                                                    stratify=y_over)

In [75]:
dat7 = pd.DataFrame(X_test_over[:,0])
dat7['test'] = y_test_over

In [76]:
dat7

Unnamed: 0,0,test
0,NRS260,0
1,NRS148,2
2,NRS205,1
3,NRS064,1
4,NRS209,2
...,...,...
197,NRS255,2
198,GA53649,0
199,NRS209,2
200,NRS210,0


In [77]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [96]:
model1_over7 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [97]:
model1_over7.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a39de7e48>

In [127]:
acc_test_over7 = model1_over7.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over7*100))

over-sampling test accuracy: 96.04%


In [99]:
pred7 = model1_over7.predict_classes(X_test_over)
pred7

array([0, 2, 2, 1, 2, 1, 1, 1, 2, 0, 0, 1, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1,
       2, 0, 1, 2, 2, 2, 2, 1, 0, 1, 2, 1, 2, 1, 1, 1, 1, 0, 2, 0, 1, 1,
       0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 0, 0, 2, 0, 1,
       2, 2, 2, 0, 1, 1, 2, 2, 1, 0, 2, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1,
       0, 2, 1, 2, 0, 2, 2, 1, 2, 0, 0, 2, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 2, 2, 1, 0,
       1, 1, 0, 1, 2, 2, 2, 1, 2, 1, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2,
       1, 1, 1, 2, 1, 0, 1, 2, 2, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2,
       0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 0, 2, 2, 1, 1, 2, 2, 1, 2, 0, 2, 2,
       0, 2, 0, 0])

In [100]:
dat7['pred'] = pred7
dat7

Unnamed: 0,0,test,pred
0,NRS260,0,0
1,NRS148,2,2
2,NRS205,1,2
3,NRS064,1,1
4,NRS209,2,2
...,...,...,...
197,NRS255,2,2
198,GA53649,0,0
199,NRS209,2,2
200,NRS210,0,0


In [101]:
proba7 = model1_over7.predict_proba(X_test_over)
dat_proba7 = pd.DataFrame(proba7)

In [102]:
dat_proba7

Unnamed: 0,0,1,2
0,9.994460e-01,5.363244e-04,1.764490e-05
1,3.202002e-08,6.790426e-08,9.999999e-01
2,3.680430e-01,1.410613e-01,4.908957e-01
3,5.834393e-09,1.000000e+00,6.849045e-10
4,3.490408e-08,4.486665e-08,9.999999e-01
...,...,...,...
197,3.314651e-08,4.263710e-08,9.999999e-01
198,9.999968e-01,1.463843e-06,1.837797e-06
199,3.490415e-08,4.486665e-08,9.999999e-01
200,9.822029e-01,1.778526e-02,1.183655e-05


In [103]:
dat_proba7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba7.csv", index = False,
         header=None)

In [104]:
dat7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/7p003pp.csv", index = False,
         header=None)

In [131]:
hist1_over7 = model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [132]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over7.history['accuracy'])*100))

over-sampling train accuracy: 79.15%


In [78]:
df_proba7 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [79]:
df_proba7

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
1,p002ykpresabsSTCC_qual,BCH-SA-09,1,1,1.137139e-06,9.999988e-01,2.067601e-09
2,p002ykpresabsSTCC_qual,NRS224,0,0,1.000000e+00,2.093110e-31,0.000000e+00
3,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
4,p002ykpresabsSTCC_qual,NRS235,1,1,2.243513e-02,9.774035e-01,1.615106e-04
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS035,0,0,9.354528e-01,6.414209e-02,4.051121e-04
1978,pyopresabsSTCC_qual,NRS260,1,1,4.808470e-08,1.000000e+00,7.364639e-09
1979,pyopresabsSTCC_qual,CA9,0,0,1.000000e+00,2.361323e-08,2.871247e-08
1980,pyopresabsSTCC_qual,NRS183,1,1,2.755864e-07,9.999998e-01,5.310879e-08


In [80]:
y_prob7 = df_proba7[df_proba7['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob7 = y_prob7.to_numpy()
y_prob7

array([[9.99446000e-01, 5.36324400e-04, 1.76449010e-05],
       [3.20200150e-08, 6.79042600e-08, 9.99999900e-01],
       [3.68043040e-01, 1.41061260e-01, 4.90895750e-01],
       [5.83439340e-09, 1.00000000e+00, 6.84904470e-10],
       [3.49040830e-08, 4.48666500e-08, 9.99999900e-01],
       [6.52873800e-07, 9.99999400e-01, 2.69478100e-08],
       [1.00801790e-06, 9.99999050e-01, 1.17269650e-09],
       [6.52873800e-07, 9.99999400e-01, 2.69478100e-08],
       [3.49040830e-08, 4.48666500e-08, 9.99999900e-01],
       [1.00000000e+00, 2.92416010e-08, 3.14146500e-08],
       [1.00000000e+00, 3.35057800e-08, 1.04133310e-09],
       [1.12340330e-04, 9.99853400e-01, 3.42142230e-05],
       [3.49040830e-08, 4.48666500e-08, 9.99999900e-01],
       [3.62443530e-07, 9.99999500e-01, 8.39555100e-08],
       [3.31465130e-08, 4.26370200e-08, 9.99999900e-01],
       [1.00000000e+00, 9.44459400e-09, 1.12767170e-08],
       [3.20200150e-08, 6.79042600e-08, 9.99999900e-01],
       [9.99512430e-01, 4.85869

In [81]:
ovo7 = rocauc_ovo(y_test_over, y_prob7, average="macro", multi_class="ovo")
ovo7

0.9898654873779251

In [82]:
ovr7 = rocauc_ovr(y_test_over, y_prob7, average="macro", multi_class="ovr")
ovr7

0.9898654873779251

In [83]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=890,
                                                    stratify=y_over)

In [84]:
dat8 = pd.DataFrame(X_test_over[:,0])
dat8['test'] = y_test_over

In [85]:
dat8

Unnamed: 0,0,test
0,CA541,1
1,SR3585,0
2,NRS232,1
3,NRS148,2
4,NRS180,1
...,...,...
197,NRS209,2
198,NRS035,1
199,506,0
200,SR2091,0


In [86]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [109]:
model1_over8 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [110]:
model1_over8.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3a4b3518>

In [144]:
acc_test_over8 = model1_over8.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over8*100))

over-sampling test accuracy: 98.02%


In [112]:
pred8 = model1_over8.predict_classes(X_test_over)
pred8

array([1, 0, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 1, 0, 0, 2,
       1, 1, 2, 0, 1, 1, 1, 1, 2, 1, 0, 0, 1, 0, 1, 2, 2, 0, 2, 2, 2, 2,
       2, 2, 0, 0, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 0, 1, 2, 1, 2, 1, 0, 2,
       0, 1, 0, 2, 2, 0, 2, 1, 0, 2, 2, 2, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 2, 0, 1, 1, 0, 2, 0, 1,
       1, 2, 2, 2, 2, 1, 0, 1, 1, 2, 0, 0, 1, 1, 0, 1, 1, 2, 2, 1, 2, 1,
       0, 2, 1, 0, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 1, 0, 1, 2, 1, 0, 0, 2,
       1, 0, 0, 0, 2, 1, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 0, 0,
       2, 1, 2, 0, 1, 2, 2, 1, 0, 2, 1, 2, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2,
       1, 0, 0, 1])

In [113]:
dat8['pred'] = pred8
dat8

Unnamed: 0,0,test,pred
0,CA541,1,1
1,SR3585,0,0
2,NRS232,1,1
3,NRS148,2,2
4,NRS180,1,1
...,...,...,...
197,NRS209,2,2
198,NRS035,1,1
199,506,0,0
200,SR2091,0,0


In [114]:
proba8 = model1_over8.predict_proba(X_test_over)
dat_proba8 = pd.DataFrame(proba8)

In [115]:
dat_proba8

Unnamed: 0,0,1,2
0,5.049835e-07,9.999992e-01,3.384297e-07
1,1.000000e+00,3.947699e-11,7.612437e-11
2,1.787098e-07,9.999999e-01,5.476143e-08
3,9.462030e-09,3.260693e-08,1.000000e+00
4,1.225878e-07,9.999999e-01,4.815261e-08
...,...,...,...
197,7.304264e-08,7.608065e-08,9.999999e-01
198,1.169835e-07,9.999999e-01,3.569366e-08
199,1.000000e+00,4.985692e-13,1.622500e-12
200,9.967397e-01,3.254553e-03,5.816956e-06


In [116]:
dat_proba8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba8.csv", index = False,
         header=None)

In [117]:
dat8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/8p003pp.csv", index = False,
         header=None)

In [148]:
hist1_over8 = model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 470 samples, validate on 202 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [149]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over8.history['accuracy'])*100))

over-sampling train accuracy: 79.80%


In [87]:
df_proba8 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [88]:
df_proba8

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,CFBREBSa116,0,0,9.676203e-01,3.237956e-02,1.480166e-07
1,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,6.534852e-11,2.250731e-18
2,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
3,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
4,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS205,2,2,3.691095e-08,3.571927e-08,9.999999e-01
1978,pyopresabsSTCC_qual,CFBREBSa122,0,1,9.261665e-02,9.073822e-01,1.162373e-06
1979,pyopresabsSTCC_qual,NRS001,1,1,4.174278e-07,9.999995e-01,3.254024e-09
1980,pyopresabsSTCC_qual,NRS148,2,2,3.234670e-08,3.121212e-08,9.999999e-01


In [89]:
y_prob8 = df_proba8[df_proba8['phage']=='p003ppresabs_qual'].iloc[:,-3:]
y_prob8 = y_prob8.to_numpy()
y_prob8

array([[5.04983500e-07, 9.99999170e-01, 3.38429700e-07],
       [1.00000000e+00, 3.94769880e-11, 7.61243700e-11],
       [1.78709840e-07, 9.99999900e-01, 5.47614260e-08],
       [9.46203000e-09, 3.26069340e-08, 1.00000000e+00],
       [1.22587790e-07, 9.99999900e-01, 4.81526100e-08],
       [9.46203000e-09, 3.26069340e-08, 1.00000000e+00],
       [4.10628650e-08, 3.38463340e-08, 9.99999900e-01],
       [4.36488760e-02, 8.12866900e-01, 1.43484150e-01],
       [3.23510850e-08, 1.00000000e+00, 3.27703300e-09],
       [8.52893040e-07, 9.99998700e-01, 4.68896840e-07],
       [1.24812690e-06, 9.99998700e-01, 6.50669500e-08],
       [1.23115050e-06, 9.99998700e-01, 1.75076660e-07],
       [7.47876000e-09, 1.00000000e+00, 2.58419100e-09],
       [7.30427800e-08, 7.60807900e-08, 9.99999900e-01],
       [1.00000000e+00, 3.67538300e-10, 6.97490400e-10],
       [9.46203000e-09, 3.26069340e-08, 1.00000000e+00],
       [4.01261930e-07, 9.99999640e-01, 4.49401230e-08],
       [1.23115050e-06, 9.99998

In [90]:
ovo8 = rocauc_ovo(y_test_over, y_prob8, average="macro", multi_class="ovo")
ovo8

0.9974571586511886

In [91]:
ovr8 = rocauc_ovr(y_test_over, y_prob8, average="macro", multi_class="ovr")
ovr8

0.9974571586511886

In [92]:
ovos2 = [ovo5, ovo6, ovo7, ovo8]
np.mean(ovos2)

0.9945567979817688

In [93]:
np.std(ovos2)

0.003333773402070904

In [94]:
ovrs2 = [ovr5, ovr6, ovr7, ovr8]
np.mean(ovrs2)

0.9945567979817688

In [95]:
np.std(ovrs2)

0.003333773402070904

In [150]:
accs_reg = [acc_test_over5, acc_test_over6, acc_test_over7, acc_test_over8]

In [151]:
mean_reg = np.mean(accs_reg)
print('over-sampling test accuracy regularization mean: %.2f%%' % (mean_reg*100))

over-sampling test accuracy regularization mean: 96.91%


In [152]:
std_reg = np.std(accs_reg)
print('over-sampling test accuracy regularization standard deviation:', std_reg)

over-sampling test accuracy regularization standard deviation: 0.008838393669982979


In [153]:
accs_train_reg = [np.mean(hist1_over5.history['accuracy']), np.mean(hist1_over6.history['accuracy']), np.mean(hist1_over7.history['accuracy']),
             np.mean(hist1_over8.history['accuracy'])]

In [154]:
mean_train_reg = np.mean(accs_train_reg)
print('over-sampling train accuracy regularization mean: %.2f%%' % (mean_train_reg*100))

over-sampling train accuracy regularization mean: 79.44%


In [155]:
std_train_reg = np.std(accs_train_reg)
print('over-sampling train accuracy regularization standard deviation:', std_train_reg)

over-sampling train accuracy regularization standard deviation: 0.005295096
