In [None]:
## This file implements neural networks with/without dropout and regularizer for p0017Spresabs_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for each model.

In [1]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [2]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/p0017Spresabs_qual.csv')
df.shape

(253, 151)

In [3]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [4]:
df['pheno']

0      0
1      0
2      1
3      0
4      0
      ..
248    1
249    0
250    0
251    0
252    0
Name: pheno, Length: 253, dtype: int64

In [5]:
df.head()

Unnamed: 0,id,TTTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTT,TTTTAATACATAT,TTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTA,TTTATCTTTATGA,TTTAATTTAGTAAGT,TTTAAAAAGATGAATAATGTAAATGAAGTAAAGGTTATTATGAGAATTACAAAAGCTACATAAATTACTGTTAGTTTAAATTGAAATTTAAAAATGATAA,TTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTAC,TTCCATCGAATCAC,TTCATTTAATGGCTAAGGAAATTGTGCGATTCCACTCAATTATTTGGCCTATTTTATTGATGGCATTAGACTTACCGTTACCTAAAAAAGTCTTTGCACA,...,group_1148,group_1598,group_1687,group_3441,group_4225,group_4420,group_7795,group_8042,group_8892,pheno
0,107,1,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,109,1,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,115,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,120335,1,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,120337,1,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df['pheno'].value_counts()

0    216
1     35
2      2
Name: pheno, dtype: int64

In [7]:
df_clean = df.drop(columns=['id'])

In [8]:
df_clean.shape

(253, 150)

In [9]:
df_clean.head()

Unnamed: 0,TTTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTT,TTTTAATACATAT,TTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTA,TTTATCTTTATGA,TTTAATTTAGTAAGT,TTTAAAAAGATGAATAATGTAAATGAAGTAAAGGTTATTATGAGAATTACAAAAGCTACATAAATTACTGTTAGTTTAAATTGAAATTTAAAAATGATAA,TTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTAC,TTCCATCGAATCAC,TTCATTTAATGGCTAAGGAAATTGTGCGATTCCACTCAATTATTTGGCCTATTTTATTGATGGCATTAGACTTACCGTTACCTAAAAAAGTCTTTGCACA,TTCAAGAAGGAGA,...,group_1148,group_1598,group_1687,group_3441,group_4225,group_4420,group_7795,group_8042,group_8892,pheno
0,1,0,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 150) (253,)


In [11]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

Using TensorFlow backend.


[(0, 216), (1, 216), (2, 216)]




In [12]:
############# Fully-Connected Neural Network ################

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [14]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [15]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [16]:
dat

Unnamed: 0,0,test
0,312,1
1,CFBRSa27,0
2,BCH-SA-01,1
3,GA27,1
4,NRS209,2
...,...,...
190,NRS209,2
191,NRS235,0
192,NRS240,1
193,NRS110,2


In [17]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3655a438>

In [29]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 93.33%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([1, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0,
       2, 1, 1, 1, 0, 0, 0, 0, 2, 1, 2, 2, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 0, 2, 0, 1, 2, 2, 2, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 2, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 0, 2,
       1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 0, 2, 1, 1, 1, 0, 2, 1, 2,
       2, 1, 1, 1, 1, 2, 2, 2, 1, 0, 2, 1, 1, 2, 1, 0, 1, 0, 0, 2, 0, 2,
       0, 0, 0, 2, 2, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 2, 0, 2, 1, 2, 2, 2, 1, 1, 2, 1, 0, 2,
       0, 2, 2, 1, 2, 1, 0, 2, 2, 2, 2, 2, 1, 0, 2, 0, 1, 2, 1])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,312,1,1
1,CFBRSa27,0,0
2,BCH-SA-01,1,1
3,GA27,1,1
4,NRS209,2,2
...,...,...,...
190,NRS209,2,2
191,NRS235,0,0
192,NRS240,1,1
193,NRS110,2,2


In [23]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [24]:
dat_proba1

Unnamed: 0,0,1,2
0,0.204497,0.795491,1.126081e-05
1,0.999987,0.000013,1.308756e-08
2,0.081229,0.918770,2.954983e-07
3,0.388062,0.611937,1.651251e-06
4,0.000020,0.000181,9.997988e-01
...,...,...,...
190,0.000020,0.000181,9.997988e-01
191,0.999429,0.000571,2.958436e-07
192,0.070944,0.929047,9.029484e-06
193,0.000134,0.000033,9.998335e-01


In [25]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [26]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1p17sp.csv", index = False,
         header=None)

In [33]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [34]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 97.61%


In [18]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [19]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [20]:
y_prob = df_proba[df_proba['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[2.04497440e-01, 7.95491300e-01, 1.12608150e-05],
       [9.99986650e-01, 1.33010600e-05, 1.30875595e-08],
       [8.12294300e-02, 9.18770200e-01, 2.95498320e-07],
       [3.88061640e-01, 6.11936750e-01, 1.65125070e-06],
       [1.96548240e-05, 1.81490130e-04, 9.99798830e-01],
       [2.04497440e-01, 7.95491300e-01, 1.12608150e-05],
       [4.47418800e-12, 9.99999900e-01, 1.58818420e-07],
       [9.99899500e-01, 6.92032650e-06, 9.35408460e-05],
       [1.00000000e+00, 2.26961400e-09, 1.49822470e-09],
       [1.00000000e+00, 5.57250000e-08, 9.10458000e-09],
       [3.08712060e-06, 9.99996900e-01, 1.00634670e-09],
       [4.89200540e-07, 9.99999400e-01, 1.02903876e-07],
       [9.99954200e-01, 4.53907520e-05, 3.19960400e-07],
       [9.99995800e-01, 4.20490230e-06, 2.72388550e-08],
       [9.99375300e-01, 6.24327200e-04, 3.72049980e-07],
       [9.99799300e-01, 2.00552170e-04, 6.73128540e-08],
       [1.96548240e-05, 1.81490130e-04, 9.99798830e-01],
       [9.99999760e-01, 2.53480

In [21]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [22]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9776331360946745

In [23]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [24]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9776331360946745

In [25]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [26]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [27]:
dat2

Unnamed: 0,0,test
0,NRS110,2
1,NRS254,1
2,BCH-SA-09,0
3,NRS177,0
4,GA27,1
...,...,...
190,NRS001,1
191,NRS209,2
192,NRS272,0
193,NRS110,2


In [28]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [29]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [32]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36adf0b8>

In [46]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 92.82%


In [34]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([2, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 2, 0, 1, 1, 0, 0, 0,
       2, 1, 0, 0, 1, 1, 1, 2, 0, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 0,
       2, 0, 0, 2, 2, 0, 1, 1, 1, 2, 1, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2,
       1, 0, 2, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 1, 1, 2, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 2, 0, 2, 1, 0, 1, 1, 0, 2, 2, 0, 2, 0, 1, 1, 2, 1, 1,
       2, 2, 1, 2, 2, 1, 0, 0, 2, 1, 1, 1, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2,
       1, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 0, 2, 1, 2, 1, 0, 2, 1, 2, 1, 1,
       1, 2, 1, 2, 0, 1, 0, 1, 2, 0, 2, 1, 2, 0, 0, 2, 1, 2, 2, 1, 1, 2,
       1, 2, 0, 2, 1, 0, 1, 0, 0, 2, 1, 2, 0, 2, 1, 2, 1, 2, 0])

In [35]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,NRS110,2,2
1,NRS254,1,1
2,BCH-SA-09,0,1
3,NRS177,0,0
4,GA27,1,1
...,...,...,...
190,NRS001,1,1
191,NRS209,2,2
192,NRS272,0,1
193,NRS110,2,2


In [36]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [37]:
dat_proba2

Unnamed: 0,0,1,2
0,5.313570e-05,0.000230,9.997172e-01
1,2.550575e-08,1.000000,1.440481e-07
2,4.604916e-01,0.538894,6.145450e-04
3,9.999928e-01,0.000006,1.201935e-06
4,3.087689e-02,0.969123,6.280852e-08
...,...,...,...
190,1.834479e-02,0.981655,3.456540e-07
191,1.033348e-05,0.000109,9.998803e-01
192,3.802968e-01,0.481638,1.380656e-01
193,5.313570e-05,0.000230,9.997172e-01


In [38]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [39]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2p17sp.csv", index = False,
         header=None)

In [52]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [53]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 98.66%


In [30]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [31]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [32]:
y_prob2 = df_proba2[df_proba2['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[5.3135704e-05, 2.2963941e-04, 9.9971720e-01],
       [2.5505754e-08, 9.9999990e-01, 1.4404809e-07],
       [4.6049157e-01, 5.3889394e-01, 6.1454496e-04],
       [9.9999285e-01, 5.9044010e-06, 1.2019348e-06],
       [3.0876890e-02, 9.6912295e-01, 6.2808525e-08],
       [9.9999990e-01, 1.5234679e-07, 4.6820990e-08],
       [1.7536214e-02, 9.8245870e-01, 5.0006115e-06],
       [1.0336658e-02, 9.8965780e-01, 5.5370180e-06],
       [9.5360947e-01, 4.6373624e-02, 1.6885842e-05],
       [5.3135704e-05, 2.2963941e-04, 9.9971720e-01],
       [1.8344786e-02, 9.8165490e-01, 3.4565400e-07],
       [1.0000000e+00, 5.8926913e-10, 3.1481575e-09],
       [1.0333483e-05, 1.0935677e-04, 9.9988030e-01],
       [1.0333483e-05, 1.0935677e-04, 9.9988030e-01],
       [2.4009134e-10, 9.9999976e-01, 2.8092407e-07],
       [5.3135704e-05, 2.2963941e-04, 9.9971720e-01],
       [1.0000000e+00, 1.0804824e-08, 4.4811852e-11],
       [1.8344786e-02, 9.8165490e-01, 3.4565400e-07],
       [2.9894197e-04, 9.997

In [33]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9872189349112426

In [34]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9872189349112426

In [35]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [36]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [37]:
dat3

Unnamed: 0,0,test
0,NRS249,1
1,NRS172,1
2,NRS209,2
3,NRS108,1
4,NRS209,2
...,...,...
190,NRS209,2
191,NRS110,2
192,NRS255,1
193,NRS175,1


In [38]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [39]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [45]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36dab908>

In [61]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 93.33%


In [47]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 1, 2,
       1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 0,
       2, 2, 2, 2, 2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 2, 1, 0, 2, 0, 2, 1, 1, 2,
       1, 2, 1, 0, 2, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 0, 1,
       2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 0, 2, 1, 1, 0, 1, 0, 2, 2, 0, 1,
       2, 2, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 1, 2, 0, 2, 2, 1, 2, 1, 0,
       1, 2, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1])

In [48]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,NRS249,1,1
1,NRS172,1,1
2,NRS209,2,2
3,NRS108,1,1
4,NRS209,2,2
...,...,...,...
190,NRS209,2,2
191,NRS110,2,2
192,NRS255,1,1
193,NRS175,1,1


In [49]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [50]:
dat_proba3

Unnamed: 0,0,1,2
0,2.464723e-01,0.753502,0.000026
1,6.200623e-08,0.999998,0.000001
2,1.636309e-04,0.000344,0.999492
3,6.200623e-08,0.999998,0.000001
4,1.636309e-04,0.000344,0.999492
...,...,...,...
190,1.636309e-04,0.000344,0.999492
191,5.468477e-04,0.000059,0.999395
192,1.099464e-05,0.999268,0.000721
193,3.802538e-01,0.619742,0.000004


In [51]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [52]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3p17sp.csv", index = False,
         header=None)

In [65]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [66]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 96.94%


In [40]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [41]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [42]:
y_prob3 = df_proba3[df_proba3['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[2.46472310e-01, 7.53501530e-01, 2.61708200e-05],
       [6.20062300e-08, 9.99998330e-01, 1.49258550e-06],
       [1.63630890e-04, 3.44442260e-04, 9.99491930e-01],
       [6.20062300e-08, 9.99998330e-01, 1.49258550e-06],
       [1.63630890e-04, 3.44442260e-04, 9.99491930e-01],
       [3.68048100e-01, 6.31937600e-01, 1.43355055e-05],
       [9.97357550e-01, 2.64059030e-03, 1.72624960e-06],
       [2.46472310e-01, 7.53501530e-01, 2.61708200e-05],
       [1.17089964e-01, 8.82909700e-01, 3.66364220e-07],
       [6.20062300e-08, 9.99998330e-01, 1.49258550e-06],
       [4.91066100e-01, 5.08657460e-01, 2.76485100e-04],
       [9.99927640e-01, 7.21832550e-05, 7.01632900e-08],
       [6.20062300e-08, 9.99998330e-01, 1.49258550e-06],
       [1.09946440e-05, 9.99267760e-01, 7.21236170e-04],
       [9.39030770e-01, 6.09682720e-02, 8.76290100e-07],
       [5.46847700e-04, 5.86535600e-05, 9.99394540e-01],
       [5.20842600e-01, 4.78548350e-01, 6.09039600e-04],
       [5.46847700e-04, 5.86535

In [43]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.9926232741617357

In [44]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.9926232741617357

In [45]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [46]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [47]:
dat4

Unnamed: 0,0,test
0,NRS241,1
1,BCH-SA-01,1
2,NRS219,1
3,NRS209,2
4,NRS001,1
...,...,...
190,GA15,1
191,NRS246,1
192,115,1
193,312,1


In [48]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [57]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [58]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [59]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a370c5e48>

In [74]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 94.87%


In [60]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([1, 1, 0, 2, 1, 2, 1, 1, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 1,
       1, 2, 1, 0, 0, 0, 2, 1, 2, 1, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 1, 0,
       0, 0, 2, 2, 1, 0, 2, 0, 1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 0, 1, 2, 0,
       1, 0, 0, 2, 1, 1, 2, 1, 1, 0, 2, 0, 0, 2, 1, 0, 2, 2, 2, 1, 2, 2,
       2, 2, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 0, 0, 0, 1, 1, 2, 0, 2, 0, 0,
       2, 2, 0, 2, 2, 1, 2, 2, 2, 1, 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 1,
       1, 0, 1, 1, 2, 1, 1, 2, 2, 2, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 2, 1,
       0, 0, 0, 0, 1, 1, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0,
       1, 2, 1, 0, 1, 2, 0, 2, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])

In [61]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,NRS241,1,1
1,BCH-SA-01,1,1
2,NRS219,1,0
3,NRS209,2,2
4,NRS001,1,1
...,...,...,...
190,GA15,1,0
191,NRS246,1,1
192,115,1,1
193,312,1,1


In [62]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [63]:
dat_proba4

Unnamed: 0,0,1,2
0,0.005929,0.994005,6.562602e-05
1,0.025756,0.974242,1.108981e-06
2,0.867415,0.131991,5.944265e-04
3,0.000020,0.000809,9.991710e-01
4,0.489295,0.510701,3.876968e-06
...,...,...,...
190,0.522285,0.477700,1.542957e-05
191,0.055095,0.944002,9.027657e-04
192,0.013668,0.986331,1.037551e-06
193,0.253906,0.746083,1.064275e-05


In [64]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [65]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4p17sp.csv", index = False,
         header=None)

In [78]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [79]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 97.14%


In [49]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [50]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [51]:
y_prob4 = df_proba4[df_proba4['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[5.9291230e-03, 9.9400526e-01, 6.5626020e-05],
       [2.5756495e-02, 9.7424227e-01, 1.1089805e-06],
       [8.6741490e-01, 1.3199063e-01, 5.9442650e-04],
       [2.0058531e-05, 8.0890060e-04, 9.9917100e-01],
       [4.8929495e-01, 5.1070110e-01, 3.8769676e-06],
       [2.0058531e-05, 8.0890060e-04, 9.9917100e-01],
       [2.7117264e-03, 9.9728835e-01, 3.6922436e-09],
       [5.5095475e-02, 9.4400173e-01, 9.0276566e-04],
       [3.5693520e-04, 2.0795199e-04, 9.9943510e-01],
       [2.0058531e-05, 8.0890060e-04, 9.9917100e-01],
       [3.5693520e-04, 2.0795199e-04, 9.9943510e-01],
       [5.3943540e-01, 4.6055737e-01, 7.2680978e-06],
       [9.9627530e-01, 3.0673728e-03, 6.5737707e-04],
       [9.9350870e-01, 6.4895680e-03, 1.8308112e-06],
       [1.1810342e-01, 8.8188950e-01, 7.0270635e-06],
       [3.5693520e-04, 2.0795199e-04, 9.9943510e-01],
       [3.5693520e-04, 2.0795199e-04, 9.9943510e-01],
       [2.0058531e-05, 8.0890060e-04, 9.9917100e-01],
       [3.5693520e-04, 2.079

In [52]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.9902564102564103

In [53]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.9902564102564103

In [54]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.9869329388560157

In [55]:
np.std(ovos)

0.005700734671606347

In [56]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.9869329388560157

In [57]:
np.std(ovrs)

0.005700734671606347

In [80]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [81]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 93.59%


In [82]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.007692307233810425


In [83]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [84]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 97.59%


In [85]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 0.0066853072


In [58]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=567,
                                                    stratify=y_over)

In [59]:
dat5 = pd.DataFrame(X_test_over[:,0])
dat5['test'] = y_test_over

In [60]:
dat5

Unnamed: 0,0,test
0,312,1
1,BCH-SA-12,0
2,NRS209,2
3,CFBRSa29,0
4,NRS209,2
...,...,...
190,CA541,1
191,SR4152,1
192,NRS110,2
193,CFBRSa70,0


In [61]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [70]:
#### add regularizer and dropout
model1_over5 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [71]:
model1_over5.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [72]:
model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a37afe4e0>

In [109]:
acc_test_over5 = model1_over5.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over5*100))

over-sampling test accuracy: 89.74%


In [73]:
pred5 = model1_over5.predict_classes(X_test_over)
pred5

array([1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 1, 0, 2, 2, 2,
       1, 2, 2, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 0, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 2, 0, 2, 1, 0, 2, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0,
       2, 2, 1, 1, 0, 0, 1, 1, 2, 2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 0, 2, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 0, 0, 0,
       2, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 0, 1, 0, 2, 2, 1, 0, 0, 1, 1,
       2, 0, 1, 0, 2, 1, 1, 2, 2, 1, 2, 1, 0, 0, 1, 2, 2, 2, 1, 1, 2, 2,
       0, 0, 1, 0, 2, 1, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 1, 0, 1, 1, 1,
       2, 1, 0, 2, 1, 0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1])

In [74]:
dat5['pred'] = pred5
dat5

Unnamed: 0,0,test,pred
0,312,1,1
1,BCH-SA-12,0,0
2,NRS209,2,2
3,CFBRSa29,0,0
4,NRS209,2,2
...,...,...,...
190,CA541,1,0
191,SR4152,1,1
192,NRS110,2,2
193,CFBRSa70,0,0


In [75]:
proba5 = model1_over5.predict_proba(X_test_over)
dat_proba5 = pd.DataFrame(proba5)

In [76]:
dat_proba5

Unnamed: 0,0,1,2
0,1.323380e-01,8.676553e-01,6.663150e-06
1,9.999998e-01,2.390859e-07,8.383459e-11
2,7.866942e-08,7.944796e-08,9.999999e-01
3,9.999999e-01,1.035796e-07,4.887048e-10
4,7.866942e-08,7.944796e-08,9.999999e-01
...,...,...,...
190,9.535643e-01,4.643551e-02,2.351167e-07
191,2.063678e-02,9.793620e-01,1.116670e-06
192,6.606913e-08,6.610152e-08,9.999999e-01
193,8.747304e-01,1.252672e-01,2.427289e-06


In [77]:
dat_proba5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba5.csv", index = False,
         header=None)

In [78]:
dat5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/5p17sp.csv", index = False,
         header=None)

In [113]:
hist1_over5 = model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [114]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over5.history['accuracy'])*100))

over-sampling train accuracy: 76.83%


In [62]:
df_proba5 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [63]:
df_proba5

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS241,1,1,1.342914e-03,9.986569e-01,2.348628e-07
1,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
2,p002ykpresabsSTCC_qual,NRS255,1,1,1.780311e-07,9.999999e-01,2.544841e-12
3,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,2.203547e-10,5.688883e-15
4,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,BCH-SA-12,0,0,1.000000e+00,1.152503e-09,1.898730e-09
1978,pyopresabsSTCC_qual,NRS049,0,1,8.401357e-11,1.000000e+00,3.209735e-13
1979,pyopresabsSTCC_qual,NRS022,0,0,1.000000e+00,4.755084e-10,1.974275e-10
1980,pyopresabsSTCC_qual,NRS236,1,1,1.357345e-08,1.000000e+00,1.293117e-10


In [64]:
y_prob5 = df_proba5[df_proba5['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob5 = y_prob5.to_numpy()
y_prob5

array([[1.32338030e-01, 8.67655340e-01, 6.66315000e-06],
       [9.99999760e-01, 2.39085860e-07, 8.38345900e-11],
       [7.86694160e-08, 7.94479600e-08, 9.99999900e-01],
       [9.99999900e-01, 1.03579550e-07, 4.88704800e-10],
       [7.86694160e-08, 7.94479600e-08, 9.99999900e-01],
       [9.99995470e-01, 4.49730400e-06, 4.60433580e-10],
       [1.00000000e+00, 1.22001910e-10, 1.76005890e-12],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [9.99936700e-01, 6.32626700e-05, 5.07325560e-08],
       [7.86694160e-08, 7.94479600e-08, 9.99999900e-01],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [4.79844030e-03, 9.95201470e-01, 1.42994510e-07],
       [6.60691300e-08, 6.61015200e-08, 9.99999900e-01],
       [7.86694160e-08, 7.94479600e-08, 9.99999900e-01],
       [6.89850500e-02, 9.31014

In [65]:
ovo5 = rocauc_ovo(y_test_over, y_prob5, average="macro", multi_class="ovo")
ovo5

0.9771203155818541

In [66]:
ovr5 = rocauc_ovr(y_test_over, y_prob5, average="macro", multi_class="ovr")
ovr5

0.9771203155818541

In [67]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=678,
                                                    stratify=y_over)

In [68]:
dat6 = pd.DataFrame(X_test_over[:,0])
dat6['test'] = y_test_over

In [69]:
dat6

Unnamed: 0,0,test
0,CFBREBSa119,0
1,NRS001,1
2,NRS074,0
3,NRS209,2
4,GA231,0
...,...,...
190,NRS252,0
191,SR2852,1
192,NRS108,1
193,NRS202,0


In [70]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [83]:
model1_over6 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [84]:
model1_over6.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [85]:
model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3827b080>

In [130]:
acc_test_over6 = model1_over6.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over6*100))

over-sampling test accuracy: 92.82%


In [86]:
pred6 = model1_over6.predict_classes(X_test_over)
pred6

array([0, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0,
       0, 1, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 1, 0, 1, 1,
       2, 2, 2, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1,
       1, 2, 2, 1, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1, 0,
       1, 2, 1, 1, 2, 2, 1, 1, 0, 2, 0, 2, 1, 1, 2, 2, 1, 1, 1, 1, 0, 2,
       2, 1, 0, 1, 2, 0, 0, 0, 2, 1, 2, 1, 1, 2, 2, 0, 0, 2, 2, 0, 1, 0,
       1, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 2, 1, 2, 0, 0, 2, 1, 2, 2, 2,
       2, 2, 1, 0, 2, 1, 0, 0, 0, 1, 2, 1, 0, 1, 1, 0, 0, 2, 0, 0, 1, 1,
       0, 2, 2, 2, 1, 0, 2, 1, 1, 2, 2, 1, 2, 1, 0, 1, 1, 0, 2])

In [87]:
dat6['pred'] = pred6
dat6

Unnamed: 0,0,test,pred
0,CFBREBSa119,0,0
1,NRS001,1,1
2,NRS074,0,0
3,NRS209,2,2
4,GA231,0,0
...,...,...,...
190,NRS252,0,0
191,SR2852,1,1
192,NRS108,1,1
193,NRS202,0,0


In [88]:
proba6 = model1_over6.predict_proba(X_test_over)
dat_proba6 = pd.DataFrame(proba6)

In [89]:
dat_proba6

Unnamed: 0,0,1,2
0,9.986406e-01,1.358906e-03,5.093945e-07
1,2.734758e-01,7.265226e-01,1.638970e-06
2,1.000000e+00,9.627309e-13,4.189273e-12
3,7.107406e-08,6.841635e-08,9.999999e-01
4,8.102067e-01,1.897932e-01,8.685920e-08
...,...,...,...
190,7.459918e-01,2.540054e-01,2.877789e-06
191,1.280722e-05,9.999872e-01,2.305947e-12
192,1.812534e-09,1.000000e+00,5.739356e-11
193,7.166129e-01,2.833829e-01,4.214001e-06


In [90]:
dat_proba6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba6.csv", index = False,
         header=None)

In [91]:
dat6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/6p17sp.csv", index = False,
         header=None)

In [134]:
hist1_over6 = model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [135]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over6.history['accuracy'])*100))

over-sampling train accuracy: 76.81%


In [71]:
df_proba6 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [72]:
df_proba6

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,1.790400e-08,4.141849e-08,1.000000e+00
1,p002ykpresabsSTCC_qual,NRS386,1,1,5.739934e-04,9.994259e-01,6.773014e-08
2,p002ykpresabsSTCC_qual,NRS148,2,2,5.286934e-09,1.269109e-08,1.000000e+00
3,p002ykpresabsSTCC_qual,NRS178,0,1,6.494936e-12,1.000000e+00,2.537080e-25
4,p002ykpresabsSTCC_qual,NRS237,0,1,5.701098e-02,9.399204e-01,3.068583e-03
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS272,0,0,9.999607e-01,3.367024e-05,5.776848e-06
1978,pyopresabsSTCC_qual,NRS112,1,1,8.275442e-08,9.999999e-01,3.739556e-09
1979,pyopresabsSTCC_qual,NRS064,1,1,2.168245e-08,1.000000e+00,9.603962e-09
1980,pyopresabsSTCC_qual,BCH-SA-04,0,0,1.000000e+00,1.026408e-15,1.630406e-14


In [73]:
y_prob6 = df_proba6[df_proba6['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob6 = y_prob6.to_numpy()
y_prob6

array([[9.98640600e-01, 1.35890630e-03, 5.09394450e-07],
       [2.73475800e-01, 7.26522600e-01, 1.63896990e-06],
       [1.00000000e+00, 9.62730900e-13, 4.18927260e-12],
       [7.10740600e-08, 6.84163500e-08, 9.99999900e-01],
       [8.10206700e-01, 1.89793210e-01, 8.68592000e-08],
       [6.39432700e-08, 6.58832600e-08, 9.99999900e-01],
       [1.00000000e+00, 4.60883070e-13, 2.49458220e-12],
       [7.35437730e-04, 9.99262870e-01, 1.62438610e-06],
       [4.33790120e-03, 9.95660960e-01, 1.03892860e-06],
       [9.24011800e-03, 9.90759600e-01, 2.44076600e-07],
       [7.10740600e-08, 6.84163500e-08, 9.99999900e-01],
       [1.85527950e-01, 8.14471900e-01, 8.39264200e-08],
       [6.39432700e-08, 6.58832600e-08, 9.99999900e-01],
       [9.96355530e-01, 3.64443850e-03, 1.05079240e-08],
       [9.99258600e-01, 7.41366650e-04, 1.15760885e-08],
       [9.56432340e-01, 4.35660220e-02, 1.54004270e-06],
       [6.39432700e-08, 6.58832600e-08, 9.99999900e-01],
       [7.10740600e-08, 6.84163

In [74]:
ovo6 = rocauc_ovo(y_test_over, y_prob6, average="macro", multi_class="ovo")
ovo6

0.9819329388560157

In [75]:
ovr6 = rocauc_ovr(y_test_over, y_prob6, average="macro", multi_class="ovr")
ovr6

0.9819329388560157

In [76]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=789,
                                                    stratify=y_over)

In [77]:
dat7 = pd.DataFrame(X_test_over[:,0])
dat7['test'] = y_test_over

In [78]:
dat7

Unnamed: 0,0,test
0,NRS218,1
1,NRS260,1
2,NRS162,0
3,NRS177,0
4,NRS209,2
...,...,...
190,NRS383,1
191,NRS218,1
192,NRS209,2
193,SR2852,1


In [79]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [80]:
model1_over7 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [97]:
model1_over7.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3894c7b8>

In [151]:
acc_test_over7 = model1_over7.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over7*100))

over-sampling test accuracy: 90.77%


In [99]:
pred7 = model1_over7.predict_classes(X_test_over)
pred7

array([1, 1, 0, 0, 2, 0, 1, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 1, 2, 2,
       2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 2, 1, 0, 1, 2, 0, 2, 0, 2, 0,
       0, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 1,
       2, 1, 0, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1,
       0, 2, 0, 2, 2, 1, 2, 1, 0, 1, 0, 2, 2, 2, 0, 0, 1, 0, 2, 1, 0, 1,
       0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 0, 0, 0, 2, 1, 1,
       2, 1, 1, 1, 1, 0, 1, 0, 1, 2, 2, 1, 0, 2, 1, 0, 1, 2, 2, 2, 0, 1,
       2, 1, 2, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0, 2, 1, 1, 1, 0, 0, 1, 2, 1,
       1, 0, 0, 1, 2, 1, 1, 0, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 0])

In [100]:
dat7['pred'] = pred7
dat7

Unnamed: 0,0,test,pred
0,NRS218,1,1
1,NRS260,1,1
2,NRS162,0,0
3,NRS177,0,0
4,NRS209,2,2
...,...,...,...
190,NRS383,1,1
191,NRS218,1,1
192,NRS209,2,2
193,SR2852,1,1


In [101]:
proba7 = model1_over7.predict_proba(X_test_over)
dat_proba7 = pd.DataFrame(proba7)

In [102]:
dat_proba7

Unnamed: 0,0,1,2
0,1.071406e-04,9.998922e-01,5.812307e-07
1,8.026793e-07,9.999992e-01,1.381583e-08
2,9.999787e-01,2.136149e-05,1.399937e-08
3,1.000000e+00,6.688418e-11,1.067184e-11
4,7.832252e-08,8.137763e-08,9.999999e-01
...,...,...,...
190,2.220224e-01,7.779776e-01,1.380784e-07
191,1.071406e-04,9.998922e-01,5.812307e-07
192,7.832252e-08,8.137763e-08,9.999999e-01
193,8.772524e-07,9.999992e-01,3.216317e-11


In [103]:
dat_proba7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba7.csv", index = False,
         header=None)

In [104]:
dat7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/7p17sp.csv", index = False,
         header=None)

In [155]:
hist1_over7 = model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [156]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over7.history['accuracy'])*100))

over-sampling train accuracy: 76.73%


In [81]:
df_proba7 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [82]:
df_proba7

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
1,p002ykpresabsSTCC_qual,BCH-SA-09,1,1,1.137139e-06,9.999988e-01,2.067601e-09
2,p002ykpresabsSTCC_qual,NRS224,0,0,1.000000e+00,2.093110e-31,0.000000e+00
3,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
4,p002ykpresabsSTCC_qual,NRS235,1,1,2.243513e-02,9.774035e-01,1.615106e-04
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS035,0,0,9.354528e-01,6.414209e-02,4.051121e-04
1978,pyopresabsSTCC_qual,NRS260,1,1,4.808470e-08,1.000000e+00,7.364639e-09
1979,pyopresabsSTCC_qual,CA9,0,0,1.000000e+00,2.361323e-08,2.871247e-08
1980,pyopresabsSTCC_qual,NRS183,1,1,2.755864e-07,9.999998e-01,5.310879e-08


In [83]:
y_prob7 = df_proba7[df_proba7['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob7 = y_prob7.to_numpy()
y_prob7

array([[1.07140570e-04, 9.99892230e-01, 5.81230700e-07],
       [8.02679270e-07, 9.99999170e-01, 1.38158260e-08],
       [9.99978660e-01, 2.13614860e-05, 1.39993680e-08],
       [1.00000000e+00, 6.68841800e-11, 1.06718420e-11],
       [7.83225200e-08, 8.13776300e-08, 9.99999900e-01],
       [1.00000000e+00, 8.94014900e-14, 6.63932450e-13],
       [4.01321500e-03, 9.95985700e-01, 1.06534050e-06],
       [4.49186440e-01, 5.50808550e-01, 4.94297230e-06],
       [7.83225200e-08, 8.13776300e-08, 9.99999900e-01],
       [7.83225200e-08, 8.13776300e-08, 9.99999900e-01],
       [1.43922100e-02, 9.85607500e-01, 2.87602700e-07],
       [7.32167200e-08, 7.22469050e-08, 9.99999900e-01],
       [9.86984700e-01, 1.30151280e-02, 1.08627180e-07],
       [1.23519010e-04, 9.99876500e-01, 2.07203070e-08],
       [7.52568360e-01, 2.47431140e-01, 4.24021830e-07],
       [1.10291150e-08, 1.00000000e+00, 3.79636540e-10],
       [9.99734100e-01, 2.65911160e-04, 1.36096250e-08],
       [1.00000000e+00, 3.25824

In [84]:
ovo7 = rocauc_ovo(y_test_over, y_prob7, average="macro", multi_class="ovo")
ovo7

0.9768047337278106

In [85]:
ovr7 = rocauc_ovr(y_test_over, y_prob7, average="macro", multi_class="ovr")
ovr7

0.9768047337278106

In [86]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=890,
                                                    stratify=y_over)

In [87]:
dat8 = pd.DataFrame(X_test_over[:,0])
dat8['test'] = y_test_over

In [88]:
dat8

Unnamed: 0,0,test
0,NRS209,2
1,NRS255,1
2,NRS119,0
3,NRS071,0
4,NRS002,0
...,...,...
190,CFBRSa30,0
191,NRS383,1
192,NRS110,2
193,NRS209,2


In [89]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [90]:
model1_over8 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [110]:
model1_over8.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a392e6ef0>

In [164]:
acc_test_over8 = model1_over8.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over8*100))

over-sampling test accuracy: 91.79%


In [112]:
pred8 = model1_over8.predict_classes(X_test_over)
pred8

array([2, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 0, 2, 2, 0, 0, 1, 2,
       1, 0, 1, 0, 2, 1, 2, 1, 1, 0, 1, 0, 0, 1, 2, 1, 0, 2, 1, 1, 2, 0,
       2, 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 2, 1,
       0, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 0, 2, 0, 1, 2, 2, 2, 1, 0, 1, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 0, 1, 1, 0, 0, 2, 2, 2, 1, 0,
       0, 2, 1, 0, 0, 2, 0, 0, 2, 1, 0, 1, 2, 0, 1, 2, 1, 0, 0, 1, 0, 2,
       2, 2, 2, 1, 1, 0, 0, 1, 0, 0, 2, 2, 0, 1, 2, 0, 1, 2, 2, 0, 2, 1,
       0, 0, 0, 2, 1, 1, 0, 2, 2, 1, 1, 2, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2,
       1, 2, 0, 1, 0, 2, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2, 0])

In [113]:
dat8['pred'] = pred8
dat8

Unnamed: 0,0,test,pred
0,NRS209,2,2
1,NRS255,1,1
2,NRS119,0,1
3,NRS071,0,0
4,NRS002,0,1
...,...,...,...
190,CFBRSa30,0,0
191,NRS383,1,0
192,NRS110,2,2
193,NRS209,2,2


In [114]:
proba8 = model1_over8.predict_proba(X_test_over)
dat_proba8 = pd.DataFrame(proba8)

In [115]:
dat_proba8

Unnamed: 0,0,1,2
0,7.099914e-08,6.899951e-08,9.999999e-01
1,2.746130e-08,1.000000e+00,3.077773e-08
2,3.049337e-01,6.949282e-01,1.379770e-04
3,9.999989e-01,1.046389e-06,4.682682e-09
4,1.473558e-02,9.852208e-01,4.373759e-05
...,...,...,...
190,9.999864e-01,1.340197e-05,2.034666e-07
191,9.542260e-01,4.577392e-02,1.702428e-07
192,7.822548e-08,7.226344e-08,9.999999e-01
193,7.099914e-08,6.899951e-08,9.999999e-01


In [116]:
dat_proba8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba8.csv", index = False,
         header=None)

In [117]:
dat8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/8p17sp.csv", index = False,
         header=None)

In [168]:
hist1_over8 = model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 453 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [169]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over8.history['accuracy'])*100))

over-sampling train accuracy: 76.68%


In [91]:
df_proba8 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [92]:
df_proba8

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,CFBREBSa116,0,0,9.676203e-01,3.237956e-02,1.480166e-07
1,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,6.534852e-11,2.250731e-18
2,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
3,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
4,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS205,2,2,3.691095e-08,3.571927e-08,9.999999e-01
1978,pyopresabsSTCC_qual,CFBREBSa122,0,1,9.261665e-02,9.073822e-01,1.162373e-06
1979,pyopresabsSTCC_qual,NRS001,1,1,4.174278e-07,9.999995e-01,3.254024e-09
1980,pyopresabsSTCC_qual,NRS148,2,2,3.234670e-08,3.121212e-08,9.999999e-01


In [93]:
y_prob8 = df_proba8[df_proba8['phage']=='p0017Spresabs_qual'].iloc[:,-3:]
y_prob8 = y_prob8.to_numpy()
y_prob8

array([[7.09991350e-08, 6.89995100e-08, 9.99999900e-01],
       [2.74612990e-08, 1.00000000e+00, 3.07777300e-08],
       [3.04933730e-01, 6.94928200e-01, 1.37976980e-04],
       [9.99998900e-01, 1.04638880e-06, 4.68268200e-09],
       [1.47355840e-02, 9.85220800e-01, 4.37375860e-05],
       [9.99997740e-01, 2.23636970e-06, 9.45778660e-09],
       [9.99999900e-01, 7.66665300e-08, 3.02197600e-08],
       [1.00000000e+00, 2.49247300e-10, 5.00452320e-11],
       [8.27895850e-03, 9.91591930e-01, 1.29060090e-04],
       [1.00000000e+00, 7.63169200e-13, 2.38115520e-12],
       [4.55311780e-01, 5.44677850e-01, 1.03394750e-05],
       [1.00000000e+00, 6.85312140e-10, 8.84964250e-13],
       [7.09991350e-08, 6.89995100e-08, 9.99999900e-01],
       [6.05514700e-12, 1.00000000e+00, 3.70156420e-12],
       [9.99998570e-01, 1.41368840e-06, 2.19187100e-09],
       [6.72134340e-01, 3.27859100e-01, 6.60773800e-06],
       [7.82254760e-08, 7.22634400e-08, 9.99999900e-01],
       [7.82254760e-08, 7.22634

In [94]:
ovo8 = rocauc_ovo(y_test_over, y_prob8, average="macro", multi_class="ovo")
ovo8

0.9705325443786982

In [95]:
ovr8 = rocauc_ovr(y_test_over, y_prob8, average="macro", multi_class="ovr")
ovr8

0.9705325443786982

In [96]:
ovos2 = [ovo5, ovo6, ovo7, ovo8]
np.mean(ovos2)

0.9765976331360946

In [97]:
np.std(ovos2)

0.004048668804505805

In [98]:
ovrs2 = [ovr5, ovr6, ovr7, ovr8]
np.mean(ovrs2)

0.9765976331360946

In [99]:
np.std(ovrs2)

0.004048668804505805

In [170]:
accs_reg = [acc_test_over5, acc_test_over6, acc_test_over7, acc_test_over8]

In [171]:
mean_reg = np.mean(accs_reg)
print('over-sampling test accuracy regularization mean: %.2f%%' % (mean_reg*100))

over-sampling test accuracy regularization mean: 91.28%


In [172]:
std_reg = np.std(accs_reg)
print('over-sampling test accuracy regularization standard deviation:', std_reg)

over-sampling test accuracy regularization standard deviation: 0.011467014585742319


In [173]:
accs_train_reg = [np.mean(hist1_over5.history['accuracy']), np.mean(hist1_over6.history['accuracy']), np.mean(hist1_over7.history['accuracy']),
             np.mean(hist1_over8.history['accuracy'])]

In [174]:
mean_train_reg = np.mean(accs_train_reg)
print('over-sampling train accuracy regularization mean: %.2f%%' % (mean_train_reg*100))

over-sampling train accuracy regularization mean: 76.76%


In [175]:
std_train_reg = np.std(accs_train_reg)
print('over-sampling train accuracy regularization standard deviation:', std_train_reg)

over-sampling train accuracy regularization standard deviation: 0.0005989477
