In [None]:
## This file implements neural networks with/without dropout and regularizer for p002ypresabsSTCC_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for each model.

In [91]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [92]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/p002ypresabsSTCC_qual.csv')
df.shape

(253, 2035)

In [93]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [94]:
df['pheno']

0      0
1      0
2      1
3      0
4      0
      ..
248    0
249    0
250    0
251    0
252    0
Name: pheno, Length: 253, dtype: int64

In [95]:
df.head()

Unnamed: 0,id,TTTTTTTGACTAAAATTAATGAAAAGTGAAAATAGTATTGGAACTCAATATCTTTAATGATTTAATGAATAATTTTTATTGAAAGCGATAATTCGTATTA,TTTTTTTATGAAT,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATG,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGA,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAA,TTTTTTGACTAAAATTAATGAAAAGTGAAAATAGTATTGGAACTCAATATCTTTAATGATTTAATGAATAATTTTTATTGAAAGCGATAATTCGTATTAA,TTTTTTCTTTTCATAACTGTGTTGGAAATGAATTAAATTAACAGCTCTTTGTGCTTTACGGTGTGTTGC,TTTTTTCATTAGT,TTTTTTCATTAGTAA,...,group_8644,group_8645,group_8646,group_8815,group_8892,group_9007,group_9104,ST,CC,pheno
0,107,0,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,5,5,0
1,109,0,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,8,8,0
2,115,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,5,5,1
3,120335,0,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,5,5,0
4,120337,0,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,5,5,0


In [96]:
df['pheno'].value_counts()

0    220
1     30
2      3
Name: pheno, dtype: int64

In [97]:
df_clean = df.drop(columns=['id'])

In [98]:
df_clean.shape

(253, 2034)

In [99]:
df_clean.head()

Unnamed: 0,TTTTTTTGACTAAAATTAATGAAAAGTGAAAATAGTATTGGAACTCAATATCTTTAATGATTTAATGAATAATTTTTATTGAAAGCGATAATTCGTATTA,TTTTTTTATGAAT,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATG,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGA,TTTTTTTAGTTTATCCAATGATTGATGTTATAATAATACTAAATTTGTATCTATAAAAAAGTAATGAGCATTTGTGCGCATATGATGATGTAAAGCGTAA,TTTTTTGACTAAAATTAATGAAAAGTGAAAATAGTATTGGAACTCAATATCTTTAATGATTTAATGAATAATTTTTATTGAAAGCGATAATTCGTATTAA,TTTTTTCTTTTCATAACTGTGTTGGAAATGAATTAAATTAACAGCTCTTTGTGCTTTACGGTGTGTTGC,TTTTTTCATTAGT,TTTTTTCATTAGTAA,TTTTTTCAGCATTGTCTACATTACTTAACATTCGTGTTTGTAAGTAATATTGACCGCCAATATTTAGACACTTTATAAGTATGCCATTCATCATTTTTAA,...,group_8644,group_8645,group_8646,group_8815,group_8892,group_9007,group_9104,ST,CC,pheno
0,0,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,5,5,0
1,0,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,8,8,0
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,5,5,1
3,0,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,5,5,0
4,0,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,5,5,0


In [100]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 2034) (253,)


In [101]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

[(0, 220), (1, 220), (2, 220)]




In [102]:
############# Fully-Connected Neural Network ################

In [103]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [104]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [105]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [106]:
dat

Unnamed: 0,0,test
0,CFBRSa26,0
1,NRS109,2
2,NRS112,0
3,NRS216,1
4,NRS021,0
...,...,...
193,CFBREBSa133,0
194,NRS209,2
195,NRS109,2
196,NRS209,2


In [107]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100



Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3707f0b8>

In [21]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 97.47%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([0, 2, 0, 1, 0, 0, 2, 2, 2, 0, 1, 0, 1, 1, 0, 1, 2, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 2, 2, 1, 0, 0, 2, 0, 2, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 1, 0, 2, 0, 0, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0,
       0, 2, 0, 0, 2, 0, 1, 2, 1, 1, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 0, 0,
       2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 0, 2, 1, 0, 2, 0, 0, 2, 2, 0, 2, 0,
       0, 2, 0, 2, 1, 0, 2, 1, 1, 2, 0, 1, 2, 2, 1, 0, 2, 2, 2, 1, 2, 2,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 0, 2, 2, 2, 1])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,CFBRSa26,0,0
1,NRS109,2,2
2,NRS112,0,0
3,NRS216,1,1
4,NRS021,0,0
...,...,...,...
193,CFBREBSa133,0,0
194,NRS209,2,2
195,NRS109,2,2
196,NRS209,2,2


In [23]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [24]:
dat_proba1

Unnamed: 0,0,1,2
0,9.999528e-01,4.725889e-05,3.082832e-09
1,1.021427e-03,7.837663e-04,9.981949e-01
2,9.970143e-01,2.985639e-03,1.549940e-07
3,2.931194e-02,9.706042e-01,8.391447e-05
4,9.988996e-01,1.100384e-03,1.590173e-10
...,...,...,...
193,1.000000e+00,7.373644e-10,7.920553e-09
194,2.044007e-07,4.704020e-04,9.995294e-01
195,1.021427e-03,7.837656e-04,9.981949e-01
196,2.044007e-07,4.704020e-04,9.995294e-01


In [25]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [26]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1p002ypST.csv", index = False,
         header=None)

In [25]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [26]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 99.99%


In [108]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [109]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [110]:
y_prob = df_proba[df_proba['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[9.99952800e-01, 4.72588870e-05, 3.08283150e-09],
       [1.02142730e-03, 7.83766300e-04, 9.98194900e-01],
       [9.97014300e-01, 2.98563930e-03, 1.54993970e-07],
       [2.93119440e-02, 9.70604200e-01, 8.39144700e-05],
       [9.98899600e-01, 1.10038390e-03, 1.59017260e-10],
       [9.99733500e-01, 2.66428570e-04, 8.31315800e-09],
       [1.63454510e-06, 2.06930840e-03, 9.97929100e-01],
       [1.63454510e-06, 2.06930840e-03, 9.97929100e-01],
       [1.02142730e-03, 7.83766300e-04, 9.98194900e-01],
       [1.00000000e+00, 1.90122370e-08, 1.44964470e-12],
       [1.53664180e-02, 9.84576900e-01, 5.66871530e-05],
       [9.99995230e-01, 4.72467600e-06, 3.18319260e-13],
       [8.15281900e-02, 9.18467500e-01, 4.31467700e-06],
       [4.21984160e-02, 9.42683340e-01, 1.51182730e-02],
       [9.98228250e-01, 1.77174710e-03, 3.35112600e-10],
       [1.72360500e-02, 9.82735500e-01, 2.83248120e-05],
       [1.63454510e-06, 2.06930840e-03, 9.97929100e-01],
       [1.22267720e-03, 9.98767

In [111]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [112]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9996939087848178

In [113]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [114]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9996939087848178

In [115]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [116]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [117]:
dat2

Unnamed: 0,0,test
0,NRS109,2
1,NRS109,2
2,NRS222,0
3,NRS109,2
4,GA50245,0
...,...,...
193,NRS148,2
194,NRS266,1
195,NRS109,2
196,NRS149,0


In [118]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [31]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [32]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x631cfc0f0>

In [37]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 97.47%


In [34]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([2, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 0, 1,
       2, 1, 1, 2, 1, 2, 1, 0, 2, 0, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 2, 1,
       2, 0, 0, 2, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 2, 0, 0, 1, 1, 2, 0, 2,
       1, 1, 0, 0, 1, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 2, 0, 1, 0,
       2, 0, 2, 1, 1, 1, 2, 2, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 2, 1,
       0, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 1, 2, 1, 2, 2, 2, 1, 0, 1, 1,
       1, 1, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 1, 2, 2, 2, 0, 2, 1, 0,
       0, 2, 2, 2, 1, 1, 0, 2, 1, 2, 0, 0, 2, 0, 2, 2, 1, 1, 1, 2, 1, 2,
       0, 1, 1, 1, 2, 1, 2, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2])

In [35]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,NRS109,2,2
1,NRS109,2,2
2,NRS222,0,0
3,NRS109,2,2
4,GA50245,0,0
...,...,...,...
193,NRS148,2,2
194,NRS266,1,1
195,NRS109,2,2
196,NRS149,0,1


In [36]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [37]:
dat_proba2

Unnamed: 0,0,1,2
0,6.302851e-04,0.005169,9.942011e-01
1,6.302851e-04,0.005169,9.942011e-01
2,9.947300e-01,0.005270,3.008131e-07
3,6.302851e-04,0.005169,9.942011e-01
4,9.987645e-01,0.001235,6.272662e-07
...,...,...,...
193,1.341323e-09,0.001519,9.984811e-01
194,4.687704e-04,0.999517,1.455117e-05
195,6.302848e-04,0.005169,9.942011e-01
196,4.454741e-01,0.554522,3.477236e-06


In [38]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [39]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2p002ypST.csv", index = False,
         header=None)

In [41]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [42]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 99.98%


In [119]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [120]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [121]:
y_prob2 = df_proba2[df_proba2['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[6.30285070e-04, 5.16860600e-03, 9.94201100e-01],
       [6.30285070e-04, 5.16860600e-03, 9.94201100e-01],
       [9.94730000e-01, 5.26959640e-03, 3.00813100e-07],
       [6.30285070e-04, 5.16860600e-03, 9.94201100e-01],
       [9.98764500e-01, 1.23492540e-03, 6.27266160e-07],
       [6.30285070e-04, 5.16860600e-03, 9.94201100e-01],
       [9.97446060e-01, 2.55394980e-03, 4.57869300e-09],
       [3.06966760e-03, 9.96923600e-01, 6.64078650e-06],
       [9.99999760e-01, 2.14201380e-07, 2.15502250e-11],
       [6.30285070e-04, 5.16860600e-03, 9.94201100e-01],
       [9.99815170e-01, 1.84799180e-04, 1.10494206e-07],
       [5.33969300e-04, 9.99464450e-01, 1.60185440e-06],
       [9.94723700e-01, 5.27623060e-03, 1.59178510e-07],
       [2.66609780e-06, 9.89390300e-01, 1.06069840e-02],
       [1.27158970e-03, 9.98695900e-01, 3.25350300e-05],
       [1.34133350e-09, 1.51889340e-03, 9.98481100e-01],
       [4.48031240e-05, 9.99951840e-01, 3.29720610e-06],
       [4.23477170e-10, 4.77588

In [122]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9949877563513927

In [123]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9949877563513927

In [124]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [125]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [126]:
dat3

Unnamed: 0,0,test
0,CFBREBSa110,0
1,CFBREBSa131,1
2,NRS148,2
3,NRS169,1
4,NRS073,0
...,...,...
193,NRS001,1
194,NRS191,0
195,NRS207,0
196,CA26,0


In [127]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [44]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [45]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x632277b00>

In [50]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 97.47%


In [47]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([0, 1, 2, 1, 0, 1, 1, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2,
       1, 0, 1, 2, 2, 2, 0, 2, 2, 1, 1, 0, 1, 2, 1, 0, 0, 2, 2, 1, 0, 0,
       1, 1, 0, 1, 2, 1, 2, 1, 0, 0, 1, 2, 2, 1, 1, 2, 1, 0, 2, 2, 2, 1,
       0, 1, 1, 2, 1, 1, 0, 1, 1, 2, 2, 1, 1, 2, 2, 0, 2, 0, 0, 1, 0, 2,
       2, 1, 2, 1, 2, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 0, 2, 0, 2, 0, 1,
       2, 2, 0, 1, 2, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 2, 0,
       1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 1, 1, 2,
       1, 0, 0, 2, 2, 0, 1, 2, 2, 2, 2, 1, 1, 0, 2, 0, 1, 2, 0, 1, 0, 1,
       1, 1, 2, 0, 2, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0, 2, 1, 1, 0, 0, 0, 0])

In [48]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,CFBREBSa110,0,0
1,CFBREBSa131,1,1
2,NRS148,2,2
3,NRS169,1,1
4,NRS073,0,0
...,...,...,...
193,NRS001,1,1
194,NRS191,0,0
195,NRS207,0,0
196,CA26,0,0


In [49]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [50]:
dat_proba3

Unnamed: 0,0,1,2
0,0.999314,0.000686,3.761691e-09
1,0.036356,0.963644,1.427160e-09
2,0.000001,0.002898,9.971011e-01
3,0.015282,0.979220,5.497806e-03
4,0.999118,0.000882,1.210949e-12
...,...,...,...
193,0.009724,0.990276,5.089228e-08
194,0.845824,0.154172,4.374536e-06
195,0.999702,0.000297,7.595697e-08
196,0.999991,0.000009,1.380664e-09


In [51]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [52]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3p002ypST.csv", index = False,
         header=None)

In [54]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [55]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [128]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [129]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [130]:
y_prob3 = df_proba3[df_proba3['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[9.99313830e-01, 6.86202500e-04, 3.76169140e-09],
       [3.63563600e-02, 9.63643600e-01, 1.42716030e-09],
       [1.31860800e-06, 2.89752900e-03, 9.97101100e-01],
       [1.52819770e-02, 9.79220150e-01, 5.49780620e-03],
       [9.99117700e-01, 8.82361260e-04, 1.21094910e-12],
       [1.54402840e-05, 9.99984500e-01, 8.83860100e-11],
       [7.55906150e-05, 9.77803100e-01, 2.21213330e-02],
       [1.31860800e-06, 2.89752900e-03, 9.97101100e-01],
       [5.66168600e-04, 9.99433700e-01, 7.55483640e-08],
       [3.30562800e-01, 6.69437200e-01, 1.04187174e-08],
       [9.99978540e-01, 2.14273660e-05, 2.31031800e-14],
       [1.49616090e-03, 8.12478650e-04, 9.97691400e-01],
       [2.47907300e-02, 9.75209060e-01, 2.69607230e-07],
       [9.72403900e-03, 9.90276000e-01, 5.08922820e-08],
       [9.99975900e-01, 2.05343300e-05, 3.57822070e-06],
       [1.49616090e-03, 8.12478650e-04, 9.97691400e-01],
       [9.99882700e-01, 6.67148600e-05, 5.05345740e-05],
       [1.49616090e-03, 8.12478

In [131]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.9920033670033671

In [132]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.9920033670033671

In [133]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [134]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [135]:
dat4

Unnamed: 0,0,test
0,NRS110,1
1,NRS216,1
2,NRS386,1
3,CFBRSa25,0
4,BCH-SA-03,1
...,...,...
193,NRS216,1
194,CFBREBSa110,0
195,NRS148,2
196,GA27,0


In [136]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [57]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [58]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [59]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a373eafd0>

In [63]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 96.97%


In [60]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([1, 1, 1, 0, 1, 1, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 1, 0, 2, 1,
       1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 0, 2, 0, 0, 1, 1, 1,
       0, 1, 1, 2, 2, 0, 0, 2, 1, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 1, 2, 2,
       0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 2, 1, 1, 0, 1, 2, 0, 2, 0, 0, 1,
       0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 0,
       1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 0, 0, 2, 1, 1, 0, 1, 2, 2, 2, 0, 1,
       0, 2, 1, 2, 0, 0, 1, 2, 1, 2, 2, 0, 0, 2, 0, 1, 1, 0, 0, 0, 1, 2,
       2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 1, 0, 2, 0, 2])

In [61]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,NRS110,1,1
1,NRS216,1,1
2,NRS386,1,1
3,CFBRSa25,0,0
4,BCH-SA-03,1,1
...,...,...,...
193,NRS216,1,1
194,CFBREBSa110,0,0
195,NRS148,2,2
196,GA27,0,0


In [62]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [63]:
dat_proba4

Unnamed: 0,0,1,2
0,0.002259,0.997741,8.019265e-12
1,0.023295,0.976704,2.383147e-07
2,0.038680,0.961320,1.034880e-09
3,0.999643,0.000357,1.617060e-12
4,0.010256,0.989744,5.751620e-11
...,...,...,...
193,0.023295,0.976704,2.383147e-07
194,0.999738,0.000262,7.801651e-09
195,0.000003,0.001161,9.988357e-01
196,0.999186,0.000814,2.547308e-17


In [64]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [65]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4p002ypST.csv", index = False,
         header=None)

In [67]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [68]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [137]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [138]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [139]:
y_prob4 = df_proba4[df_proba4['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[2.25869940e-03, 9.97741340e-01, 8.01926500e-12],
       [2.32953320e-02, 9.76704500e-01, 2.38314660e-07],
       [3.86803600e-02, 9.61319700e-01, 1.03487970e-09],
       [9.99643450e-01, 3.56529520e-04, 1.61705980e-12],
       [1.02559660e-02, 9.89744000e-01, 5.75162000e-11],
       [2.25869940e-03, 9.97741340e-01, 8.01926500e-12],
       [3.23752900e-05, 8.90931600e-05, 9.99878500e-01],
       [3.23752900e-05, 8.90931600e-05, 9.99878500e-01],
       [3.23752900e-05, 8.90931600e-05, 9.99878500e-01],
       [9.99736500e-01, 2.48903100e-04, 1.45464730e-05],
       [3.02686100e-06, 1.16119230e-03, 9.98835740e-01],
       [9.82613500e-01, 1.73864960e-02, 2.57739800e-10],
       [9.95524300e-01, 4.47564850e-03, 7.19103850e-08],
       [9.71900460e-01, 2.80995740e-02, 2.64107350e-11],
       [9.99098660e-01, 9.01342840e-04, 1.45180080e-12],
       [3.23752900e-05, 8.90931600e-05, 9.99878500e-01],
       [9.99938130e-01, 6.18358900e-05, 3.06039480e-09],
       [3.23752900e-05, 8.90931

In [140]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.9964799510254055

In [141]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.9964799510254055

In [142]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.9957912457912458

In [144]:
np.std(ovos)

0.002770321025286243

In [145]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.9957912457912458

In [146]:
np.std(ovrs)

0.002770321025286243

In [69]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [70]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 97.35%


In [71]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.002186922149028386


In [72]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [73]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 99.99%


In [74]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 5.5162433e-05


In [147]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=567,
                                                    stratify=y_over)

In [148]:
dat5 = pd.DataFrame(X_test_over[:,0])
dat5['test'] = y_test_over

In [149]:
dat5

Unnamed: 0,0,test
0,NRS241,1
1,NRS148,2
2,NRS255,1
3,NRS214,0
4,NRS148,2
...,...,...
193,CFBRSa30,0
194,NRS266,1
195,SR4152,0
196,NRS109,2


In [150]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [151]:
#### add regularizer and dropout
model1_over5 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [71]:
model1_over5.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [72]:
model1_over5.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a38c92748>

In [92]:
acc_test_over5 = model1_over5.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over5*100))

over-sampling test accuracy: 94.44%


In [73]:
pred5 = model1_over5.predict_classes(X_test_over)
pred5

array([1, 2, 1, 0, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1,
       2, 2, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 1, 1, 2,
       0, 0, 1, 2, 2, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 2, 2, 2, 2, 0, 2,
       2, 2, 0, 1, 2, 1, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 2, 2, 0, 0,
       2, 1, 0, 1, 2, 0, 2, 1, 1, 0, 2, 0, 2, 0, 1, 2, 0, 1, 1, 1, 2, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 0, 1, 0, 2, 1,
       1, 1, 1, 0, 1, 0, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2,
       0, 1, 0, 0, 2, 1, 1, 1, 1, 2, 2, 1, 2, 0, 1, 0, 2, 1, 2, 0, 1, 2,
       2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 1, 1, 0, 2, 1, 0, 1, 0, 1, 0, 2, 1])

In [74]:
dat5['pred'] = pred5
dat5

Unnamed: 0,0,test,pred
0,NRS241,1,1
1,NRS148,2,2
2,NRS255,1,1
3,NRS214,0,0
4,NRS148,2,2
...,...,...,...
193,CFBRSa30,0,0
194,NRS266,1,1
195,SR4152,0,0
196,NRS109,2,2


In [75]:
proba5 = model1_over5.predict_proba(X_test_over)
dat_proba5 = pd.DataFrame(proba5)

In [76]:
dat_proba5

Unnamed: 0,0,1,2
0,1.010424e-03,9.989893e-01,2.661334e-07
1,1.171583e-14,1.380678e-15,1.000000e+00
2,6.208902e-13,1.000000e+00,2.522147e-14
3,1.000000e+00,1.010950e-12,7.298213e-14
4,1.171583e-14,1.380678e-15,1.000000e+00
...,...,...,...
193,1.000000e+00,3.208899e-15,1.537287e-16
194,6.152107e-09,1.000000e+00,1.621104e-10
195,1.000000e+00,1.045865e-11,8.918558e-13
196,4.125410e-12,1.115170e-12,1.000000e+00


In [77]:
dat_proba5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba5.csv", index = False,
         header=None)

In [78]:
dat5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/5p002ypST.csv", index = False,
         header=None)

In [96]:
hist1_over5 = model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [97]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over5.history['accuracy'])*100))

over-sampling train accuracy: 79.92%


In [152]:
df_proba5 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [153]:
df_proba5

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS241,1,1,1.342914e-03,9.986569e-01,2.348628e-07
1,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
2,p002ykpresabsSTCC_qual,NRS255,1,1,1.780311e-07,9.999999e-01,2.544841e-12
3,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,2.203547e-10,5.688883e-15
4,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,BCH-SA-12,0,0,1.000000e+00,1.152503e-09,1.898730e-09
1978,pyopresabsSTCC_qual,NRS049,0,1,8.401357e-11,1.000000e+00,3.209735e-13
1979,pyopresabsSTCC_qual,NRS022,0,0,1.000000e+00,4.755084e-10,1.974275e-10
1980,pyopresabsSTCC_qual,NRS236,1,1,1.357345e-08,1.000000e+00,1.293117e-10


In [154]:
y_prob5 = df_proba5[df_proba5['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob5 = y_prob5.to_numpy()
y_prob5

array([[1.01042390e-03, 9.98989300e-01, 2.66133360e-07],
       [1.17158345e-14, 1.38067790e-15, 1.00000000e+00],
       [6.20890200e-13, 1.00000000e+00, 2.52214660e-14],
       [1.00000000e+00, 1.01095010e-12, 7.29821300e-14],
       [1.17158345e-14, 1.38067790e-15, 1.00000000e+00],
       [6.59399170e-06, 9.99993440e-01, 1.21551270e-09],
       [4.12541800e-12, 1.11517380e-12, 1.00000000e+00],
       [2.42376170e-11, 8.64185900e-12, 1.00000000e+00],
       [7.05474650e-08, 9.99999900e-01, 3.39856570e-09],
       [3.29710060e-05, 9.99967000e-01, 8.31547100e-11],
       [1.17158345e-14, 1.38067790e-15, 1.00000000e+00],
       [2.42376170e-11, 8.64185900e-12, 1.00000000e+00],
       [4.12541800e-12, 1.11517380e-12, 1.00000000e+00],
       [8.64502100e-09, 1.00000000e+00, 2.47795400e-10],
       [1.00000000e+00, 6.37299230e-24, 7.31899400e-26],
       [4.12541800e-12, 1.11517380e-12, 1.00000000e+00],
       [1.17158345e-14, 1.38067790e-15, 1.00000000e+00],
       [2.98721130e-05, 9.99965

In [155]:
ovo5 = rocauc_ovo(y_test_over, y_prob5, average="macro", multi_class="ovo")
ovo5

0.9891720232629323

In [156]:
ovr5 = rocauc_ovr(y_test_over, y_prob5, average="macro", multi_class="ovr")
ovr5

0.9891720232629323

In [157]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=678,
                                                    stratify=y_over)

In [158]:
dat6 = pd.DataFrame(X_test_over[:,0])
dat6['test'] = y_test_over

In [159]:
dat6

Unnamed: 0,0,test
0,NRS209,2
1,NRS386,1
2,NRS148,2
3,NRS178,0
4,NRS237,0
...,...,...
193,NRS209,2
194,NRS002,0
195,NRS109,2
196,BCH-SA-03,1


In [160]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [83]:
model1_over6 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [84]:
model1_over6.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [85]:
model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3950ea90>

In [109]:
acc_test_over6 = model1_over6.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over6*100))

over-sampling test accuracy: 94.44%


In [86]:
pred6 = model1_over6.predict_classes(X_test_over)
pred6

array([2, 1, 2, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 2, 0,
       0, 0, 1, 2, 1, 2, 2, 0, 1, 0, 2, 2, 1, 0, 0, 2, 0, 2, 1, 2, 1, 2,
       2, 2, 2, 0, 0, 1, 2, 0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 0, 1, 2, 2,
       0, 0, 1, 2, 1, 0, 1, 2, 0, 1, 0, 2, 1, 2, 2, 1, 0, 2, 0, 1, 0, 0,
       0, 2, 2, 0, 1, 0, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 1, 2, 2, 1, 0, 1,
       2, 1, 0, 2, 1, 1, 2, 1, 2, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 1, 0,
       1, 1, 2, 0, 2, 2, 0, 1, 0, 1, 1, 0, 2, 0, 2, 1, 0, 2, 0, 2, 1, 2,
       1, 2, 1, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 1, 0, 0, 0, 1, 2, 1,
       0, 1, 1, 0, 0, 2, 1, 0, 1, 1, 1, 0, 2, 1, 2, 2, 1, 2, 0, 2, 1, 1])

In [87]:
dat6['pred'] = pred6
dat6

Unnamed: 0,0,test,pred
0,NRS209,2,2
1,NRS386,1,1
2,NRS148,2,2
3,NRS178,0,1
4,NRS237,0,0
...,...,...,...
193,NRS209,2,2
194,NRS002,0,0
195,NRS109,2,2
196,BCH-SA-03,1,1


In [88]:
proba6 = model1_over6.predict_proba(X_test_over)
dat_proba6 = pd.DataFrame(proba6)

In [89]:
dat_proba6

Unnamed: 0,0,1,2
0,2.090142e-08,2.289380e-08,1.000000e+00
1,2.581215e-04,9.997348e-01,6.993341e-06
2,7.165129e-09,2.305962e-09,1.000000e+00
3,4.373221e-12,1.000000e+00,4.699599e-16
4,1.000000e+00,2.403704e-10,4.481151e-11
...,...,...,...
193,2.090142e-08,2.289380e-08,1.000000e+00
194,8.901500e-01,1.098500e-01,7.356455e-09
195,1.026436e-08,6.748064e-08,9.999999e-01
196,2.584839e-07,9.999998e-01,4.213840e-13


In [90]:
dat_proba6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba6.csv", index = False,
         header=None)

In [91]:
dat6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/6p002ypST.csv", index = False,
         header=None)

In [113]:
hist1_over6 = model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [114]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over6.history['accuracy'])*100))

over-sampling train accuracy: 79.57%


In [161]:
df_proba6 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [162]:
df_proba6

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,1.790400e-08,4.141849e-08,1.000000e+00
1,p002ykpresabsSTCC_qual,NRS386,1,1,5.739934e-04,9.994259e-01,6.773014e-08
2,p002ykpresabsSTCC_qual,NRS148,2,2,5.286934e-09,1.269109e-08,1.000000e+00
3,p002ykpresabsSTCC_qual,NRS178,0,1,6.494936e-12,1.000000e+00,2.537080e-25
4,p002ykpresabsSTCC_qual,NRS237,0,1,5.701098e-02,9.399204e-01,3.068583e-03
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS272,0,0,9.999607e-01,3.367024e-05,5.776848e-06
1978,pyopresabsSTCC_qual,NRS112,1,1,8.275442e-08,9.999999e-01,3.739556e-09
1979,pyopresabsSTCC_qual,NRS064,1,1,2.168245e-08,1.000000e+00,9.603962e-09
1980,pyopresabsSTCC_qual,BCH-SA-04,0,0,1.000000e+00,1.026408e-15,1.630406e-14


In [163]:
y_prob6 = df_proba6[df_proba6['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob6 = y_prob6.to_numpy()
y_prob6

array([[2.09014200e-08, 2.28938020e-08, 1.00000000e+00],
       [2.58121460e-04, 9.99734800e-01, 6.99334120e-06],
       [7.16512900e-09, 2.30596230e-09, 1.00000000e+00],
       [4.37322050e-12, 1.00000000e+00, 4.69959900e-16],
       [1.00000000e+00, 2.40370420e-10, 4.48115120e-11],
       [6.21077400e-05, 9.99937900e-01, 7.19250600e-09],
       [2.58483650e-07, 9.99999760e-01, 4.21383980e-13],
       [2.09014200e-08, 2.28938020e-08, 1.00000000e+00],
       [6.21077400e-05, 9.99937900e-01, 7.19250600e-09],
       [1.11004330e-01, 8.67309500e-01, 2.16861300e-02],
       [2.58121460e-04, 9.99734800e-01, 6.99334120e-06],
       [8.98108850e-07, 9.99999050e-01, 5.97431840e-09],
       [1.75746940e-08, 1.00000000e+00, 3.12505580e-11],
       [1.34828690e-02, 9.86517200e-01, 9.37456000e-10],
       [1.00000000e+00, 9.04878800e-19, 1.96243880e-20],
       [2.58483650e-07, 9.99999760e-01, 4.21383980e-13],
       [1.00000000e+00, 1.55860100e-11, 1.04412880e-14],
       [1.30124600e-06, 9.99998

In [164]:
ovo6 = rocauc_ovo(y_test_over, y_prob6, average="macro", multi_class="ovo")
ovo6

0.9815388735843281

In [165]:
ovr6 = rocauc_ovr(y_test_over, y_prob6, average="macro", multi_class="ovr")
ovr6

0.9815388735843281

In [166]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=789,
                                                    stratify=y_over)

In [167]:
dat7 = pd.DataFrame(X_test_over[:,0])
dat7['test'] = y_test_over

In [168]:
dat7

Unnamed: 0,0,test
0,NRS209,2
1,BCH-SA-09,1
2,NRS224,0
3,NRS209,2
4,NRS235,1
...,...,...
193,NRS209,2
194,CFBREBSa131,1
195,CFBREBSa103,0
196,NRS188,1


In [169]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [96]:
model1_over7 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [97]:
model1_over7.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3a45f390>

In [122]:
acc_test_over7 = model1_over7.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over7*100))

over-sampling test accuracy: 91.41%


In [99]:
pred7 = model1_over7.predict_classes(X_test_over)
pred7

array([2, 1, 0, 2, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 2, 2, 0, 2, 1,
       2, 0, 0, 2, 1, 0, 1, 0, 1, 2, 1, 2, 2, 2, 1, 2, 0, 0, 2, 1, 1, 0,
       2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 0, 2, 0, 0, 0, 0,
       0, 2, 0, 2, 0, 1, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 2, 1, 0, 1, 1, 2, 1,
       1, 0, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 1, 2, 0, 0, 0,
       2, 2, 1, 1, 0, 1, 0, 1, 2, 0, 0, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2,
       0, 2, 2, 0, 0, 1, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2, 0, 2, 2, 0, 2,
       0, 0, 2, 0, 1, 2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 0, 1, 2, 1, 0, 1, 2])

In [100]:
dat7['pred'] = pred7
dat7

Unnamed: 0,0,test,pred
0,NRS209,2,2
1,BCH-SA-09,1,1
2,NRS224,0,0
3,NRS209,2,2
4,NRS235,1,1
...,...,...,...
193,NRS209,2,2
194,CFBREBSa131,1,1
195,CFBREBSa103,0,0
196,NRS188,1,1


In [101]:
proba7 = model1_over7.predict_proba(X_test_over)
dat_proba7 = pd.DataFrame(proba7)

In [102]:
dat_proba7

Unnamed: 0,0,1,2
0,9.617234e-09,1.121028e-08,1.000000e+00
1,2.367589e-05,9.999763e-01,3.152567e-08
2,1.000000e+00,1.816863e-14,4.248027e-12
3,9.617234e-09,1.121028e-08,1.000000e+00
4,8.324296e-08,9.999999e-01,1.054044e-09
...,...,...,...
193,9.617215e-09,1.121026e-08,1.000000e+00
194,7.296561e-02,8.494496e-01,7.758473e-02
195,1.000000e+00,7.829268e-15,1.843543e-17
196,1.117732e-08,1.000000e+00,4.808631e-10


In [103]:
dat_proba7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba7.csv", index = False,
         header=None)

In [104]:
dat7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/7p002ypST.csv", index = False,
         header=None)

In [126]:
hist1_over7 = model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [127]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over7.history['accuracy'])*100))

over-sampling train accuracy: 78.82%


In [170]:
df_proba7 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [171]:
df_proba7

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
1,p002ykpresabsSTCC_qual,BCH-SA-09,1,1,1.137139e-06,9.999988e-01,2.067601e-09
2,p002ykpresabsSTCC_qual,NRS224,0,0,1.000000e+00,2.093110e-31,0.000000e+00
3,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
4,p002ykpresabsSTCC_qual,NRS235,1,1,2.243513e-02,9.774035e-01,1.615106e-04
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS035,0,0,9.354528e-01,6.414209e-02,4.051121e-04
1978,pyopresabsSTCC_qual,NRS260,1,1,4.808470e-08,1.000000e+00,7.364639e-09
1979,pyopresabsSTCC_qual,CA9,0,0,1.000000e+00,2.361323e-08,2.871247e-08
1980,pyopresabsSTCC_qual,NRS183,1,1,2.755864e-07,9.999998e-01,5.310879e-08


In [172]:
y_prob7 = df_proba7[df_proba7['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob7 = y_prob7.to_numpy()
y_prob7

array([[9.61723400e-09, 1.12102800e-08, 1.00000000e+00],
       [2.36758900e-05, 9.99976300e-01, 3.15256700e-08],
       [1.00000000e+00, 1.81686280e-14, 4.24802700e-12],
       [9.61723400e-09, 1.12102800e-08, 1.00000000e+00],
       [8.32429600e-08, 9.99999900e-01, 1.05404420e-09],
       [4.62640420e-05, 9.99953600e-01, 9.47458100e-08],
       [7.88794700e-08, 9.99999900e-01, 6.59497940e-11],
       [9.61723400e-09, 1.12102800e-08, 1.00000000e+00],
       [7.29656140e-02, 8.49449630e-01, 7.75847300e-02],
       [8.32429600e-08, 9.99999900e-01, 1.05404420e-09],
       [2.63931080e-08, 1.00000000e+00, 1.31076380e-09],
       [1.00000000e+00, 1.02502680e-16, 6.67100450e-14],
       [1.00000000e+00, 6.40438040e-17, 7.43121200e-20],
       [1.00000000e+00, 5.07298100e-14, 1.57200220e-16],
       [7.88794700e-08, 9.99999900e-01, 6.59497940e-11],
       [1.54003670e-08, 1.82290110e-08, 1.00000000e+00],
       [7.29656140e-02, 8.49449630e-01, 7.75847300e-02],
       [9.61723400e-09, 1.12102

In [173]:
ovo7 = rocauc_ovo(y_test_over, y_prob7, average="macro", multi_class="ovo")
ovo7

0.9852502295684115

In [174]:
ovr7 = rocauc_ovr(y_test_over, y_prob7, average="macro", multi_class="ovr")
ovr7

0.9852502295684115

In [175]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=890,
                                                    stratify=y_over)

In [176]:
dat8 = pd.DataFrame(X_test_over[:,0])
dat8['test'] = y_test_over

In [177]:
dat8

Unnamed: 0,0,test
0,CFBREBSa116,0
1,NRS214,0
2,NRS148,2
3,NRS148,2
4,NRS148,2
...,...,...
193,NRS148,2
194,NRS054,0
195,NRS109,2
196,NRS216,1


In [178]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [109]:
model1_over8 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [110]:
model1_over8.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3b05c2b0>

In [135]:
acc_test_over8 = model1_over8.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over8*100))

over-sampling test accuracy: 94.44%


In [112]:
pred8 = model1_over8.predict_classes(X_test_over)
pred8

array([0, 0, 2, 2, 2, 1, 2, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 2, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 2, 1, 0, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 2,
       2, 2, 2, 1, 1, 1, 2, 2, 1, 0, 2, 1, 1, 2, 1, 0, 0, 1, 2, 0, 2, 2,
       1, 2, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 2, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 2, 0, 2, 0, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 0,
       2, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2, 1, 0, 2, 2, 1, 1, 2, 2, 2, 2, 0,
       0, 2, 2, 1, 1, 0, 1, 2, 1, 0, 0, 1, 2, 0, 0, 2, 1, 0, 0, 1, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 2, 0, 0, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 1, 0, 1, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1])

In [113]:
dat8['pred'] = pred8
dat8

Unnamed: 0,0,test,pred
0,CFBREBSa116,0,0
1,NRS214,0,0
2,NRS148,2,2
3,NRS148,2,2
4,NRS148,2,2
...,...,...,...
193,NRS148,2,2
194,NRS054,0,0
195,NRS109,2,2
196,NRS216,1,1


In [114]:
proba8 = model1_over8.predict_proba(X_test_over)
dat_proba8 = pd.DataFrame(proba8)

In [115]:
dat_proba8

Unnamed: 0,0,1,2
0,1.000000e+00,2.989959e-17,2.267019e-16
1,1.000000e+00,7.571602e-11,6.163611e-11
2,1.903322e-11,9.252212e-12,1.000000e+00
3,1.903322e-11,9.252212e-12,1.000000e+00
4,1.903322e-11,9.252212e-12,1.000000e+00
...,...,...,...
193,1.903325e-11,9.252230e-12,1.000000e+00
194,9.999892e-01,7.644894e-06,3.258920e-06
195,3.963342e-13,1.159683e-13,1.000000e+00
196,9.209495e-09,1.000000e+00,1.830634e-10


In [116]:
dat_proba8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba8.csv", index = False,
         header=None)

In [117]:
dat8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/8p002ypST.csv", index = False,
         header=None)

In [139]:
hist1_over8 = model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 462 samples, validate on 198 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
 64/462 [===>..........................] - ETA: 0s - loss: 0.4037 - accuracy: 0.8125



Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [140]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over8.history['accuracy'])*100))

over-sampling train accuracy: 79.55%


In [179]:
df_proba8 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [180]:
df_proba8

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,CFBREBSa116,0,0,9.676203e-01,3.237956e-02,1.480166e-07
1,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,6.534852e-11,2.250731e-18
2,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
3,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
4,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS205,2,2,3.691095e-08,3.571927e-08,9.999999e-01
1978,pyopresabsSTCC_qual,CFBREBSa122,0,1,9.261665e-02,9.073822e-01,1.162373e-06
1979,pyopresabsSTCC_qual,NRS001,1,1,4.174278e-07,9.999995e-01,3.254024e-09
1980,pyopresabsSTCC_qual,NRS148,2,2,3.234670e-08,3.121212e-08,9.999999e-01


In [181]:
y_prob8 = df_proba8[df_proba8['phage']=='p002ypresabsSTCC_qual'].iloc[:,-3:]
y_prob8 = y_prob8.to_numpy()
y_prob8

array([[1.00000000e+00, 2.98995880e-17, 2.26701880e-16],
       [1.00000000e+00, 7.57160200e-11, 6.16361060e-11],
       [1.90332160e-11, 9.25221200e-12, 1.00000000e+00],
       [1.90332160e-11, 9.25221200e-12, 1.00000000e+00],
       [1.90332160e-11, 9.25221200e-12, 1.00000000e+00],
       [9.20947800e-09, 1.00000000e+00, 1.83062750e-10],
       [1.90332160e-11, 9.25221200e-12, 1.00000000e+00],
       [1.00000000e+00, 3.42661080e-14, 1.20149440e-13],
       [1.00000000e+00, 2.47253750e-11, 2.14327900e-11],
       [5.53588500e-09, 1.00000000e+00, 7.43924440e-11],
       [1.00000000e+00, 2.66637300e-14, 2.73315330e-13],
       [3.96337230e-13, 1.15969180e-13, 1.00000000e+00],
       [5.53588500e-09, 1.00000000e+00, 7.43924440e-11],
       [1.00000000e+00, 6.36831000e-14, 7.71469950e-14],
       [7.14991730e-10, 1.00000000e+00, 3.25342270e-13],
       [4.70374100e-09, 4.20116160e-09, 1.00000000e+00],
       [7.93568000e-02, 8.78490870e-01, 4.21523230e-02],
       [1.90332160e-11, 9.25221

In [182]:
ovo8 = rocauc_ovo(y_test_over, y_prob8, average="macro", multi_class="ovo")
ovo8

0.977502295684114

In [183]:
ovr8 = rocauc_ovr(y_test_over, y_prob8, average="macro", multi_class="ovr")
ovr8

0.977502295684114

In [186]:
ovos2 = [ovo5, ovo6, ovo7, ovo8]
np.mean(ovos2)

0.9833658555249465

In [187]:
np.std(ovos2)

0.004329597162429823

In [188]:
ovrs2 = [ovr5, ovr6, ovr7, ovr8]
np.mean(ovrs2)

0.9833658555249465

In [189]:
np.std(ovrs2)

0.004329597162429823

In [141]:
accs_reg = [acc_test_over5, acc_test_over6, acc_test_over7, acc_test_over8]

In [142]:
mean_reg = np.mean(accs_reg)
print('over-sampling test accuracy regularization mean: %.2f%%' % (mean_reg*100))

over-sampling test accuracy regularization mean: 93.69%


In [143]:
std_reg = np.std(accs_reg)
print('over-sampling test accuracy regularization standard deviation:', std_reg)

over-sampling test accuracy regularization standard deviation: 0.013121584513306875


In [144]:
accs_train_reg = [np.mean(hist1_over5.history['accuracy']), np.mean(hist1_over6.history['accuracy']), np.mean(hist1_over7.history['accuracy']),
             np.mean(hist1_over8.history['accuracy'])]

In [145]:
mean_train_reg = np.mean(accs_train_reg)
print('over-sampling train accuracy regularization mean: %.2f%%' % (mean_train_reg*100))

over-sampling train accuracy regularization mean: 79.46%


In [146]:
std_train_reg = np.std(accs_train_reg)
print('over-sampling train accuracy regularization standard deviation:', std_train_reg)

over-sampling train accuracy regularization standard deviation: 0.0039923233
