In [None]:
## This file implements neural networks with/without dropout and regularizer for pyopresabs_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for each model.

In [1]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [2]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/pyopresabs_qual.csv')
df.shape

(253, 612)

In [3]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [4]:
df['pheno']

0      0
1      0
2      0
3      0
4      0
      ..
248    0
249    0
250    0
251    0
252    0
Name: pheno, Length: 253, dtype: int64

In [5]:
df.head()

Unnamed: 0,id,TTTTCCCCCAT,TTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTTC,TGGGTCTGAC,TCCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGT,TATATAGACTG,TAGTCGCACT,TAAGAATAATATATTAAATATTTATTAACAAATTATAGATAAAATATGAATAATTAATTAATGGTATTTACATATTCATAACC,GGGCTGAGG,GAGCAACCTT,...,group_8644,group_8645,group_8646,group_8815,group_8892,group_9007,group_9104,group_9110,group_9207,pheno
0,107,0,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,109,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,115,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,120335,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,120337,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df['pheno'].value_counts()

0    217
1     32
2      4
Name: pheno, dtype: int64

In [7]:
df_clean = df.drop(columns=['id'])

In [8]:
df_clean.shape

(253, 611)

In [9]:
df_clean.head()

Unnamed: 0,TTTTCCCCCAT,TTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTTC,TGGGTCTGAC,TCCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGT,TATATAGACTG,TAGTCGCACT,TAAGAATAATATATTAAATATTTATTAACAAATTATAGATAAAATATGAATAATTAATTAATGGTATTTACATATTCATAACC,GGGCTGAGG,GAGCAACCTT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGT,...,group_8644,group_8645,group_8646,group_8815,group_8892,group_9007,group_9104,group_9110,group_9207,pheno
0,0,1,1,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 611) (253,)


In [11]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

Using TensorFlow backend.


[(0, 217), (1, 217), (2, 217)]




In [12]:
############# Fully-Connected Neural Network ################

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [14]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [15]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [16]:
dat

Unnamed: 0,0,test
0,CFBRSa07,0
1,CFBRSa66A,0
2,NRS112,1
3,NRS211,0
4,CFBRSa22,0
...,...,...
191,NRS148,2
192,NRS255,2
193,NRS205,2
194,NRS255,2


In [17]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a345325f8>

In [21]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 97.45%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([0, 0, 1, 1, 0, 1, 1, 0, 2, 1, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 2, 2, 2, 1, 1, 1, 0, 1, 1, 1, 2, 0, 0, 2, 1, 1,
       0, 1, 2, 0, 0, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1,
       0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 0, 0, 0, 1, 2, 1, 0, 2, 1,
       1, 1, 0, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 0, 2, 1, 0, 0,
       0, 2, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 2, 2,
       1, 1, 2, 1, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 2, 2, 2, 2])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,CFBRSa07,0,0
1,CFBRSa66A,0,0
2,NRS112,1,1
3,NRS211,0,1
4,CFBRSa22,0,0
...,...,...,...
191,NRS148,2,2
192,NRS255,2,2
193,NRS205,2,2
194,NRS255,2,2


In [23]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [24]:
dat_proba1

Unnamed: 0,0,1,2
0,9.969317e-01,3.025032e-03,4.328353e-05
1,9.999998e-01,1.815897e-07,6.515274e-11
2,5.695107e-04,9.993972e-01,3.336425e-05
3,2.700199e-01,7.291656e-01,8.145223e-04
4,9.999968e-01,3.227481e-06,1.450214e-10
...,...,...,...
191,6.744767e-07,1.196296e-06,9.999981e-01
192,5.102197e-05,2.256726e-04,9.997233e-01
193,2.193248e-06,1.811189e-07,9.999976e-01
194,5.102197e-05,2.256726e-04,9.997233e-01


In [25]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [26]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1pyop.csv", index = False,
         header=None)

In [25]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [26]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [19]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [20]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [21]:
y_prob = df_proba[df_proba['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[9.96931700e-01, 3.02503200e-03, 4.32835300e-05],
       [9.99999760e-01, 1.81589700e-07, 6.51527400e-11],
       [5.69510700e-04, 9.99397160e-01, 3.33642500e-05],
       [2.70019860e-01, 7.29165550e-01, 8.14522300e-04],
       [9.99996800e-01, 3.22748060e-06, 1.45021380e-10],
       [3.07954480e-03, 9.96917000e-01, 3.42955170e-06],
       [4.05356750e-03, 9.95901300e-01, 4.50550600e-05],
       [9.89232540e-01, 1.07656560e-02, 1.75003570e-06],
       [2.19324780e-06, 1.81118890e-07, 9.99997600e-01],
       [1.45743640e-03, 9.98413560e-01, 1.29059980e-04],
       [9.99489400e-01, 5.10568800e-04, 1.64358480e-08],
       [9.99999900e-01, 7.62459600e-08, 5.61407550e-11],
       [7.84250300e-07, 9.99939800e-01, 5.93978330e-05],
       [2.19324780e-06, 1.81118890e-07, 9.99997600e-01],
       [1.00000000e+00, 1.56779160e-13, 1.39752140e-10],
       [9.99923200e-01, 6.06750150e-05, 1.61404500e-05],
       [1.18362330e-03, 9.98816130e-01, 1.85832010e-07],
       [9.99888200e-01, 1.11831

In [22]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [23]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9949892346838912

In [24]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [25]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9949892346838912

In [26]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [27]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [28]:
dat2

Unnamed: 0,0,test
0,BCH-SA-04,0
1,NRS110,1
2,NRS109,2
3,NRS183,1
4,BCH-SA-05,0
...,...,...
191,NRS112,1
192,SR1065,0
193,NRS203,0
194,CFBREBSa129,0


In [29]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [31]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [32]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x630e3d0b8>

In [34]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 96.43%


In [34]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([0, 1, 2, 1, 0, 0, 2, 2, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 2, 1, 2, 1,
       2, 1, 0, 1, 0, 1, 2, 2, 2, 1, 1, 0, 0, 2, 0, 2, 1, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 2, 1, 2, 0, 1, 0, 0, 1, 2, 1, 0, 1, 2, 0, 1, 1, 1, 1,
       0, 2, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 2, 0, 1, 1, 1,
       2, 1, 1, 0, 1, 1, 1, 2, 0, 2, 0, 1, 2, 0, 2, 1, 0, 0, 1, 0, 2, 2,
       0, 2, 0, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 0, 1, 2, 1, 1, 1, 2, 2,
       0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 1,
       0, 2, 1, 2, 1, 0, 2, 0, 2, 1, 1, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0, 2,
       1, 0, 0, 0, 0, 1, 2, 2, 1, 0, 2, 1, 2, 0, 2, 1, 0, 0, 0, 0])

In [35]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,BCH-SA-04,0,0
1,NRS110,1,1
2,NRS109,2,2
3,NRS183,1,1
4,BCH-SA-05,0,0
...,...,...,...
191,NRS112,1,1
192,SR1065,0,0
193,NRS203,0,0
194,CFBREBSa129,0,0


In [36]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [37]:
dat_proba2

Unnamed: 0,0,1,2
0,0.999987,2.188299e-06,1.047621e-05
1,0.000192,9.998078e-01,2.152596e-07
2,0.000101,1.293190e-04,9.997700e-01
3,0.001240,9.984179e-01,3.424566e-04
4,0.999987,1.278588e-05,1.477784e-07
...,...,...,...
191,0.001045,9.988925e-01,6.241521e-05
192,0.999998,1.182363e-06,1.043173e-06
193,0.997402,2.434911e-03,1.627142e-04
194,1.000000,2.506735e-12,1.521137e-08


In [38]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [39]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2pyop.csv", index = False,
         header=None)

In [38]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [39]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [30]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [31]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [32]:
y_prob2 = df_proba2[df_proba2['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[9.99987360e-01, 2.18829880e-06, 1.04762130e-05],
       [1.92046300e-04, 9.99807800e-01, 2.15259630e-07],
       [1.00679090e-04, 1.29319010e-04, 9.99770000e-01],
       [1.23976230e-03, 9.98417850e-01, 3.42456600e-04],
       [9.99987100e-01, 1.27858760e-05, 1.47778410e-07],
       [9.99999170e-01, 6.69225400e-07, 1.50855940e-07],
       [6.12886800e-05, 1.87066410e-04, 9.99751600e-01],
       [5.19854740e-08, 8.26063230e-07, 9.99999170e-01],
       [1.00679090e-04, 1.29319010e-04, 9.99770000e-01],
       [8.28986400e-08, 2.73671840e-07, 9.99999640e-01],
       [2.15213460e-05, 9.99350250e-01, 6.28237100e-04],
       [9.99892700e-01, 1.03719045e-04, 3.55666040e-06],
       [2.93898200e-03, 9.97053600e-01, 7.35788170e-06],
       [1.00679090e-04, 1.29319010e-04, 9.99770000e-01],
       [8.03582600e-01, 1.96370420e-01, 4.69492330e-05],
       [9.87799400e-01, 1.20986850e-02, 1.01860340e-04],
       [5.19854740e-08, 8.26063230e-07, 9.99999170e-01],
       [2.15213460e-05, 9.99350

In [33]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9952463566967383

In [34]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9952463566967383

In [35]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [36]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [37]:
dat3

Unnamed: 0,0,test
0,NRS168,1
1,NRS383,1
2,NRS148,2
3,NRS109,2
4,NRS213,0
...,...,...
191,NRS255,2
192,NRS255,2
193,NRS266,1
194,NRS001,1


In [38]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [44]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [45]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x6312f1978>

In [47]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 97.45%


In [47]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 2, 2, 1, 1,
       0, 2, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 1, 1, 0, 2, 1, 1, 1, 2, 1, 2,
       0, 1, 2, 1, 1, 2, 0, 2, 1, 1, 0, 2, 0, 1, 2, 0, 2, 1, 0, 0, 2, 0,
       1, 0, 2, 1, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 2, 0, 0, 2, 0, 2, 1, 1,
       2, 1, 2, 1, 0, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 0, 2, 0, 2, 2,
       1, 2, 1, 1, 0, 1, 1, 2, 2, 1, 1, 2, 0, 2, 1, 1, 0, 1, 0, 2, 2, 0,
       1, 2, 2, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 1, 2, 0, 2, 2, 0, 2, 1,
       0, 1, 2, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 2,
       2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 1, 0, 2, 2, 2, 1, 1, 1])

In [48]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,NRS168,1,1
1,NRS383,1,1
2,NRS148,2,2
3,NRS109,2,2
4,NRS213,0,0
...,...,...,...
191,NRS255,2,2
192,NRS255,2,2
193,NRS266,1,1
194,NRS001,1,1


In [49]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [50]:
dat_proba3

Unnamed: 0,0,1,2
0,1.067561e-03,9.986314e-01,3.009572e-04
1,3.340373e-03,9.966537e-01,5.995487e-06
2,1.013187e-08,3.012955e-07,9.999996e-01
3,3.516528e-04,2.067980e-04,9.994416e-01
4,9.986980e-01,8.186222e-04,4.833555e-04
...,...,...,...
191,3.268514e-05,1.174866e-04,9.998498e-01
192,3.268514e-05,1.174866e-04,9.998498e-01
193,3.445794e-03,9.965532e-01,1.058171e-06
194,4.533626e-04,9.995466e-01,1.211587e-08


In [51]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [52]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3pyop.csv", index = False,
         header=None)

In [51]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [52]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [39]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [40]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [41]:
y_prob3 = df_proba3[df_proba3['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[1.06756050e-03, 9.98631400e-01, 3.00957180e-04],
       [3.34037300e-03, 9.96653700e-01, 5.99548730e-06],
       [1.01318710e-08, 3.01295500e-07, 9.99999640e-01],
       [3.51652800e-04, 2.06797980e-04, 9.99441560e-01],
       [9.98698000e-01, 8.18622200e-04, 4.83355500e-04],
       [9.99557800e-01, 4.41890270e-04, 2.97983500e-07],
       [1.00000000e+00, 3.68119740e-11, 8.43727300e-09],
       [3.51652800e-04, 2.06797980e-04, 9.99441560e-01],
       [1.00000000e+00, 3.51061050e-09, 7.80931360e-11],
       [1.27618570e-03, 9.98723700e-01, 8.18768500e-08],
       [9.98868000e-01, 9.97580800e-04, 1.34462620e-04],
       [9.95727300e-01, 4.02861140e-03, 2.44106430e-04],
       [3.12047450e-03, 9.96879100e-01, 4.98674500e-07],
       [9.96742670e-01, 2.66662540e-03, 5.90831860e-04],
       [6.92515260e-03, 9.93031140e-01, 4.36997200e-05],
       [5.49813850e-04, 9.99255600e-01, 1.94553290e-04],
       [9.99500750e-01, 6.29872960e-05, 4.36184800e-04],
       [9.14257100e-01, 8.57062

In [42]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.9960998712907111

In [43]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.9960998712907111

In [44]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [45]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [46]:
dat4

Unnamed: 0,0,test
0,NRS178,1
1,NRS109,2
2,NRS073,1
3,CFBREBSa119,0
4,NRS109,2
...,...,...
191,NRS236,1
192,NRS029,0
193,NRS148,2
194,CFBRSa28,0


In [47]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [57]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [58]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [59]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a34cfeef0>

In [60]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 95.92%


In [60]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([1, 2, 1, 0, 2, 1, 1, 2, 2, 0, 2, 1, 1, 2, 0, 1, 0, 2, 2, 0, 2, 1,
       1, 1, 1, 0, 2, 2, 1, 1, 2, 0, 1, 2, 2, 0, 0, 0, 0, 1, 1, 2, 1, 0,
       1, 1, 2, 2, 0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 1, 1, 1, 2, 1, 2, 0,
       0, 1, 1, 2, 0, 0, 1, 2, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 0, 0, 1, 0,
       0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0, 0, 2, 2, 1, 1, 1, 2, 0, 1,
       2, 2, 2, 0, 2, 2, 2, 1, 1, 0, 0, 2, 1, 1, 0, 1, 2, 2, 2, 0, 1, 0,
       2, 1, 2, 0, 1, 1, 1, 2, 2, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2,
       1, 1, 1, 2, 2, 0, 1, 2, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 1, 1, 2, 0, 2])

In [61]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,NRS178,1,1
1,NRS109,2,2
2,NRS073,1,1
3,CFBREBSa119,0,0
4,NRS109,2,2
...,...,...,...
191,NRS236,1,1
192,NRS029,0,1
193,NRS148,2,2
194,CFBRSa28,0,0


In [62]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [63]:
dat_proba4

Unnamed: 0,0,1,2
0,1.994046e-06,9.999955e-01,0.000002
1,7.460319e-05,1.494679e-04,0.999776
2,5.154975e-03,9.947492e-01,0.000096
3,9.997056e-01,2.888730e-04,0.000006
4,7.460319e-05,1.494679e-04,0.999776
...,...,...,...
191,5.748236e-04,9.993150e-01,0.000110
192,3.761070e-01,6.238710e-01,0.000022
193,3.515777e-07,6.264383e-07,0.999999
194,9.979942e-01,1.685220e-03,0.000321


In [64]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [65]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4pyop.csv", index = False,
         header=None)

In [64]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [65]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 100.00%


In [48]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [49]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [50]:
y_prob4 = df_proba4[df_proba4['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[1.9940455e-06, 9.9999547e-01, 2.4950780e-06],
       [7.4603190e-05, 1.4946793e-04, 9.9977595e-01],
       [5.1549748e-03, 9.9474925e-01, 9.5758376e-05],
       [9.9970560e-01, 2.8887298e-04, 5.5314354e-06],
       [7.4603190e-05, 1.4946793e-04, 9.9977595e-01],
       [1.6642529e-04, 9.9949074e-01, 3.4282426e-04],
       [1.0950358e-03, 9.9882406e-01, 8.0820140e-05],
       [7.4603190e-05, 1.4946793e-04, 9.9977595e-01],
       [9.7784790e-07, 5.8225172e-08, 9.9999890e-01],
       [9.9999964e-01, 3.6036045e-07, 3.4241030e-09],
       [7.4603190e-05, 1.4946793e-04, 9.9977595e-01],
       [1.6642529e-04, 9.9949074e-01, 3.4282426e-04],
       [8.5979220e-04, 9.9914014e-01, 1.1070867e-08],
       [9.7784790e-07, 5.8225172e-08, 9.9999890e-01],
       [9.9995494e-01, 4.5081684e-05, 5.5291700e-08],
       [3.9611794e-03, 9.9588263e-01, 1.5623847e-04],
       [9.9999570e-01, 4.2347747e-06, 5.8259615e-09],
       [3.5157770e-07, 6.2643830e-07, 9.9999905e-01],
       [1.9371908e-05, 2.869

In [51]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.9989083435648322

In [52]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.9989083435648322

In [53]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.9963109515590433

In [54]:
np.std(ovos)

0.0015549355155699358

In [55]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.9963109515590433

In [56]:
np.std(ovrs)

0.0015549355155699358

In [66]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [67]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 96.81%


In [68]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.0066277422767705065


In [69]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [70]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 100.00%


In [71]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 0.0


In [57]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=567,
                                                    stratify=y_over)

In [58]:
dat5 = pd.DataFrame(X_test_over[:,0])
dat5['test'] = y_test_over

In [59]:
dat5

Unnamed: 0,0,test
0,NRS255,2
1,NRS255,2
2,NRS386,1
3,NRS205,2
4,NRS205,2
...,...,...
191,BCH-SA-12,0
192,NRS049,0
193,NRS022,0
194,NRS236,1


In [60]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [61]:
#### add regularizer and dropout
model1_over5 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [71]:
model1_over5.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [72]:
model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3547e2e8>

In [79]:
acc_test_over5 = model1_over5.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over5*100))

over-sampling test accuracy: 96.43%


In [73]:
pred5 = model1_over5.predict_classes(X_test_over)
pred5

array([2, 2, 1, 2, 2, 1, 1, 0, 2, 2, 1, 2, 2, 1, 0, 1, 1, 0, 1, 2, 0, 1,
       0, 2, 0, 0, 1, 0, 2, 2, 2, 0, 1, 1, 2, 1, 1, 2, 0, 2, 1, 1, 1, 0,
       2, 0, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 0, 2, 2, 2, 1, 0, 1, 2, 2, 1,
       0, 0, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 2, 2, 2, 0, 1, 2, 0, 2, 1, 0,
       2, 0, 2, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 2, 1, 1, 1, 0, 1, 2, 2,
       1, 2, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 0, 1, 2, 1, 2, 1, 0, 1, 0, 2,
       0, 0, 1, 1, 2, 0, 1, 0, 2, 1, 1, 2, 2, 0, 2, 0, 0, 0, 0, 2, 1, 2,
       1, 1, 2, 2, 0, 0, 1, 0, 2, 1, 0, 1, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2,
       1, 1, 0, 1, 0, 1, 2, 1, 1, 2, 1, 0, 1, 1, 2, 0, 1, 0, 1, 2])

In [74]:
dat5['pred'] = pred5
dat5

Unnamed: 0,0,test,pred
0,NRS255,2,2
1,NRS255,2,2
2,NRS386,1,1
3,NRS205,2,2
4,NRS205,2,2
...,...,...,...
191,BCH-SA-12,0,0
192,NRS049,0,1
193,NRS022,0,0
194,NRS236,1,1


In [75]:
proba5 = model1_over5.predict_proba(X_test_over)
dat_proba5 = pd.DataFrame(proba5)

In [76]:
dat_proba5

Unnamed: 0,0,1,2
0,9.109882e-08,7.782663e-08,9.999999e-01
1,9.109882e-08,7.782663e-08,9.999999e-01
2,8.673664e-08,9.999999e-01,5.412694e-08
3,4.084002e-08,2.183209e-08,9.999999e-01
4,4.084002e-08,2.183209e-08,9.999999e-01
...,...,...,...
191,1.000000e+00,2.202963e-10,6.193051e-10
192,2.235976e-07,9.999998e-01,1.070886e-08
193,9.999999e-01,1.583399e-07,1.003328e-08
194,4.615438e-09,1.000000e+00,4.644775e-09


In [77]:
dat_proba5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba5.csv", index = False,
         header=None)

In [78]:
dat5.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/5pyop.csv", index = False,
         header=None)

In [83]:
hist1_over5 = model1_over5.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [84]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over5.history['accuracy'])*100))

over-sampling train accuracy: 80.42%


In [62]:
df_proba5 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [63]:
df_proba5

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS241,1,1,1.342914e-03,9.986569e-01,2.348628e-07
1,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
2,p002ykpresabsSTCC_qual,NRS255,1,1,1.780311e-07,9.999999e-01,2.544841e-12
3,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,2.203547e-10,5.688883e-15
4,p002ykpresabsSTCC_qual,NRS148,2,2,5.170289e-08,1.017893e-07,9.999999e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,BCH-SA-12,0,0,1.000000e+00,1.152503e-09,1.898730e-09
1978,pyopresabsSTCC_qual,NRS049,0,1,8.401357e-11,1.000000e+00,3.209735e-13
1979,pyopresabsSTCC_qual,NRS022,0,0,1.000000e+00,4.755084e-10,1.974275e-10
1980,pyopresabsSTCC_qual,NRS236,1,1,1.357345e-08,1.000000e+00,1.293117e-10


In [64]:
y_prob5 = df_proba5[df_proba5['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob5 = y_prob5.to_numpy()
y_prob5

array([[9.10988200e-08, 7.78266340e-08, 9.99999900e-01],
       [9.10988200e-08, 7.78266340e-08, 9.99999900e-01],
       [8.67366400e-08, 9.99999900e-01, 5.41269430e-08],
       [4.08400200e-08, 2.18320880e-08, 9.99999900e-01],
       [4.08400200e-08, 2.18320880e-08, 9.99999900e-01],
       [8.24899500e-09, 1.00000000e+00, 1.13398280e-08],
       [3.50563960e-04, 9.99548600e-01, 1.00752164e-04],
       [9.99997260e-01, 1.54527160e-06, 1.23060520e-06],
       [9.98193500e-08, 1.04520424e-07, 9.99999760e-01],
       [3.50179800e-08, 1.84950050e-08, 1.00000000e+00],
       [2.25788830e-08, 1.00000000e+00, 8.81947700e-10],
       [4.08400200e-08, 2.18320880e-08, 9.99999900e-01],
       [3.50179800e-08, 1.84950050e-08, 1.00000000e+00],
       [4.06691680e-08, 1.00000000e+00, 4.41087630e-08],
       [9.99771540e-01, 9.69215900e-05, 1.31523050e-04],
       [2.58581000e-01, 7.41257850e-01, 1.61153370e-04],
       [1.18832210e-08, 1.00000000e+00, 3.08196580e-09],
       [1.00000000e+00, 2.47841

In [65]:
ovo5 = rocauc_ovo(y_test_over, y_prob5, average="macro", multi_class="ovo")
ovo5

0.9940998357028891

In [66]:
ovr5 = rocauc_ovr(y_test_over, y_prob5, average="macro", multi_class="ovr")
ovr5

0.9940998357028891

In [67]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=678,
                                                    stratify=y_over)

In [68]:
dat6 = pd.DataFrame(X_test_over[:,0])
dat6['test'] = y_test_over

In [69]:
dat6

Unnamed: 0,0,test
0,120335,0
1,NRS168,1
2,NRS202,0
3,NRS109,2
4,BCH-SA-01,0
...,...,...
191,NRS272,0
192,NRS112,1
193,NRS064,1
194,BCH-SA-04,0


In [70]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [83]:
model1_over6 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [84]:
model1_over6.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [85]:
model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a35cb6438>

In [92]:
acc_test_over6 = model1_over6.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over6*100))

over-sampling test accuracy: 97.96%


In [86]:
pred6 = model1_over6.predict_classes(X_test_over)
pred6

array([0, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 2, 0, 0, 0, 2, 2, 2, 1, 2, 0,
       0, 1, 0, 0, 0, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 1, 2, 1, 1,
       2, 2, 2, 0, 1, 0, 1, 2, 0, 0, 1, 0, 1, 0, 2, 0, 2, 1, 1, 0, 0, 1,
       1, 2, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 0, 0, 1, 0,
       1, 2, 1, 1, 2, 1, 1, 0, 0, 2, 0, 2, 1, 1, 2, 2, 1, 1, 1, 1, 0, 2,
       2, 1, 0, 1, 2, 0, 0, 0, 2, 1, 2, 1, 1, 2, 2, 0, 0, 2, 2, 0, 1, 0,
       1, 1, 0, 2, 0, 2, 1, 0, 2, 0, 2, 1, 2, 1, 2, 1, 0, 2, 0, 2, 2, 2,
       2, 2, 0, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 2, 1, 0, 1, 1,
       1, 2, 2, 2, 1, 2, 0, 2, 1, 1, 2, 2, 1, 2, 1, 0, 1, 1, 0, 2])

In [87]:
dat6['pred'] = pred6
dat6

Unnamed: 0,0,test,pred
0,120335,0,0
1,NRS168,1,1
2,NRS202,0,0
3,NRS109,2,2
4,BCH-SA-01,0,0
...,...,...,...
191,NRS272,0,0
192,NRS112,1,1
193,NRS064,1,1
194,BCH-SA-04,0,0


In [88]:
proba6 = model1_over6.predict_proba(X_test_over)
dat_proba6 = pd.DataFrame(proba6)

In [89]:
dat_proba6

Unnamed: 0,0,1,2
0,9.998709e-01,1.055906e-04,2.351972e-05
1,7.056495e-08,9.999998e-01,7.416148e-08
2,9.825587e-01,1.741214e-02,2.915574e-05
3,7.267313e-08,5.286402e-08,9.999999e-01
4,9.999878e-01,1.192644e-05,1.807559e-07
...,...,...,...
191,1.000000e+00,1.789710e-12,2.588786e-11
192,3.431651e-08,1.000000e+00,3.777323e-08
193,4.984497e-08,1.000000e+00,9.552954e-09
194,1.000000e+00,1.183396e-11,4.653354e-11


In [90]:
dat_proba6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba6.csv", index = False,
         header=None)

In [91]:
dat6.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/6pyop.csv", index = False,
         header=None)

In [96]:
hist1_over6 = model1_over6.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [97]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over6.history['accuracy'])*100))

over-sampling train accuracy: 79.98%


In [71]:
df_proba6 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [72]:
df_proba6

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,1.790400e-08,4.141849e-08,1.000000e+00
1,p002ykpresabsSTCC_qual,NRS386,1,1,5.739934e-04,9.994259e-01,6.773014e-08
2,p002ykpresabsSTCC_qual,NRS148,2,2,5.286934e-09,1.269109e-08,1.000000e+00
3,p002ykpresabsSTCC_qual,NRS178,0,1,6.494936e-12,1.000000e+00,2.537080e-25
4,p002ykpresabsSTCC_qual,NRS237,0,1,5.701098e-02,9.399204e-01,3.068583e-03
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS272,0,0,9.999607e-01,3.367024e-05,5.776848e-06
1978,pyopresabsSTCC_qual,NRS112,1,1,8.275442e-08,9.999999e-01,3.739556e-09
1979,pyopresabsSTCC_qual,NRS064,1,1,2.168245e-08,1.000000e+00,9.603962e-09
1980,pyopresabsSTCC_qual,BCH-SA-04,0,0,1.000000e+00,1.026408e-15,1.630406e-14


In [73]:
y_prob6 = df_proba6[df_proba6['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob6 = y_prob6.to_numpy()
y_prob6

array([[9.99870900e-01, 1.05590580e-04, 2.35197190e-05],
       [7.05649500e-08, 9.99999760e-01, 7.41614800e-08],
       [9.82558700e-01, 1.74121430e-02, 2.91557440e-05],
       [7.26731300e-08, 5.28640200e-08, 9.99999900e-01],
       [9.99987840e-01, 1.19264410e-05, 1.80755950e-07],
       [7.26731300e-08, 5.28640200e-08, 9.99999900e-01],
       [1.00000000e+00, 1.38063510e-16, 4.48793330e-15],
       [1.59947750e-07, 9.99999760e-01, 1.24296990e-07],
       [4.33853600e-06, 9.99993560e-01, 2.17785280e-06],
       [1.54564190e-06, 9.99998450e-01, 2.13267640e-08],
       [7.26731300e-08, 5.28640200e-08, 9.99999900e-01],
       [3.31455740e-07, 9.99999640e-01, 4.21992300e-08],
       [9.23167500e-08, 9.78539500e-08, 9.99999760e-01],
       [1.00000000e+00, 4.00905900e-13, 2.77484050e-12],
       [1.00000000e+00, 7.45279300e-11, 3.95747130e-10],
       [1.00000000e+00, 2.22623800e-09, 7.87326900e-10],
       [9.23167500e-08, 9.78539500e-08, 9.99999760e-01],
       [9.23167500e-08, 9.78539

In [74]:
ovo6 = rocauc_ovo(y_test_over, y_prob6, average="macro", multi_class="ovo")
ovo6

1.0

In [75]:
ovr6 = rocauc_ovr(y_test_over, y_prob6, average="macro", multi_class="ovr")
ovr6

1.0

In [76]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=789,
                                                    stratify=y_over)

In [77]:
dat7 = pd.DataFrame(X_test_over[:,0])
dat7['test'] = y_test_over

In [78]:
dat7

Unnamed: 0,0,test
0,NRS253,1
1,NRS148,2
2,NRS105,1
3,NRS265,1
4,NRS211,0
...,...,...
191,NRS035,0
192,NRS260,1
193,CA9,0
194,NRS183,1


In [79]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [96]:
model1_over7 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [97]:
model1_over7.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36488278>

In [105]:
acc_test_over7 = model1_over7.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over7*100))

over-sampling test accuracy: 95.41%


In [99]:
pred7 = model1_over7.predict_classes(X_test_over)
pred7

array([1, 2, 1, 1, 0, 2, 0, 1, 2, 1, 0, 0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0,
       0, 0, 2, 1, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 2, 0, 1, 0, 2, 1, 1, 2,
       2, 0, 1, 2, 2, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 0, 0, 0, 2, 2, 2, 1,
       1, 1, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 1, 0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 0, 1, 0, 0,
       2, 2, 1, 2, 0, 0, 2, 2, 2, 0, 1, 1, 2, 2, 0, 0, 1, 0, 2, 0, 1, 2,
       1, 2, 1, 0, 1, 0, 2, 0, 0, 2, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0,
       0, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2,
       2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 0, 1, 2])

In [100]:
dat7['pred'] = pred7
dat7

Unnamed: 0,0,test,pred
0,NRS253,1,1
1,NRS148,2,2
2,NRS105,1,1
3,NRS265,1,1
4,NRS211,0,0
...,...,...,...
191,NRS035,0,0
192,NRS260,1,1
193,CA9,0,0
194,NRS183,1,1


In [101]:
proba7 = model1_over7.predict_proba(X_test_over)
dat_proba7 = pd.DataFrame(proba7)

In [102]:
dat_proba7

Unnamed: 0,0,1,2
0,1.213230e-08,1.000000e+00,1.219015e-08
1,8.214332e-08,6.891100e-08,9.999999e-01
2,2.556513e-08,1.000000e+00,5.688339e-08
3,3.214448e-08,1.000000e+00,5.910719e-08
4,9.998562e-01,1.211750e-04,2.248427e-05
...,...,...,...
191,9.999982e-01,1.602887e-06,2.507971e-07
192,3.809193e-08,1.000000e+00,1.845216e-08
193,1.000000e+00,1.470798e-08,2.432653e-08
194,6.648909e-08,9.999999e-01,3.424112e-08


In [103]:
dat_proba7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba7.csv", index = False,
         header=None)

In [104]:
dat7.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/7pyop.csv", index = False,
         header=None)

In [109]:
hist1_over7 = model1_over7.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [110]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over7.history['accuracy'])*100))

over-sampling train accuracy: 80.49%


In [80]:
df_proba7 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [81]:
df_proba7

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
1,p002ykpresabsSTCC_qual,BCH-SA-09,1,1,1.137139e-06,9.999988e-01,2.067601e-09
2,p002ykpresabsSTCC_qual,NRS224,0,0,1.000000e+00,2.093110e-31,0.000000e+00
3,p002ykpresabsSTCC_qual,NRS209,2,2,8.300497e-12,1.036520e-09,1.000000e+00
4,p002ykpresabsSTCC_qual,NRS235,1,1,2.243513e-02,9.774035e-01,1.615106e-04
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS035,0,0,9.354528e-01,6.414209e-02,4.051121e-04
1978,pyopresabsSTCC_qual,NRS260,1,1,4.808470e-08,1.000000e+00,7.364639e-09
1979,pyopresabsSTCC_qual,CA9,0,0,1.000000e+00,2.361323e-08,2.871247e-08
1980,pyopresabsSTCC_qual,NRS183,1,1,2.755864e-07,9.999998e-01,5.310879e-08


In [82]:
y_prob7 = df_proba7[df_proba7['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob7 = y_prob7.to_numpy()
y_prob7

array([[1.21323050e-08, 1.00000000e+00, 1.21901550e-08],
       [8.21433160e-08, 6.89110000e-08, 9.99999900e-01],
       [2.55651290e-08, 1.00000000e+00, 5.68833900e-08],
       [3.21444800e-08, 1.00000000e+00, 5.91071900e-08],
       [9.99856230e-01, 1.21174970e-04, 2.24842700e-05],
       [5.81974700e-08, 6.16394900e-08, 9.99999900e-01],
       [1.00000000e+00, 3.65324000e-11, 3.04184140e-11],
       [8.95153800e-08, 9.99999760e-01, 1.22073530e-07],
       [5.38301240e-08, 5.96405700e-08, 9.99999900e-01],
       [6.42920060e-02, 9.35570360e-01, 1.37649680e-04],
       [9.91371150e-01, 8.62499300e-03, 3.88851370e-06],
       [1.00000000e+00, 1.94729400e-08, 1.11430440e-08],
       [9.99919800e-01, 7.57107550e-05, 4.57944800e-06],
       [6.64890860e-08, 9.99999900e-01, 3.42411220e-08],
       [9.63194100e-08, 9.99999760e-01, 7.43851700e-08],
       [9.44630800e-01, 5.48884650e-02, 4.80774180e-04],
       [8.21433160e-08, 6.89110000e-08, 9.99999900e-01],
       [3.62281530e-05, 9.99963

In [83]:
ovo7 = rocauc_ovo(y_test_over, y_prob7, average="macro", multi_class="ovo")
ovo7

0.9980035231943628

In [85]:
ovr7 = rocauc_ovr(y_test_over, y_prob7, average="macro", multi_class="ovr")
ovr7

0.9980035231943628

In [86]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=890,
                                                    stratify=y_over)

In [87]:
dat8 = pd.DataFrame(X_test_over[:,0])
dat8['test'] = y_test_over

In [88]:
dat8

Unnamed: 0,0,test
0,NRS205,2
1,NRS109,2
2,CFBREBSa131,0
3,NRS148,2
4,NRS177,1
...,...,...
191,NRS205,2
192,CFBREBSa122,0
193,NRS001,1
194,NRS148,2


In [89]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [90]:
model1_over8 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [110]:
model1_over8.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a36dceef0>

In [118]:
acc_test_over8 = model1_over8.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over8*100))

over-sampling test accuracy: 97.96%


In [112]:
pred8 = model1_over8.predict_classes(X_test_over)
pred8

array([2, 2, 0, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 2, 1, 1, 2, 1, 0, 1, 2,
       0, 1, 1, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 1,
       0, 1, 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 2, 2, 1, 1, 1, 2, 0, 1,
       2, 0, 0, 1, 0, 1, 1, 2, 0, 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 2,
       0, 1, 1, 2, 0, 1, 2, 0, 0, 1, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1,
       2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 1, 1, 0, 2, 2, 1, 2, 1, 0, 1, 1,
       0, 1, 1, 2, 0, 0, 2, 1, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0,
       0, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 2, 1, 0, 0, 1, 2, 1, 0, 0,
       1, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 0, 0, 2, 0, 2, 0, 1, 2, 1])

In [113]:
dat8['pred'] = pred8
dat8

Unnamed: 0,0,test,pred
0,NRS205,2,2
1,NRS109,2,2
2,CFBREBSa131,0,0
3,NRS148,2,2
4,NRS177,1,1
...,...,...,...
191,NRS205,2,2
192,CFBREBSa122,0,0
193,NRS001,1,1
194,NRS148,2,2


In [114]:
proba8 = model1_over8.predict_proba(X_test_over)
dat_proba8 = pd.DataFrame(proba8)

In [115]:
dat_proba8

Unnamed: 0,0,1,2
0,5.425449e-08,5.280386e-08,9.999999e-01
1,4.110887e-08,6.513918e-08,9.999999e-01
2,6.937832e-01,2.684859e-05,3.061899e-01
3,4.660395e-08,5.000589e-08,9.999999e-01
4,6.728791e-07,9.999993e-01,2.921243e-08
...,...,...,...
191,5.425449e-08,5.280386e-08,9.999999e-01
192,9.999980e-01,9.470651e-07,1.131866e-06
193,6.362179e-07,9.999994e-01,3.903229e-08
194,4.660395e-08,5.000589e-08,9.999999e-01


In [116]:
dat_proba8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba8.csv", index = False,
         header=None)

In [117]:
dat8.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/8pyop.csv", index = False,
         header=None)

In [122]:
hist1_over8 = model1_over8.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [123]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over8.history['accuracy'])*100))

over-sampling train accuracy: 80.10%


In [91]:
df_proba8 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_regularizor_dropout_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [92]:
df_proba8

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabsSTCC_qual,CFBREBSa116,0,0,9.676203e-01,3.237956e-02,1.480166e-07
1,p002ykpresabsSTCC_qual,NRS214,0,0,1.000000e+00,6.534852e-11,2.250731e-18
2,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
3,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
4,p002ykpresabsSTCC_qual,NRS148,2,2,3.948571e-11,2.839096e-07,9.999998e-01
...,...,...,...,...,...,...,...
1977,pyopresabsSTCC_qual,NRS205,2,2,3.691095e-08,3.571927e-08,9.999999e-01
1978,pyopresabsSTCC_qual,CFBREBSa122,0,1,9.261665e-02,9.073822e-01,1.162373e-06
1979,pyopresabsSTCC_qual,NRS001,1,1,4.174278e-07,9.999995e-01,3.254024e-09
1980,pyopresabsSTCC_qual,NRS148,2,2,3.234670e-08,3.121212e-08,9.999999e-01


In [93]:
y_prob8 = df_proba8[df_proba8['phage']=='pyopresabs_qual'].iloc[:,-3:]
y_prob8 = y_prob8.to_numpy()
y_prob8

array([[5.42544920e-08, 5.28038600e-08, 9.99999900e-01],
       [4.11088660e-08, 6.51391760e-08, 9.99999900e-01],
       [6.93783200e-01, 2.68485950e-05, 3.06189860e-01],
       [4.66039500e-08, 5.00058950e-08, 9.99999900e-01],
       [6.72879100e-07, 9.99999300e-01, 2.92124260e-08],
       [3.72345200e-08, 1.00000000e+00, 5.79937020e-08],
       [1.02757380e-03, 9.98969100e-01, 3.37555460e-06],
       [5.05107900e-07, 9.99999400e-01, 7.77220100e-08],
       [2.62808740e-01, 7.36330300e-01, 8.61036200e-04],
       [5.42544920e-08, 5.28038600e-08, 9.99999900e-01],
       [4.11088660e-08, 6.51391760e-08, 9.99999900e-01],
       [2.07309320e-08, 1.00000000e+00, 9.41759400e-09],
       [6.36217900e-07, 9.99999400e-01, 3.90322870e-08],
       [1.00000000e+00, 2.34482920e-12, 1.59445100e-12],
       [4.89416080e-08, 5.46391500e-08, 9.99999900e-01],
       [3.89802320e-08, 1.00000000e+00, 3.86282300e-08],
       [3.71373500e-08, 1.00000000e+00, 2.03168080e-09],
       [4.66039500e-08, 5.00058

In [94]:
ovo8 = rocauc_ovo(y_test_over, y_prob8, average="macro", multi_class="ovo")
ovo8

0.9996868271677432

In [95]:
ovr8 = rocauc_ovr(y_test_over, y_prob8, average="macro", multi_class="ovr")
ovr8

0.9996868271677432

In [96]:
ovos2 = [ovo5, ovo6, ovo7, ovo8]
np.mean(ovos2)

0.9979475465162487

In [97]:
np.std(ovos2)

0.002347641070274753

In [98]:
ovrs2 = [ovr5, ovr6, ovr7, ovr8]
np.mean(ovrs2)

0.9979475465162487

In [99]:
np.std(ovrs2)

0.002347641070274753

In [124]:
accs_reg = [acc_test_over5, acc_test_over6, acc_test_over7, acc_test_over8]

In [125]:
mean_reg = np.mean(accs_reg)
print('over-sampling test accuracy regularization mean: %.2f%%' % (mean_reg*100))

over-sampling test accuracy regularization mean: 96.94%


In [126]:
std_reg = np.std(accs_reg)
print('over-sampling test accuracy regularization standard deviation:', std_reg)

over-sampling test accuracy regularization standard deviation: 0.010823057816506523


In [127]:
accs_train_reg = [np.mean(hist1_over5.history['accuracy']), np.mean(hist1_over6.history['accuracy']), np.mean(hist1_over7.history['accuracy']),
             np.mean(hist1_over8.history['accuracy'])]

In [128]:
mean_train_reg = np.mean(accs_train_reg)
print('over-sampling train accuracy regularization mean: %.2f%%' % (mean_train_reg*100))

over-sampling train accuracy regularization mean: 80.25%


In [129]:
std_train_reg = np.std(accs_train_reg)
print('over-sampling train accuracy regularization standard deviation:', std_train_reg)

over-sampling train accuracy regularization standard deviation: 0.0021164876
