In [None]:
## This file implements neural networks for pyokpresabs_qual with four replicates.
## We compute the mean and standarad deviation of training and test accuracies.
## We also compute the mean and standard deviation of AUC ROC values for four replicates.

In [1]:
from numpy.random import seed
import numpy as np
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [2]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/pyokpresabs_qual.csv')
df.shape

(253, 16)

In [3]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [4]:
df['pheno']

0      0
1      0
2      0
3      0
4      0
      ..
248    0
249    0
250    0
251    0
252    0
Name: pheno, Length: 253, dtype: int64

In [5]:
df.head()

Unnamed: 0,id,TTTTCCCCCAT,TTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTTC,TGGGTCTGAC,TCCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGT,TATATAGACTG,TAGTCGCACT,TAAGAATAATATATTAAATATTTATTAACAAATTATAGATAAAATATGAATAATTAATTAATGGTATTTACATATTCATAACC,GGGCTGAGG,GAGCAACCTT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGTAATTTGAAAGTATCGTTTGATTATATAGATTGGAT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGTAATTTGAAAGTATCGTTTGATTATATAGATTGGATTAAATTAGGTTTTGGTCCATCAG,CCTTGTTGCGG,CCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTT,pheno
0,107,0,1,1,1,1,0,1,0,0,1,1,1,0,1,0
1,109,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0
2,115,0,0,0,0,1,1,1,1,1,0,0,0,1,0,0
3,120335,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0
4,120337,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0


In [6]:
df['pheno'].value_counts()

0    217
1     32
2      4
Name: pheno, dtype: int64

In [7]:
df_clean = df.drop(columns=['id'])

In [8]:
df_clean.shape

(253, 15)

In [9]:
df_clean.head()

Unnamed: 0,TTTTCCCCCAT,TTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTTC,TGGGTCTGAC,TCCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGT,TATATAGACTG,TAGTCGCACT,TAAGAATAATATATTAAATATTTATTAACAAATTATAGATAAAATATGAATAATTAATTAATGGTATTTACATATTCATAACC,GGGCTGAGG,GAGCAACCTT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGTAATTTGAAAGTATCGTTTGATTATATAGATTGGAT,GAACCATGGACATCATGTGAATTTGATTTTACAAGAGAGGGTAATTTGAAAGTATCGTTTGATTATATAGATTGGATTAAATTAGGTTTTGGTCCATCAG,CCTTGTTGCGG,CCTGATGGACCAAAACCTAATTTAATCCAATCTATATAATCAAACGATACTTTCAAATTACCCTCTCTTGTAAAATCAAATTCACATGATGTCCATGGTT,pheno
0,0,1,1,1,1,0,1,0,0,1,1,1,0,1,0
1,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0
2,0,0,0,0,1,1,1,1,1,0,0,0,1,0,0
3,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0
4,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0


In [10]:
X = df.loc[:, df.columns != 'pheno']
y = df['pheno']
print(X.shape, y.shape)

(253, 15) (253,)


In [11]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

Using TensorFlow backend.


[(0, 217), (1, 217), (2, 217)]




In [12]:
############# Fully-Connected Neural Network ################

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1

In [14]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [15]:
dat = pd.DataFrame(X_test_over[:,0])
dat['test'] = y_test_over

In [16]:
dat

Unnamed: 0,0,test
0,CFBRSa07,0
1,CFBRSa66A,0
2,NRS112,1
3,NRS211,0
4,CFBRSa22,0
...,...,...
191,NRS148,2
192,NRS255,2
193,NRS205,2
194,NRS255,2


In [17]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [18]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [19]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.callbacks.History at 0x1a3ded9198>

In [34]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 88.27%


In [21]:
pred = model1_over.predict_classes(X_test_over)
pred

array([1, 0, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 2, 2, 2, 2, 1, 1, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 1,
       0, 1, 2, 1, 0, 1, 2, 2, 2, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 0,
       0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 1, 1, 2, 0, 0, 0, 1, 2, 0, 0, 2, 1,
       0, 2, 0, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 0, 2, 1, 0, 1,
       0, 2, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 1, 0, 0, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 2, 2, 2, 2,
       2, 1, 2, 1, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2])

In [22]:
dat['pred'] = pred
dat

Unnamed: 0,0,test,pred
0,CFBRSa07,0,1
1,CFBRSa66A,0,0
2,NRS112,1,1
3,NRS211,0,0
4,CFBRSa22,0,0
...,...,...,...
191,NRS148,2,2
192,NRS255,2,2
193,NRS205,2,2
194,NRS255,2,2


In [24]:
proba1 = model1_over.predict_proba(X_test_over)
dat_proba1 = pd.DataFrame(proba1)

In [25]:
dat_proba1

Unnamed: 0,0,1,2
0,0.265483,0.734014,0.000503
1,0.858843,0.141141,0.000016
2,0.003340,0.924840,0.071820
3,0.585653,0.410576,0.003771
4,0.858843,0.141141,0.000016
...,...,...,...
191,0.002040,0.051028,0.946932
192,0.020451,0.027734,0.951815
193,0.002040,0.051028,0.946932
194,0.020451,0.027734,0.951815


In [26]:
dat_proba1.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba1.csv", index = False,
         header=None)

In [27]:
dat.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/1pyo.csv", index = False,
         header=None)

In [38]:
hist1_over = model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [39]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over.history['accuracy'])*100))

over-sampling train accuracy: 85.69%


In [18]:
df_proba = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=0,
                        index_col=None)

In [19]:
df_proba

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,CFBRSa26,0,0,0.758914,0.241086,4.638713e-07
1,p002ykpresabs_qual,NRS109,2,2,0.005361,0.016236,9.784034e-01
2,p002ykpresabs_qual,NRS112,0,0,0.726623,0.273376,1.520979e-06
3,p002ykpresabs_qual,NRS216,1,1,0.138322,0.861665,1.334123e-05
4,p002ykpresabs_qual,NRS021,0,0,0.882176,0.117824,1.414530e-10
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS148,2,2,0.000007,0.000099,9.998934e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01
4281,pyopresabsSTCC_qual,NRS205,2,2,0.000011,0.000045,9.999435e-01
4282,pyopresabsSTCC_qual,NRS255,2,2,0.000257,0.002048,9.976944e-01


In [20]:
y_prob = df_proba[df_proba['phage']=='pyokpresabs_qual'].iloc[:,-3:]
y_prob = y_prob.to_numpy()
y_prob

array([[2.65482600e-01, 7.34014000e-01, 5.03491700e-04],
       [8.58843450e-01, 1.41141010e-01, 1.55746110e-05],
       [3.34019240e-03, 9.24840030e-01, 7.18197100e-02],
       [5.85653000e-01, 4.10576280e-01, 3.77069250e-03],
       [8.58843450e-01, 1.41141010e-01, 1.55746110e-05],
       [3.08219970e-01, 6.90642700e-01, 1.13732110e-03],
       [6.99526250e-01, 2.90807460e-01, 9.66635400e-03],
       [6.99526250e-01, 2.90807460e-01, 9.66635400e-03],
       [2.03958320e-03, 5.10278750e-02, 9.46932500e-01],
       [2.65482600e-01, 7.34014000e-01, 5.03491700e-04],
       [8.58843450e-01, 1.41141010e-01, 1.55746110e-05],
       [8.58843450e-01, 1.41141010e-01, 1.55746110e-05],
       [1.36437710e-01, 8.58879500e-01, 4.68287150e-03],
       [2.03958320e-03, 5.10278750e-02, 9.46932500e-01],
       [6.99526250e-01, 2.90807460e-01, 9.66635400e-03],
       [8.58843450e-01, 1.41141010e-01, 1.55746110e-05],
       [3.84846330e-01, 6.15148500e-01, 5.24299200e-06],
       [8.58843450e-01, 1.41141

In [21]:
## Retrieved from https://github.com/scikit-learn/scikit-learn/issues/3298
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def rocauc_ovo(truth, pred, average="macro", multi_class="ovo"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   
    
    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [22]:
ovo1 = rocauc_ovo(y_test_over, y_prob, average="macro", multi_class="ovo")
ovo1

0.9421804658445879

In [23]:
def rocauc_ovr(truth, pred, average="macro", multi_class="ovr"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)   

    return roc_auc_score(truth, pred, average=average, multi_class=multi_class)

In [24]:
ovr1 = rocauc_ovr(y_test_over, y_prob, average="macro", multi_class="ovr")
ovr1

0.9421804658445879

In [25]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=234,
                                                    stratify=y_over)

In [26]:
dat2 = pd.DataFrame(X_test_over[:,0])
dat2['test'] = y_test_over

In [27]:
dat2

Unnamed: 0,0,test
0,BCH-SA-04,0
1,NRS110,1
2,NRS109,2
3,NRS183,1
4,BCH-SA-05,0
...,...,...
191,NRS112,1
192,SR1065,0
193,NRS203,0
194,CFBREBSa129,0


In [28]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [29]:
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [33]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [34]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.callbacks.History at 0x1a3e695160>

In [67]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 85.71%


In [35]:
pred2 = model1_over2.predict_classes(X_test_over)
pred2

array([0, 1, 2, 0, 0, 0, 2, 2, 2, 2, 1, 0, 0, 2, 0, 0, 2, 1, 2, 1, 2, 0,
       2, 1, 0, 1, 0, 0, 2, 2, 2, 2, 1, 0, 0, 2, 0, 2, 1, 1, 1, 0, 1, 2,
       1, 2, 0, 1, 2, 1, 2, 0, 1, 0, 0, 2, 2, 1, 0, 1, 2, 0, 1, 1, 1, 0,
       0, 2, 0, 2, 1, 2, 2, 0, 1, 0, 0, 1, 0, 1, 2, 1, 1, 2, 0, 1, 1, 1,
       2, 1, 1, 0, 1, 1, 0, 2, 1, 2, 0, 1, 2, 0, 2, 1, 0, 0, 0, 1, 2, 2,
       0, 2, 0, 2, 1, 2, 1, 2, 1, 2, 0, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 2,
       0, 2, 1, 2, 2, 1, 1, 2, 1, 0, 1, 2, 2, 0, 0, 0, 0, 2, 2, 2, 1, 1,
       0, 2, 1, 2, 1, 0, 2, 1, 2, 1, 0, 1, 2, 1, 2, 0, 1, 0, 0, 2, 0, 2,
       1, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 1, 2, 0, 2, 1, 0, 0, 0, 0])

In [36]:
dat2['pred'] = pred2
dat2

Unnamed: 0,0,test,pred
0,BCH-SA-04,0,0
1,NRS110,1,1
2,NRS109,2,2
3,NRS183,1,0
4,BCH-SA-05,0,0
...,...,...,...
191,NRS112,1,1
192,SR1065,0,0
193,NRS203,0,0
194,CFBREBSa129,0,0


In [37]:
proba2 = model1_over2.predict_proba(X_test_over)
dat_proba2 = pd.DataFrame(proba2)

In [38]:
dat_proba2

Unnamed: 0,0,1,2
0,0.744439,0.253666,0.001895
1,0.005637,0.992833,0.001530
2,0.023216,0.070600,0.906184
3,0.778771,0.219659,0.001570
4,0.831228,0.168737,0.000035
...,...,...,...
191,0.001488,0.967119,0.031393
192,0.831228,0.168737,0.000035
193,0.617932,0.382009,0.000060
194,0.744439,0.253666,0.001895


In [39]:
dat_proba2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba2.csv", index = False,
         header=None)

In [40]:
dat2.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/2pyo.csv", index = False,
         header=None)

In [71]:
hist1_over2 = model1_over2.fit(X_train_over, y_train_over,
          batch_size=64, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [72]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over2.history['accuracy'])*100))

over-sampling train accuracy: 87.01%


In [30]:
df_proba2 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=1,
                        index_col=None)

In [31]:
df_proba2

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS148,2,2,0.000056,1.748042e-03,9.981960e-01
1,p002ykpresabs_qual,BCH-SA-03,1,0,0.712007,2.879924e-01,9.646217e-07
2,p002ykpresabs_qual,NRS218,1,1,0.006222,9.937732e-01,4.482882e-06
3,p002ykpresabs_qual,NRS036,0,0,0.882617,1.173831e-01,2.310933e-10
4,p002ykpresabs_qual,NRS386,1,0,0.571179,4.288184e-01,2.444667e-06
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS112,1,1,0.001860,9.979747e-01,1.653396e-04
4280,pyopresabsSTCC_qual,SR1065,0,0,0.982940,1.705227e-02,7.349168e-06
4281,pyopresabsSTCC_qual,NRS203,0,0,0.997093,1.962516e-03,9.441347e-04
4282,pyopresabsSTCC_qual,CFBREBSa129,0,0,1.000000,3.031141e-13,3.208205e-09


In [32]:
y_prob2 = df_proba2[df_proba2['phage']=='pyokpresabs_qual'].iloc[:,-3:]
y_prob2 = y_prob2.to_numpy()
y_prob2

array([[7.4443877e-01, 2.5366583e-01, 1.8954171e-03],
       [5.6372737e-03, 9.9283326e-01, 1.5295038e-03],
       [2.3215990e-02, 7.0600264e-02, 9.0618370e-01],
       [7.7877074e-01, 2.1965879e-01, 1.5703891e-03],
       [8.3122814e-01, 1.6873695e-01, 3.4841340e-05],
       [7.4443877e-01, 2.5366583e-01, 1.8954171e-03],
       [5.4060670e-03, 1.8711940e-02, 9.7588193e-01],
       [3.1530153e-04, 7.6636076e-02, 9.2304870e-01],
       [2.3215990e-02, 7.0600264e-02, 9.0618370e-01],
       [3.1530153e-04, 7.6636076e-02, 9.2304870e-01],
       [9.5427510e-02, 9.0257627e-01, 1.9961866e-03],
       [8.4118134e-01, 1.5863627e-01, 1.8238454e-04],
       [7.4443877e-01, 2.5366583e-01, 1.8954171e-03],
       [2.3215990e-02, 7.0600264e-02, 9.0618370e-01],
       [6.8781220e-01, 3.1206520e-01, 1.2260700e-04],
       [7.7877074e-01, 2.1965879e-01, 1.5703891e-03],
       [3.1530153e-04, 7.6636076e-02, 9.2304870e-01],
       [9.5427510e-02, 9.0257627e-01, 1.9961866e-03],
       [5.4060670e-03, 1.871

In [33]:
ovo2 = rocauc_ovo(y_test_over, y_prob2, average="macro", multi_class="ovo")
ovo2

0.9405159937602686

In [34]:
ovr2 = rocauc_ovr(y_test_over, y_prob2, average="macro", multi_class="ovr")
ovr2

0.9405159937602686

In [35]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=345,
                                                    stratify=y_over)

In [36]:
dat3 = pd.DataFrame(X_test_over[:,0])
dat3['test'] = y_test_over

In [37]:
dat3

Unnamed: 0,0,test
0,NRS168,1
1,NRS383,1
2,NRS148,2
3,NRS109,2
4,NRS213,0
...,...,...
191,NRS255,2
192,NRS255,2
193,NRS266,1
194,NRS001,1


In [38]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [45]:
model1_over3 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

In [46]:
model1_over3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [47]:
model1_over3.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3eca20f0>

In [92]:
acc_test_over3 = model1_over3.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over3*100))

over-sampling test accuracy: 83.16%


In [48]:
pred3 = model1_over3.predict_classes(X_test_over)
pred3

array([1, 1, 2, 2, 0, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 1, 0, 0, 2, 2, 2, 1,
       0, 2, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 1, 1, 1, 2, 1, 2, 0, 2, 0, 2,
       0, 1, 2, 1, 1, 2, 0, 2, 1, 0, 0, 2, 0, 0, 2, 1, 2, 1, 0, 0, 2, 1,
       1, 0, 2, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0,
       2, 1, 2, 0, 0, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 2, 0, 0, 2, 1, 2, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 0, 2, 0, 1, 0, 1, 0, 2, 2, 0,
       1, 2, 2, 2, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 1, 2, 0, 2, 2, 0, 2, 0,
       0, 1, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 2,
       2, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1])

In [49]:
dat3['pred'] = pred3
dat3

Unnamed: 0,0,test,pred
0,NRS168,1,1
1,NRS383,1,1
2,NRS148,2,2
3,NRS109,2,2
4,NRS213,0,0
...,...,...,...
191,NRS255,2,2
192,NRS255,2,2
193,NRS266,1,1
194,NRS001,1,1


In [50]:
proba3 = model1_over3.predict_proba(X_test_over)
dat_proba3 = pd.DataFrame(proba3)

In [51]:
dat_proba3

Unnamed: 0,0,1,2
0,0.105407,0.894581,1.162835e-05
1,0.195538,0.804347,1.152475e-04
2,0.000019,0.072096,9.278851e-01
3,0.000013,0.104697,8.952906e-01
4,0.693035,0.306428,5.368003e-04
...,...,...,...
191,0.000057,0.001517,9.984261e-01
192,0.000057,0.001517,9.984261e-01
193,0.018125,0.981694,1.806314e-04
194,0.321893,0.678106,5.015443e-07


In [52]:
dat_proba3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba3.csv", index = False,
         header=None)

In [53]:
dat3.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/3pyo.csv", index = False,
         header=None)

In [96]:
hist1_over3 = model1_over3.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [97]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over3.history['accuracy'])*100))

over-sampling train accuracy: 88.11%


In [39]:
df_proba3 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=2,
                        index_col=None)

In [40]:
df_proba3

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
1,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
2,p002ykpresabs_qual,NRS222,0,0,0.851725,0.148269,5.980786e-06
3,p002ykpresabs_qual,NRS109,2,2,0.004477,0.013518,9.820048e-01
4,p002ykpresabs_qual,GA50245,0,0,0.812055,0.187945,1.161034e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4280,pyopresabsSTCC_qual,NRS255,2,2,0.000633,0.000928,9.984396e-01
4281,pyopresabsSTCC_qual,NRS266,1,1,0.025932,0.974061,7.323514e-06
4282,pyopresabsSTCC_qual,NRS001,1,1,0.000597,0.999403,3.675362e-10


In [41]:
y_prob3 = df_proba3[df_proba3['phage']=='pyokpresabs_qual'].iloc[:,-3:]
y_prob3 = y_prob3.to_numpy()
y_prob3

array([[1.05407440e-01, 8.94580960e-01, 1.16283490e-05],
       [1.95538030e-01, 8.04346700e-01, 1.15247545e-04],
       [1.91486090e-05, 7.20958300e-02, 9.27885060e-01],
       [1.27997320e-05, 1.04696600e-01, 8.95290600e-01],
       [6.93035200e-01, 3.06428000e-01, 5.36800300e-04],
       [8.91438000e-01, 1.08561660e-01, 3.39223020e-07],
       [7.03646800e-01, 2.95027900e-01, 1.32535990e-03],
       [1.27997320e-05, 1.04696600e-01, 8.95290600e-01],
       [3.21893270e-01, 6.78106300e-01, 5.01544300e-07],
       [8.91438000e-01, 1.08561660e-01, 3.39223020e-07],
       [6.93035200e-01, 3.06428000e-01, 5.36800300e-04],
       [1.27997320e-05, 1.04696600e-01, 8.95290600e-01],
       [1.81600250e-02, 9.81579840e-01, 2.60072000e-04],
       [7.03646800e-01, 2.95027900e-01, 1.32535990e-03],
       [7.51712400e-01, 2.48287440e-01, 1.67258620e-07],
       [1.05407440e-01, 8.94580960e-01, 1.16283490e-05],
       [8.91438000e-01, 1.08561660e-01, 3.39223020e-07],
       [9.57151200e-01, 4.28232

In [42]:
ovo3 = rocauc_ovo(y_test_over, y_prob3, average="macro", multi_class="ovo")
ovo3

0.9144387207364306

In [43]:
ovr3 = rocauc_ovr(y_test_over, y_prob3, average="macro", multi_class="ovr")
ovr3

0.9144387207364306

In [44]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=456,
                                                    stratify=y_over)

In [45]:
dat4 = pd.DataFrame(X_test_over[:,0])
dat4['test'] = y_test_over

In [46]:
dat4

Unnamed: 0,0,test
0,NRS178,1
1,NRS109,2
2,NRS073,1
3,CFBREBSa119,0
4,NRS109,2
...,...,...
191,NRS236,1
192,NRS029,0
193,NRS148,2
194,CFBRSa28,0


In [47]:
X_train_over = X_train_over[:,1:]
X_test_over = X_test_over[:,1:]

In [48]:
model1_over4 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [59]:
model1_over4.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [60]:
model1_over4.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3f155b38>

In [113]:
acc_test_over4 = model1_over4.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over4*100))

over-sampling test accuracy: 85.71%


In [61]:
pred4 = model1_over4.predict_classes(X_test_over)
pred4

array([1, 2, 0, 0, 2, 1, 1, 2, 2, 0, 2, 1, 1, 2, 0, 0, 0, 2, 2, 1, 2, 2,
       1, 1, 1, 0, 2, 2, 1, 1, 2, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 2, 1, 0,
       1, 1, 2, 2, 0, 0, 2, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 2, 0,
       0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 2, 1, 2, 1, 0, 1, 0,
       0, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 0, 2, 2, 0, 1, 1, 2, 0, 1,
       2, 2, 2, 0, 2, 2, 2, 1, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0,
       2, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2,
       1, 1, 1, 2, 2, 0, 0, 2, 1, 0, 0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 1, 2, 0, 2, 2, 1, 1, 2, 0, 2])

In [62]:
dat4['pred'] = pred4
dat4

Unnamed: 0,0,test,pred
0,NRS178,1,1
1,NRS109,2,2
2,NRS073,1,0
3,CFBREBSa119,0,0
4,NRS109,2,2
...,...,...,...
191,NRS236,1,1
192,NRS029,0,1
193,NRS148,2,2
194,CFBRSa28,0,0


In [63]:
proba4 = model1_over4.predict_proba(X_test_over)
dat_proba4 = pd.DataFrame(proba4)

In [64]:
dat_proba4

Unnamed: 0,0,1,2
0,0.012761,0.987239,1.959349e-07
1,0.027085,0.109052,8.638632e-01
2,0.792741,0.204852,2.406992e-03
3,0.830995,0.168971,3.317414e-05
4,0.027085,0.109052,8.638632e-01
...,...,...,...
191,0.152473,0.846993,5.343235e-04
192,0.206379,0.793612,8.724219e-06
193,0.002777,0.057920,9.393021e-01
194,0.830995,0.168971,3.317414e-05


In [65]:
dat_proba4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/proba4.csv", index = False,
         header=None)

In [66]:
dat4.to_csv("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/4pyo.csv", index = False,
         header=None)

In [117]:
hist1_over4 = model1_over4.fit(X_train_over, y_train_over,
          batch_size=16, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 455 samples, validate on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [118]:
print('over-sampling train accuracy: %.2f%%' % (np.mean(hist1_over4.history['accuracy'])*100))

over-sampling train accuracy: 86.54%


In [49]:
df_proba4 = pd.read_excel("/Users/Rebecca/Desktop/Claudia/neural network/new_phage_qual/dataset/NN_over_2.xlsx",
                        sheet_name=3,
                        index_col=None)

In [50]:
df_proba4

Unnamed: 0,phage,strain,phenotype,prediction,0,1,2
0,p002ykpresabs_qual,NRS110,1,1,0.000003,0.999997,5.870196e-13
1,p002ykpresabs_qual,NRS216,1,1,0.039254,0.960745,9.078969e-07
2,p002ykpresabs_qual,NRS386,1,1,0.326752,0.673248,1.061032e-07
3,p002ykpresabs_qual,CFBRSa25,0,0,0.611084,0.388916,7.664974e-07
4,p002ykpresabs_qual,BCH-SA-03,1,0,0.611084,0.388916,7.664974e-07
...,...,...,...,...,...,...,...
4279,pyopresabsSTCC_qual,NRS236,1,1,0.000052,0.999768,1.803156e-04
4280,pyopresabsSTCC_qual,NRS029,0,1,0.322350,0.677496,1.533154e-04
4281,pyopresabsSTCC_qual,NRS148,2,2,0.000006,0.000026,9.999682e-01
4282,pyopresabsSTCC_qual,CFBRSa28,0,0,0.999288,0.000176,5.361527e-04


In [51]:
y_prob4 = df_proba4[df_proba4['phage']=='pyokpresabs_qual'].iloc[:,-3:]
y_prob4 = y_prob4.to_numpy()
y_prob4

array([[1.2761022e-02, 9.8723870e-01, 1.9593487e-07],
       [2.7084640e-02, 1.0905208e-01, 8.6386320e-01],
       [7.9274064e-01, 2.0485236e-01, 2.4069923e-03],
       [8.3099530e-01, 1.6897143e-01, 3.3174143e-05],
       [2.7084640e-02, 1.0905208e-01, 8.6386320e-01],
       [7.2012990e-02, 9.2530360e-01, 2.6833606e-03],
       [6.3467050e-02, 7.4327767e-01, 1.9325529e-01],
       [2.7084640e-02, 1.0905208e-01, 8.6386320e-01],
       [2.7774759e-03, 5.7920440e-02, 9.3930210e-01],
       [8.3099530e-01, 1.6897143e-01, 3.3174143e-05],
       [2.7084640e-02, 1.0905208e-01, 8.6386320e-01],
       [7.2012990e-02, 9.2530360e-01, 2.6833606e-03],
       [3.0826753e-01, 6.9172376e-01, 8.7076450e-06],
       [2.7774759e-03, 5.7920440e-02, 9.3930210e-01],
       [8.3099530e-01, 1.6897143e-01, 3.3174143e-05],
       [7.9274064e-01, 2.0485236e-01, 2.4069923e-03],
       [8.3099530e-01, 1.6897143e-01, 3.3174143e-05],
       [2.7774759e-03, 5.7920440e-02, 9.3930210e-01],
       [6.8332140e-03, 2.027

In [52]:
ovo4 = rocauc_ovo(y_test_over, y_prob4, average="macro", multi_class="ovo")
ovo4

0.947655652235805

In [53]:
ovr4 = rocauc_ovr(y_test_over, y_prob4, average="macro", multi_class="ovr")
ovr4

0.947655652235805

In [54]:
ovos = [ovo1, ovo2, ovo3, ovo4]
np.mean(ovos)

0.936197708144273

In [55]:
np.std(ovos)

0.012837240709955034

In [56]:
ovrs = [ovr1, ovr2, ovr3, ovr4]
np.mean(ovrs)

0.936197708144273

In [57]:
np.std(ovrs)

0.012837240709955034

In [119]:
accs = [acc_test_over, acc_test_over2, acc_test_over3, acc_test_over4]

In [120]:
mean = np.mean(accs)
print('over-sampling test accuracy mean: %.2f%%' % (mean*100))

over-sampling test accuracy mean: 85.71%


In [121]:
std = np.std(accs)
print('over-sampling test accuracy standard deviation:', std)

over-sampling test accuracy standard deviation: 0.018038429694177537


In [122]:
accs_train = [np.mean(hist1_over.history['accuracy']), np.mean(hist1_over2.history['accuracy']), np.mean(hist1_over3.history['accuracy']),
             np.mean(hist1_over4.history['accuracy'])]

In [123]:
mean_train = np.mean(accs_train)
print('over-sampling train accuracy mean: %.2f%%' % (mean_train*100))

over-sampling train accuracy mean: 86.84%


In [124]:
std_train = np.std(accs_train)
print('over-sampling train accuracy standard deviation:', std_train)

over-sampling train accuracy standard deviation: 0.008767562
