In [1]:
## This file implements neural networks and random forest on p0017Spresabs_qual.
## Due to the imbalanced data and limited cases for class 2, we implement both over-sampling method.
## For fully-connected neural networks, the accuracy is 95.43% for over-sampling data and 93.40% with dropout and regularization.
## For random forest, the accuracy is 94.42% for over-sampling data.
## For random forest with cross-validation, the mean accuracy is 92.99% for over-sampling data.

In [2]:
from numpy.random import seed
seed(100)
import tensorflow
tensorflow.random.set_seed(123)

In [3]:
import pandas as pd

df = pd.read_csv('/Users/Rebecca/Desktop/Claudia/neural network/phage_qual/p0017Spresabs_qual.csv')
df.shape

(255, 146)

In [4]:
df.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [5]:
df['pheno']

0      0
1      0
2      1
3      0
4      0
5      0
6      0
7      0
8      1
9      0
10     0
11     1
12     0
13     0
14     0
15     0
16     1
17     1
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     1
      ..
225    0
226    0
227    1
228    0
229    1
230    0
231    0
232    0
233    0
234    0
235    0
236    0
237    0
238    0
239    0
240    0
241    0
242    0
243    0
244    1
245    0
246    0
247    0
248    0
249    0
250    1
251    0
252    0
253    0
254    0
Name: pheno, Length: 255, dtype: int64

In [6]:
df['pheno'].value_counts()

0    218
1     35
2      2
Name: pheno, dtype: int64

In [7]:
df.head()

Unnamed: 0,id,TTTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTT,TTTTCCAGTAAT,TTTTAATACATAT,TTTTAAATATTATAA,TTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTA,TTTATCTTTATGA,TTTAATTTAGTAAGT,TTTAAAAAGATGAATAATGTAAATGAAGTAAAGGTTATTATGAGAATTACAAAAGCTACATAAATTACTGTTAGTTTAAATTGAAATTTAAAAATGATAA,TTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTAC,...,group_2403,group_3458,group_3904,group_426,group_475,group_6375,group_7822,group_8071,group_8913,pheno
0,107,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,109,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,115,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,120335,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,120337,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_clean = df.drop(columns=['id'])

In [9]:
df_clean.shape

(255, 145)

In [10]:
df_clean.head()

Unnamed: 0,TTTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTT,TTTTCCAGTAAT,TTTTAATACATAT,TTTTAAATATTATAA,TTTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTA,TTTATCTTTATGA,TTTAATTTAGTAAGT,TTTAAAAAGATGAATAATGTAAATGAAGTAAAGGTTATTATGAGAATTACAAAAGCTACATAAATTACTGTTAGTTTAAATTGAAATTTAAAAATGATAA,TTGAAGCATTAAGATTACTTATCATTTTTAAATTTCAATTTAAACTAACAGTAATTTATGTAGCTTTTGTAATTCTCATAATAACCTTTACTTCATTTAC,TTCCATCGAATCAC,...,group_2403,group_3458,group_3904,group_426,group_475,group_6375,group_7822,group_8071,group_8913,pheno
0,1,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X = df_clean.loc[:, df_clean.columns != 'pheno'].values
y = df_clean['pheno'].values
print(X.shape, y.shape)

(255, 144) (255,)


In [12]:
# over-sampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
overS = RandomOverSampler(random_state=100)
X_over, y_over = overS.fit_resample(X, y)
print(sorted(Counter(y_over).items()))

[(0, 218), (1, 218), (2, 218)]


Using TensorFlow backend.


In [13]:
# split into train, test data (over)
from sklearn.model_selection import train_test_split
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y_over)

In [14]:
############# Fully-Connected Neural Network ################

In [15]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l1
import numpy as np

In [16]:
#### neural network on over-sampling data
model1_over = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],)),
    Dense(3, activation='softmax'),
])

In [17]:
model1_over.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [18]:
model1_over.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 457 samples, validate on 197 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3c85c128>

In [19]:
acc_test_over = model1_over.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over*100))

over-sampling test accuracy: 95.43%


In [20]:
#### add dropout and regularizer
model1_over2 = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_over.shape[1],), activity_regularizer=l1(0.001)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
    Dropout(0.2, ),
])

In [21]:
model1_over2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
model1_over2.fit(X_train_over, y_train_over,
          batch_size=32, epochs=100,
          validation_data=(X_test_over, y_test_over))

Train on 457 samples, validate on 197 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a3d0153c8>

In [23]:
acc_test_over2 = model1_over2.evaluate(X_test_over, y_test_over)[1]
print('over-sampling test accuracy: %.2f%%' % (acc_test_over2*100))

over-sampling test accuracy: 93.40%


In [24]:
############## Random Forest ##############

In [25]:
###### random forest on over-sampling data
from sklearn.ensemble import RandomForestClassifier

rf_over = RandomForestClassifier(n_estimators=100, random_state=123)
rf_over.fit(X_train_over,y_train_over)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [26]:
from sklearn.metrics import accuracy_score
y_pred_over = rf_over.predict(X_test_over)
print('over-sampling test accuracy: %.2f%%' % (accuracy_score(y_test_over, y_pred_over)*100))

over-sampling test accuracy: 94.42%


In [27]:
## random forest model with CV on over-sampling data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#rfcv_over = RandomForestClassifier(n_estimators=100, random_state=123)

accs_over = cross_val_score(estimator=rf_over, X=X_train_over, y=y_train_over, cv=5)
print(accs_over)
print(accs_over.mean())

[0.93548387 0.93548387 0.91208791 0.94444444 0.92222222]
0.9299444641380126
