In [None]:
#################################### Libraries #######################################
#%% libraries
import os # setting working directory
import pickle
import random # seeding Python's global random number generator
from statistics import harmonic_mean

import numpy as np # for generating random embeddings
import pandas as pd # importing a csv
import spacy # basic text processing
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from keras import layers
from keras.layers import Embedding, LSTM, Conv1D, Bidirectional, Dense, Concatenate, GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

plt.style.use('ggplot')


In [None]:
################################## preliminaries #################################
# sets working directory (the data files loaded below are opened relative to it)
os.chdir('/kaggle/input/cancer-data/')

# sets random seeds
# Seeding TensorFlow alone leaves Python's and NumPy's global generators
# unseeded, so all three are seeded from one constant to keep any code that
# draws from them reproducible across runs.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
##################################### loads data ####################################
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as loading finishes (or fails), instead of being left open.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point this at files from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# raw csv tables
training1 = pd.read_csv('training1_v1.csv')
training2 = pd.read_csv('training2_v1.csv')
testing1 = pd.read_csv('testing1_v1.csv')

# model inputs (x_*) and targets (y_*)
# NOTE: the validation split is read from the 'training1' files and the
# training split from the 'training2' files.
x_valid = _load_pickle('x_training1')
x_train = _load_pickle('x_training2')
x_test = _load_pickle('x_testing1')
y_valid = _load_pickle('y_training1')
y_train = _load_pickle('y_training2')
y_test = _load_pickle('y_testing1')

# candidate embedding matrices (pretrained word2vec vs. random)
embeddings_pretrained = _load_pickle('embeddings_w2v')
embeddings_random = _load_pickle('embeddings_random')

# stored sequence length, later passed to the embedding layer as input_length
max_len = _load_pickle('max_len')

# embedding matrix dimensions
# assumes a 2-D array with one row per embedding vector -- TODO confirm
dim_embed = embeddings_pretrained.shape[1]
num_embed = embeddings_pretrained.shape[0]

In [None]:
########################## custom CNN with multiple filter window sizes ##############################

class Conv1D_multiple_filters(keras.layers.Layer):
    """Parallel 1-D convolutions with different window sizes, pooled and concatenated.

    One ``Conv1D`` branch is created per entry of ``filter_size_list``. Every
    branch receives the same input, its output is passed through a single
    shared pooling layer, and the pooled outputs are concatenated.
    """

    def __init__(self, filter_size_list, filter_num_list, activation, pooling_fun, **kwargs):
        """Create the convolution branches, the pooling layer and the concatenation.

        Args:
            filter_size_list: window (kernel) size of each convolution branch.
            filter_num_list: number of filters of each branch; entry ``i``
                belongs to the branch with window size ``filter_size_list[i]``.
            activation: activation passed to every ``Conv1D`` branch.
            pooling_fun: pooling layer class (not an instance), e.g.
                ``GlobalMaxPooling1D``; it is instantiated once with no
                arguments and shared by all branches.
            **kwargs: forwarded unchanged to the base ``Layer`` constructor.

        Raises:
            ValueError: if the two lists differ in length or are empty.
        """
        super().__init__(**kwargs)

        # A length mismatch used to surface as an IndexError (too few filter
        # counts) or to silently drop the extra entries (too many); fail early
        # with an explicit message instead.
        if len(filter_size_list) != len(filter_num_list):
            raise ValueError(
                "filter_size_list and filter_num_list must have the same length, "
                "got {} and {}".format(len(filter_size_list), len(filter_num_list))
            )
        if len(filter_size_list) == 0:
            raise ValueError("at least one filter window size is required")

        self.num_window_sizes = len(filter_size_list)
        self.convolutions_list = [
            Conv1D(num_filters, window_size, activation = activation)
            for window_size, num_filters in zip(filter_size_list, filter_num_list)
        ]
        self.pooling_fun = pooling_fun()
        self.concat = Concatenate()

    def call(self, x):
        """Apply every convolution branch to ``x``, pool each result and join them.

        Returns the concatenation of the pooled branch outputs, or the single
        pooled output when only one branch is configured.
        """
        pooled = [self.pooling_fun(convolution(x)) for convolution in self.convolutions_list]

        # With a single branch there is nothing to join, so Concatenate is skipped.
        if len(pooled) == 1:
            return pooled[0]
        return self.concat(pooled)


In [None]:
############################## tracking model fitting ####################################

def plot_loss(history):
    """Plot the validation loss per epoch on the current matplotlib figure.

    Args:
        history: object whose ``history`` attribute maps ``'val_loss'`` to a
            sequence with one value per epoch (presumably the return value of
            ``model.fit`` run with validation data -- confirm against caller).

    Epochs are numbered from 1 on the x axis. Nothing is returned.
    """
    val_loss = history.history['val_loss']
    epochs = range(1, len(val_loss) + 1)

    # The label is needed because plt.legend() only lists labelled artists;
    # without it the legend is empty and matplotlib emits a warning.
    plt.plot(epochs, val_loss, 'b', label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Validation Loss")
    plt.legend()



In [None]:
##################################### model definition #############################################
##### modeling choices
### word embeddings: pretrained or random (keep exactly one of the two lines active)
embedding_matrix = embeddings_pretrained
#embedding_matrix = embeddings_random

### CNN hyperparameters (the two lists are paired position by position)
filter_size_list = [3,5]        # filter window sizes
filter_num_list = [32,32]       # number of filters for each window size

### biLSTM hyperparameter (only used when the biLSTM encoder line below is active)
rnn_dim_hidden = 32

### training hyperparameters
batch_size = 32                 # mini-batch size
num_epochs = 10                 # number of epochs


##### defining the model
# the layers are stacked in order by the Sequential container
model = Sequential([
    # frozen word embedding lookup initialised from the chosen matrix
    Embedding(
        num_embed,
        dim_embed,
        weights = [embedding_matrix],
        input_length = max_len,
        trainable = False,
    ),

    ### encoder: CNN or biLSTM (keep exactly one of the two lines active)
    Conv1D_multiple_filters(filter_size_list, filter_num_list, activation = 'relu', pooling_fun = GlobalMaxPooling1D),
    #Bidirectional(LSTM(rnn_dim_hidden)),

    # single sigmoid unit producing the binary prediction
    Dense(1, activation = 'sigmoid'),
])

# loss function, optimizer, and evaluation metrics
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['Precision', 'Recall'],
)

# prints the model summary (just for the user)
model.summary()


In [None]:
################################ model fitting ###############################

#%% runs
# fits silently on the training split, passing the validation split as
# validation data; the returned history is plotted in the results cell
history = model.fit(
    x_train,
    y_train,
    validation_data = (x_valid, y_valid),
    batch_size = batch_size,
    epochs = num_epochs,
    verbose = False,
)


In [None]:
############################ results ##############################
# evaluate() yields the loss followed by the metrics chosen at compile time
loss, precision, recall = model.evaluate(
    x_test,
    y_test,
    verbose = False,
)

# F1 is the harmonic mean of precision and recall
f1 = harmonic_mean((precision, recall))

print("Testing precision:  {:.4f}".format(precision))
print("Testing recall:  {:.4f}".format(recall))
print("Testing F1:  {:.4f}".format(f1))

# validation loss curve recorded during fitting
plot_loss(history)