In [2]:
# Imports

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
#from keras import backend as K
from tensorflow.compat.v1.keras import backend as K
from tensorflow.keras.utils import plot_model

from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Concatenate
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, BatchNormalization, Bidirectional, Lambda

In [3]:
def read_data():
    """Load the BIO-tagged train and test CSV files and stack them.

    Returns:
        tuple: ``(train, test, data)`` where ``train`` and ``test`` are the
        DataFrames read from the two CSV files and ``data`` is ``train``
        followed by ``test``. Each part keeps its own row index, so index
        labels in ``data`` may repeat.
    """
    train = pd.read_csv("../input/bio-tagged/train_final_all.csv")
    test = pd.read_csv("../input/bio-tagged/test_final_all.csv")

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat produces the same stacked frame on every pandas version.
    data = pd.concat([train, test])

    return train, test, data

In [4]:
def create_lists(data, category):
    """Collect the vocabulary and the tag inventory of a token-level table.

    Args:
        data: DataFrame with a ``"Token"`` column and a ``category`` column.
        category: Name of the column that holds the tags (e.g. ``"BIO"``).

    Returns:
        tuple: ``(words, n_words, tags, n_tags)``. ``words`` holds the distinct
        tokens in first-seen order followed by the extra entry ``"ENDPAD"``;
        ``tags`` holds the distinct tags in first-seen order; ``n_words`` and
        ``n_tags`` are the lengths of those two lists.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # gives an ordering that can differ from one interpreter run to the next,
    # which would make any index mapping built from `tags` irreproducible.
    words = list(dict.fromkeys(data["Token"].values))
    words.append("ENDPAD")

    # Honour the `category` argument instead of a hard-coded column name.
    tags = list(dict.fromkeys(data[category].values))

    return words, len(words), tags, len(tags)

In [5]:
def group_sentences(data, category):
    """Split a token-level table into one record array per sentence.

    Args:
        data: DataFrame with ``"Sent_id"``, ``"Token"`` and ``"Token_index"``
            columns, a ``category`` column, and feature columns at positions
            4..43 (selected positionally with ``iloc[:, 4:44]``).
        category: Name of the tag column appended as the last record field.

    Returns:
        list: One ``numpy`` record array per distinct ``Sent_id``, in order of
        first appearance. Each record is laid out as
        ``(Token, Token_index, <columns 4..43>, <category>)``.

    Note:
        Rows whose ``Sent_id`` is missing (NaN) are skipped, because
        ``groupby`` drops NaN keys by default.
    """
    all_sents = []
    # A single groupby pass replaces one full-frame boolean scan per sentence
    # id; sort=False keeps the groups in first-appearance order, which is the
    # order Series.unique() would give.
    for _, sent_df in data.groupby('Sent_id', sort=False):
        sent_df = pd.concat(
            [sent_df['Token'], sent_df["Token_index"], sent_df.iloc[:, 4:44], sent_df[category]],
            axis=1,
        )
        all_sents.append(sent_df.to_records(index=False))
    return all_sents

In [6]:
def remove_sents_over_threshold(sents, threshold):
    """Return the sentences whose length is strictly below ``threshold``.

    Args:
        sents: Iterable of sized sentences.
        threshold: Exclusive upper bound on sentence length; a sentence of
            exactly ``threshold`` items is dropped as well.

    Returns:
        list: A new list with the surviving sentences in their original order.
    """
    return [sent for sent in sents if len(sent) < threshold]

In [7]:
def prepare_and_pad(sentences, max_len, tag2idx, n_features=40):
    """Turn per-sentence records into fixed-length token, feature and tag arrays.

    Every sentence is cut to its first ``max_len`` words and then right-padded
    to exactly ``max_len`` positions, identically for all three outputs.

    Args:
        sentences: Sequence of sentences; each sentence is a sliceable sequence
            of word records where field 0 is the token, fields
            ``1..n_features`` are the per-token features and the last field is
            the tag.
        max_len: Number of positions per sentence in the outputs.
        tag2idx: Mapping from tag to integer id; must contain ``"O"``, which is
            used as the tag of padded positions.
        n_features: How many feature fields (starting at field 1) to copy per
            word. Defaults to 40.

    Returns:
        tuple: ``(X1, X2, y)`` where ``X1`` is a list of token lists padded
        with ``"__PAD__"``, ``X2`` is a list of per-word feature lists padded
        with all-zero rows, and ``y`` is an ``int32`` array of shape
        ``(len(sentences), max_len)`` padded with ``tag2idx["O"]``.
    """
    # NOTE(review): with the records built by group_sentences, fields 1..40 are
    # Token_index plus only 39 of the 40 columns taken by iloc[:, 4:44] --
    # confirm that this selection is the intended one.
    pad_tag = tag2idx["O"]

    X1, X2, y_rows = [], [], []
    for sentence in sentences:
        # Truncate once so tokens, features and tags always stay aligned (the
        # default of keras' pad_sequences would keep the *tail* of an over-long
        # tag sequence while the tokens keep the head).
        kept = sentence[:max_len]
        n_pad = max_len - len(kept)

        X1.append([word[0] for word in kept] + ["__PAD__"] * n_pad)
        X2.append(
            [[word[i] for i in range(1, n_features + 1)] for word in kept]
            + [[0] * n_features for _ in range(n_pad)]
        )
        y_rows.append([tag2idx[word[len(word) - 1]] for word in kept] + [pad_tag] * n_pad)

    # The explicit reshape keeps y two-dimensional even when there are no sentences.
    y = np.array(y_rows, dtype="int32").reshape(len(y_rows), max_len)

    return X1, X2, y

In [8]:
def ElmoEmbedding(x):
    """Run the module-level ELMo model on a batch of token strings.

    Reads the globals ``elmo_model``, ``batch_size`` and ``max_len``: the
    ``sequence_len`` input is a constant holding ``batch_size`` copies of
    ``max_len``.

    Args:
        x: Tensor of tokens; it is cast to ``tf.string`` and squeezed before
            being passed as the ``"tokens"`` input.

    Returns:
        The ``"elmo"`` entry of the module's output dictionary.
    """
    elmo_inputs = {
        "tokens": tf.squeeze(tf.cast(x, tf.string)),
        "sequence_len": tf.constant(batch_size * [max_len]),
    }
    elmo_outputs = elmo_model(inputs=elmo_inputs, signature="tokens", as_dict=True)
    return elmo_outputs["elmo"]

In [9]:
def build_model(max_len, n_words, n_tags, n_features=40):
    """Build the two-input tagger: ELMo embeddings plus per-token features.

    The per-token feature vectors go through a softmax ``Dense`` layer, are
    concatenated with the ELMo output, batch-normalised, passed through a
    bidirectional LSTM and finally through a softmax ``Dense`` layer over
    ``n_tags`` for every position.

    Args:
        max_len: Number of positions per sentence.
        n_words: Unused; kept so existing calls keep working.
        n_tags: Number of output classes per position.
        n_features: Width of the per-token feature vector. Defaults to 40.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are, in this order,
        ``[token strings (max_len,), features (max_len, n_features)]``.
    """
    # Input layers
    word_input_layer = Input(shape=(max_len, n_features))
    elmo_input_layer = Input(shape=(max_len,), dtype=tf.string)

    # One branch per input
    word_output_layer = Dense(n_tags, activation='softmax')(word_input_layer)
    elmo_output_layer = Lambda(ElmoEmbedding, output_shape=(None, 1024))(elmo_input_layer)

    # Merge the branches and tag every position
    output_layer = Concatenate()([word_output_layer, elmo_output_layer])
    output_layer = BatchNormalization()(output_layer)
    output_layer = Bidirectional(
        LSTM(units=512, return_sequences=True, recurrent_dropout=0.2, dropout=0.2)
    )(output_layer)
    output_layer = Dense(n_tags, activation='softmax')(output_layer)

    # The ELMo (token) input comes first; callers must pass inputs in this order.
    model = Model([elmo_input_layer, word_input_layer], output_layer)
    return model

In [10]:
def plot_learning_curves(hist, curve1, curve2):
    """Draw two series from ``hist`` on one 6x6-inch figure and show it.

    Args:
        hist: Mapping-like object (e.g. a DataFrame) indexed by curve name.
        curve1: Key of the first series to plot.
        curve2: Key of the second series to plot.
    """
    plt.figure(figsize=(6, 6))
    for curve_name in (curve1, curve2):
        plt.plot(hist[curve_name])
    plt.show()

In [11]:
# Load the train/test tables and their concatenation
print("Loading the data...")
train, test, data = read_data()
print("Done.")

# Vocabulary and tag inventory
print("Creating sets of words and tags...")
words, n_words, tags, n_tags = create_lists(data, "BIO")
print("Done.")

# One record array per sentence
print("Creating sentence list...")
sents = group_sentences(data, "BIO")
print("Done.")

# Keep only submissions shorter than the length limit
print("Removing submissions longer than threshold...")
sentences = remove_sents_over_threshold(sents, 300)
print("Done.")

# Fixed sequence length and tag -> integer id mapping
print("Creating word and tag maps...")
max_len = 300
tag2idx = dict(zip(tags, range(len(tags))))
print("Done.")

# Tokens (X1), per-token features (X2) and tag ids (y), all padded to max_len
print("Preparing and padding training data...")
X1, X2, y = prepare_and_pad(sentences, max_len, tag2idx)
print("Done")

# Unshuffled split: the last 22% of the sentences become the test set.
# Splitting the three aligned collections in a single call keeps them in sync.
print("Splitting data...")
X1_tr, X1_te, X2_tr, X2_te, y1_tr, y1_te = train_test_split(
    X1, X2, y, test_size=0.22, shuffle=False
)
y2_tr, y2_te = y1_tr, y1_te
y_te = y1_te
print("Done.")

# Setting parameters
print("Setting parameters...")
# batch_size is also read as a global inside ElmoEmbedding, which builds a
# `sequence_len` constant with exactly batch_size entries.
batch_size = 32
plt.style.use("ggplot")
# Switch to graph mode and hand Keras an explicit session.
tf.compat.v1.disable_eager_execution()
sess = tf.compat.v1.Session()
K.set_session(sess)
# Load the ELMo module from TF Hub; it is used as a global by ElmoEmbedding.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
# Run the variable and table initialisers for everything created so far
# (the Keras model itself is only built further down).
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
print("Done.")

# Build the model
print("Building the model...")
model = build_model(max_len, n_words, n_tags)

# Compile the model
# Sparse categorical cross-entropy: the targets are integer tag ids, not one-hot vectors.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()
print("Done.")

# Carve a validation set (20%) out of the training portion.
# One joint call splits tokens, features and tags with the same seeded shuffle.
print("Split to train and validation data...")
X1_tr, X1_val, X2_tr, X2_val, y1_tr, y1_val = train_test_split(
    X1_tr, X2_tr, y1_tr, test_size=0.2, random_state=2021
)
y2_tr, y2_val = y1_tr, y1_val

# Trim both sets to a whole number of batches.
n_tr_keep = (len(X1_tr) // batch_size) * batch_size
n_val_keep = (len(X1_val) // batch_size) * batch_size
X1_tr, X2_tr = X1_tr[:n_tr_keep], X2_tr[:n_tr_keep]
X1_val, X2_val = X1_val[:n_val_keep], X2_val[:n_val_keep]

# Targets get a trailing axis of size 1: (samples, positions, 1).
y_tr = y1_tr[:n_tr_keep]
y_tr = y_tr.reshape(y_tr.shape + (1,))
y_val = y1_val[:n_val_keep]
y_val = y_val.reshape(y_val.shape + (1,))
print("Done.")

# Train for 20 epochs; the input order [tokens, features] matches build_model.
print("Fitting the model....")
train_inputs = [np.array(X1_tr), np.array(X2_tr).reshape((len(X2_tr), max_len, 40))]
val_inputs = [np.array(X1_val), np.array(X2_val).reshape((len(X2_val), max_len, 40))]
history = model.fit(
    train_inputs,
    y_tr,
    validation_data=(val_inputs, y_val),
    batch_size=batch_size,
    epochs=20,
    verbose=1,
)
hist = pd.DataFrame(history.history)

# One figure per metric, each with the training and the validation curve.
print("Plotting learning curves...")
for train_curve, val_curve in (("accuracy", "val_accuracy"), ("loss", "val_loss")):
    plot_learning_curves(hist, train_curve, val_curve)
print("Done.")

In [13]:
# Predict on the held-out sentences and print a per-class report.
#
# ElmoEmbedding feeds the ELMo module a constant `sequence_len` with exactly
# batch_size entries, so every batch must be full. Pad the test inputs with
# dummy all-padding sentences up to a whole number of batches and drop the
# predictions for those filler rows afterwards, so that every real test
# sentence is still evaluated.
n_te = len(X1_te)
n_fill = (-n_te) % batch_size

X1_te_arr = np.array(list(X1_te) + [["__PAD__"] * max_len] * n_fill)
X2_te_arr = np.array(X2_te).reshape((n_te, max_len, 40))
X2_te_arr = np.concatenate(
    [X2_te_arr, np.zeros((n_fill, max_len, 40), dtype=X2_te_arr.dtype)]
)

y_pred = model.predict([X1_te_arr, X2_te_arr], batch_size=batch_size)[:n_te]
p = np.argmax(y_pred, axis=-1)

# Flatten (sentence, position) into one long label sequence each.
y_orig = [tag for sent in y_te for tag in sent]
y_preds = [tag for sent in p for tag in sent]

report = classification_report(y_orig, y_preds)
print(report)

In [14]:
# Display the tag -> integer id mapping built from `tags`.
tag2idx