In [1]:
# Imports

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
#from keras import backend as K
from tensorflow.compat.v1.keras import backend as K
from tensorflow.keras.utils import plot_model

from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [2]:
def read_data(train_path="../input/bio-tagged/train_final_all.csv",
              test_path="../input/bio-tagged/test_final_all.csv"):
    """Load the train and test CSV files and their row-wise concatenation.

    Parameters
    ----------
    train_path : str, optional
        Location of the training CSV. Defaults to the path the notebook has
        always used.
    test_path : str, optional
        Location of the test CSV. Defaults to the path the notebook has
        always used.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(train, test, data)`` where ``data`` is ``train`` followed by
        ``test``. The original row labels are kept (the index is not reset),
        matching what ``train.append(test)`` used to produce.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    data = pd.concat([train, test])

    return train, test, data

In [3]:
def create_lists(data, category):
    """Build the vocabulary and the tag inventory from a token-level frame.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``"Token"`` column and the column named by
        ``category``.
    category : str
        Name of the column holding the tags (the notebook passes ``"BIO"``).

    Returns
    -------
    tuple
        ``(words, n_words, tags, n_tags)``: the distinct tokens followed by
        the ``"ENDPAD"`` sentinel, their count, the distinct tags, and their
        count.

    Notes
    -----
    Both lists are sorted by their string form. Plain ``set`` iteration order
    for strings changes between interpreter runs, which would make any
    index mapping derived from ``tags`` differ from run to run.
    """
    # key=str keeps the sort well-defined even if a column mixes strings with
    # non-string values such as NaN.
    words = sorted(set(data["Token"].values), key=str)
    words.append("ENDPAD")
    n_words = len(words)

    # Honour the requested tag column instead of a hard-coded "BIO".
    tags = sorted(set(data[category].values), key=str)
    n_tags = len(tags)

    return words, n_words, tags, n_tags

In [4]:
def group_sentences(data, category):
    """Split a token-level frame into one record array per sentence.

    Parameters
    ----------
    data : DataFrame
        Must contain ``"Sent_id"``, ``"Token"``, ``"Token_index"`` and the
        column named by ``category``. Columns at positions 4..148 are copied
        through as features.
    category : str
        Name of the tag column; it becomes the LAST field of every record.

    Returns
    -------
    list of numpy.recarray
        One entry per distinct ``Sent_id``, in order of first appearance.
        Each record is ``(Token, Token_index, <feature columns>, <tag>)``.

    Notes
    -----
    A single ``groupby`` pass replaces the previous per-id boolean mask over
    the whole frame, which scanned every row once per sentence. Rows whose
    ``Sent_id`` is NaN are skipped by ``groupby`` (the mask-based version
    produced one empty sentence for them).
    """
    all_sents = []
    # sort=False keeps sentences in the order their ids first occur, which is
    # the order `data['Sent_id'].unique()` used to give.
    for _, sent_df in data.groupby('Sent_id', sort=False):
        sent_df = pd.concat(
            [sent_df['Token'], sent_df["Token_index"], sent_df.iloc[:, 4:149], sent_df[category]],
            axis=1,
        )
        all_sents.append(sent_df.to_records(index=False))
    return all_sents

In [5]:
def remove_sents_over_threshold(sents, threshold):
    """Return the sentences whose length is strictly below ``threshold``.

    Parameters
    ----------
    sents : iterable
        Sentences; each must support ``len()``.
    threshold : int
        Exclusive upper bound: a sentence of exactly ``threshold`` items is
        dropped as well.

    Returns
    -------
    list
        The retained sentences, in their original order.
    """
    return [sentence for sentence in sents if not len(sentence) >= threshold]

In [6]:
def prepare_and_pad(sentences, max_len, tag2idx):
    """Turn grouped sentences into fixed-length token and tag sequences.

    Parameters
    ----------
    sentences : iterable
        Sentences as produced by ``group_sentences``: each item is a sequence
        of records whose first field is the token and whose last field is the
        tag.
    max_len : int
        Length every output sequence is padded or truncated to.
    tag2idx : dict
        Maps each tag to its integer id. Must contain ``"O"``, which is used
        as the padding tag.

    Returns
    -------
    tuple
        ``(X, y)``. ``X`` is a list of token lists, each exactly ``max_len``
        long and right-padded with ``"__PAD__"``. ``y`` is the result of
        ``pad_sequences``: tag ids right-padded with ``tag2idx["O"]``.
    """
    X = []
    for s in sentences:
        tokens = [w[0] for w in s]
        # Keep the first max_len tokens and right-pad the rest. A negative
        # multiplier yields an empty list, so long sentences get no padding.
        X.append(tokens[:max_len] + ["__PAD__"] * (max_len - len(tokens)))

    # The tag is the last field of each record.
    y = [[tag2idx[w[len(w) - 1]] for w in s] for s in sentences]
    # truncating="post" is required for alignment: pad_sequences truncates
    # from the FRONT by default, whereas the tokens above are cut at the end.
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      truncating="post", value=tag2idx["O"])

    return X, y

In [7]:
def ElmoEmbedding(x):
    """Embed a batch of token strings with the module-level ELMo hub model.

    Reads three module globals at call time: ``elmo_model``, ``batch_size``
    and ``max_len``.

    Parameters
    ----------
    x : tensor
        Batch of tokens; it is cast to ``tf.string`` and squeezed before being
        passed to the hub module's ``"tokens"`` signature.

    Returns
    -------
    tensor
        The ``"elmo"`` entry of the hub module's output dictionary.

    Notes
    -----
    ``sequence_len`` is a constant with exactly ``batch_size`` entries, all
    equal to ``max_len``. NOTE(review): this presumably means every batch fed
    through this layer must contain exactly ``batch_size`` rows - confirm
    before calling ``predict`` on an arbitrary number of samples.
    """
    token_strings = tf.squeeze(tf.cast(x, tf.string))
    sequence_lengths = tf.constant(batch_size*[max_len])
    elmo_outputs = elmo_model(
        inputs={"tokens": token_strings, "sequence_len": sequence_lengths},
        signature="tokens",
        as_dict=True,
    )
    return elmo_outputs["elmo"]

In [8]:
def build_model(max_len):
    """Assemble the ELMo + two-layer residual BiLSTM tagging network.

    Parameters
    ----------
    max_len : int
        Number of tokens per input sequence.

    Returns
    -------
    Model
        An uncompiled Keras model mapping a ``(max_len,)`` string input to a
        per-token softmax over ``n_tags`` classes. ``n_tags`` is read from the
        module globals.
    """
    def _bilstm():
        # Both recurrent layers share the same hyper-parameters.
        return Bidirectional(LSTM(units=512, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))

    tokens_in = Input(shape=(max_len,), dtype=tf.string)
    embedded = Lambda(ElmoEmbedding, output_shape=(None, 1024))(tokens_in)

    first_pass = _bilstm()(embedded)
    second_pass = _bilstm()(first_pass)
    # Residual connection: sum the outputs of the two BiLSTM layers.
    residual_sum = add([first_pass, second_pass])

    tag_probs = TimeDistributed(Dense(n_tags, activation="softmax"))(residual_sum)

    return Model(tokens_in, tag_probs)

In [9]:
def plot_learning_curves(hist, curve1, curve2):
    """Plot two training-history series on one labelled figure.

    Parameters
    ----------
    hist : DataFrame or mapping
        Indexable by ``curve1`` and ``curve2``; the notebook passes the
        per-epoch Keras history as a DataFrame.
    curve1 : str
        Key of the first series (for example ``"accuracy"``).
    curve2 : str
        Key of the second series (for example ``"val_accuracy"``).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    # Label each line: without a legend the two curves cannot be told apart.
    ax.plot(hist[curve1], label=curve1)
    ax.plot(hist[curve2], label=curve2)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(curve1)
    ax.set_title("{} vs. {}".format(curve1, curve2))
    ax.legend()
    plt.show()

In [10]:
# ---------------------------------------------------------------------------
# Run configuration. ElmoEmbedding reads `max_len` and `batch_size` as module
# globals, so both must be defined before the model is built.
# ---------------------------------------------------------------------------
max_len = 300      # padded sequence length; also the sentence-length cut-off
batch_size = 32    # rows per batch; ElmoEmbedding sizes `sequence_len` from it

# Read the data
print("Loading the data...")
train, test, data = read_data()
print("Done.")

# Create word and tag lists
print("Creating sets of words and tags...")
words, n_words, tags, n_tags = create_lists(data, "BIO")
print("Done.")

# Create list of sentences (one record array per Sent_id)
print("Creating sentence list...")
sents = group_sentences(data, 'BIO')
print("Done.")

# Keep only sentences with fewer than max_len tokens
print("Removing submissions longer than threshold...")
sentences = remove_sents_over_threshold(sents, max_len)
print("Done.")

# Map every tag to an integer id (its position in `tags`)
print("Creating word and tag maps...")
tag2idx = dict(zip(tags, range(len(tags))))
print("Done.")

# Pad tokens with "__PAD__" and tags with the id of "O"
print("Preparing and padding training data...")
X, y = prepare_and_pad(sentences, max_len, tag2idx)
print("Done")

# Split to train and test. shuffle=False keeps the sentence order, so the
# test set is the trailing 22% of the sentences.
print("Splitting data...")
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.22, shuffle=False)
print("Done.")

# Graph-mode (TF1-style) setup: disable eager execution, register the session
# with Keras, load the ELMo hub module, then run the variable and table
# initializers. The order of these statements matters.
print("Setting parameters...")
plt.style.use("ggplot")
tf.compat.v1.disable_eager_execution()
sess = tf.compat.v1.Session()
K.set_session(sess)
elmo_model = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
print("Done.")

# Build the model
print("Building the model...")
model = build_model(max_len)

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()
print("Done.")

# Render the architecture diagram to a PNG file
print("Print the model...")
plot_model(model, to_file='Elmo_tag.png', show_shapes=True)
print("Done.")

# Split the train data to train and validation data
print("Split to train and validation data...")
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2, random_state=2021)

# Drop the tail of each split so its size is a whole number of batches.
n_fit = (len(X_tr) // batch_size) * batch_size
n_val = (len(X_val) // batch_size) * batch_size
X_tr, y_tr = X_tr[:n_fit], y_tr[:n_fit]
X_val, y_val = X_val[:n_val], y_val[:n_val]

# sparse_categorical_crossentropy is fed targets with a trailing axis of size 1.
y_tr = np.expand_dims(y_tr, axis=-1)
y_val = np.expand_dims(y_val, axis=-1)
print("Done.")

history = model.fit(
    np.array(X_tr),
    y_tr,
    validation_data=(np.array(X_val), y_val),
    batch_size=batch_size,
    epochs=10,
    verbose=1,
)
hist = pd.DataFrame(history.history)

# Plotting learning curves
print("Plotting learning curves...")
for train_curve, val_curve in (("accuracy", "val_accuracy"), ("loss", "val_loss")):
    plot_learning_curves(hist, train_curve, val_curve)
print("Done.")

In [11]:
# Evaluate on the held-out split.
#
# ElmoEmbedding builds `sequence_len` from a constant with exactly
# `batch_size` entries, so a trailing partial batch cannot be fed to the
# model. Evaluate on the largest prefix that is a whole number of batches.
n_eval = (len(X_te) // batch_size) * batch_size
if n_eval == 0:
    raise ValueError(
        "Need at least batch_size={} test sentences to evaluate, got {}".format(batch_size, len(X_te))
    )

p = model.predict(np.array(X_te[:n_eval]), batch_size=batch_size)
p = np.argmax(p, axis=-1)

# Flatten to one label per token position.
# NOTE(review): padded positions (tagged with tag2idx["O"] by prepare_and_pad)
# are included here, which likely inflates the scores for "O" - consider
# masking them out.
y_orig = np.asarray(y_te[:n_eval]).ravel().tolist()
y_preds = p.ravel().tolist()

# tag2idx maps each tag to its position in `tags`, so `tags` doubles as the
# id -> name lookup for the report rows.
report = classification_report(
    y_orig,
    y_preds,
    labels=list(range(len(tags))),
    target_names=[str(t) for t in tags],
)
print(report)

In [13]:
# Display the tag -> integer id mapping so the ids in `y` and in the model's
# argmax predictions can be read back as tag names.
tag2idx

In [14]:
# Show token-level predictions next to the true tags for test sentence `i`.
i = 1

# ElmoEmbedding is wired for exactly `batch_size` rows per call, so predict on
# a full window of test sentences that contains sentence `i`. For sentences
# near the end of the test set the window is shifted back rather than cut short.
if len(X_te) < batch_size:
    raise ValueError(
        "Need at least batch_size={} test sentences to predict, got {}".format(batch_size, len(X_te))
    )
window_start = min(i, len(X_te) - batch_size)
p = model.predict(
    np.array(X_te[window_start:window_start + batch_size]), batch_size=batch_size
)[i - window_start]
p = np.argmax(p, axis=-1)

print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    # Skip the right-padding added by prepare_and_pad.
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))

In [None]:
# Show token-level predictions next to the true tags for test sentence `i`.
i = 19

# ElmoEmbedding is wired for exactly `batch_size` rows per call, so predict on
# a full window of test sentences that contains sentence `i`. For sentences
# near the end of the test set the window is shifted back rather than cut short.
if len(X_te) < batch_size:
    raise ValueError(
        "Need at least batch_size={} test sentences to predict, got {}".format(batch_size, len(X_te))
    )
window_start = min(i, len(X_te) - batch_size)
p = model.predict(
    np.array(X_te[window_start:window_start + batch_size]), batch_size=batch_size
)[i - window_start]
p = np.argmax(p, axis=-1)

print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    # Skip the right-padding added by prepare_and_pad.
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))