In [11]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pickle

#import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.compat.v1.keras import backend as K

from keras.models import Model, Input
from keras.layers import Concatenate, LSTM, TimeDistributed, Dense, BatchNormalization, Bidirectional, Lambda

In [12]:
def read_data(data_dir='D:/TU_Graz/Thesis/Datasets/Reddit_features'):
    """Load the train and test feature tables plus their row-wise union.

    Args:
        data_dir: Directory that contains ``train_final_all.csv`` and
            ``test_final_all.csv``. Defaults to the path the notebook was
            written against, so existing ``read_data()`` calls keep working.

    Returns:
        A ``(train, test, data)`` tuple of DataFrames, where ``data`` is
        ``train`` followed by ``test`` with each frame's own index kept.

    Note:
        The process working directory is changed to ``data_dir`` as a side
        effect, exactly as before.
    """
    os.chdir(data_dir)
    train = pd.read_csv("train_final_all.csv")
    test = pd.read_csv("test_final_all.csv")
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with default arguments builds the same frame.
    data = pd.concat([train, test])

    return train, test, data

In [13]:
def create_lists(data_df, word_category, class_category):
    """Build the word vocabulary and the tag inventory of a token table.

    Args:
        data_df: DataFrame with one row per token.
        word_category: Name of the column holding the token text.
        class_category: Name of the column holding the tag of each token.

    Returns:
        ``(words, n_words, tags, n_tags)`` where ``words`` is the list of
        distinct tokens followed by the sentinel ``"ENDPAD"``, ``tags`` is the
        list of distinct tags, and ``n_words`` / ``n_tags`` are their lengths
        (``n_words`` counts the sentinel).
    """
    # list(set(...)) yields an order that changes between interpreter runs
    # (string hashing is randomised), so any index mapping derived from it was
    # not reproducible. Sorting on the string form gives a stable order and
    # still copes with columns that mix strings and NaN.
    words = sorted(data_df[word_category].unique(), key=str)
    words.append("ENDPAD")
    tags = sorted(data_df[class_category].unique(), key=str)
    return words, len(words), tags, len(tags)

In [14]:
def group_sentences(data, sent_identificator, category):
    """Split a token table into one record array per sentence.

    Each record holds, in order: the ``Token`` column, the ``Token_index``
    column, the columns at positions 4..43 of ``data``, and the ``category``
    column.

    Args:
        data: DataFrame with one row per token.
        sent_identificator: Name of the column identifying the sentence a
            token belongs to.
        category: Name of the tag column appended as the last record field.

    Returns:
        A list of NumPy record arrays, one per distinct sentence id, in order
        of first appearance of the id; rows keep their original order.
        Rows whose sentence id is missing (NaN) are skipped.
    """
    # Pick the output columns once for the whole table instead of per sentence.
    selected = pd.concat(
        [data['Token'], data["Token_index"], data.iloc[:, 4:44], data[category]],
        axis=1,
    )
    # Group by a plain array so the grouping is positional and does not depend
    # on index alignment. A single groupby pass replaces re-filtering the full
    # frame once per sentence id; sort=False keeps first-appearance order.
    sentence_keys = data[sent_identificator].to_numpy()
    return [
        group.to_records(index=False)
        for _, group in selected.groupby(sentence_keys, sort=False)
    ]

In [15]:
def remove_sents_over_threshold(sents, threshold):
    """Return the sentences whose length is strictly below ``threshold``.

    Args:
        sents: Iterable of sized sentence objects.
        threshold: Exclusive upper bound on the sentence length.

    Returns:
        A new list with the retained sentences in their original order.
    """
    return [sentence for sentence in sents if len(sentence) < threshold]

In [16]:
def prepare_and_pad(sentences, max_len, tag2idx):
    """Turn grouped sentences into fixed-length model inputs and targets.

    Every sentence is cut to its first ``max_len`` words and then padded to
    exactly ``max_len`` positions, so the three outputs always line up
    position by position.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of word
            records whose field 0 is the token, fields 1..40 are numeric
            features and whose last field is the tag.
        max_len: Number of positions every sentence is padded/truncated to.
        tag2idx: Mapping from tag to integer index; must contain ``"O"``,
            which is used as the padding tag.

    Returns:
        ``(X1, X2, y)`` where ``X1`` is a list of token lists padded with
        ``"__PAD__"``, ``X2`` is a list of per-word 40-value feature lists
        padded with all-zero rows, and ``y`` is an ``int32`` array of shape
        ``(len(sentences), max_len)`` padded with ``tag2idx["O"]``.

    Raises:
        KeyError: If ``"O"`` or the tag of a retained word is not in
            ``tag2idx``.
    """
    n_features = 40
    X1, X2 = [], []
    y = np.full((len(sentences), max_len), tag2idx["O"], dtype="int32")

    for row, sentence in enumerate(sentences):
        # Truncate once and reuse for tokens, features and tags. Previously
        # tokens kept the head, features were not truncated at all and tags
        # kept the tail, so over-long sentences came out misaligned.
        kept = sentence[:max_len]
        n_pad = max_len - len(kept)

        X1.append([word[0] for word in kept] + ["__PAD__"] * n_pad)

        # NOTE(review): fields 1..40 are used as features. With records laid
        # out as (Token, Token_index, 40 feature columns, tag) this includes
        # Token_index and leaves out the last feature column - confirm that
        # this is intended.
        features = [[word[i] for i in range(1, n_features + 1)] for word in kept]
        features.extend([0] * n_features for _ in range(n_pad))
        X2.append(features)

        tag_ids = [tag2idx[word[len(word) - 1]] for word in kept]
        if tag_ids:
            y[row, :len(tag_ids)] = tag_ids

    return X1, X2, y

In [17]:
def ElmoEmbedding(x):
    """Run the ELMo hub module on a batch of token strings.

    Relies on the module-level names ``elmo_model``, ``batch_size`` and
    ``max_len``. The sequence length passed to the module is ``max_len`` for
    each of the ``batch_size`` rows.

    Args:
        x: Tensor of tokens; it is cast to ``tf.string`` and squeezed before
            being handed to the module.

    Returns:
        The ``"elmo"`` entry of the module's output dictionary.
    """
    tokens = tf.squeeze(tf.cast(x, tf.string))
    # One length entry per row of the batch, all equal to max_len.
    sequence_len = tf.constant(batch_size * [max_len])
    outputs = elmo_model(
        inputs={"tokens": tokens, "sequence_len": sequence_len},
        signature="tokens",
        as_dict=True,
    )
    return outputs["elmo"]

In [18]:
def build_model(max_len, n_words, n_tags):
    """Assemble the two-input BiLSTM tagger.

    A per-token feature input of shape ``(max_len, 40)`` is passed through a
    softmax ``Dense`` layer, concatenated with the ELMo embedding of a string
    token input of shape ``(max_len,)``, batch-normalised, run through a
    bidirectional LSTM and finally classified per time step.

    Args:
        max_len: Number of positions per sentence.
        n_words: Vocabulary size; accepted but not used by this function.
        n_tags: Number of output classes per token.

    Returns:
        An uncompiled ``Model`` whose inputs are ``[tokens, features]`` (in
        that order) and whose output has ``n_tags`` softmax scores per position.
    """
    # Layers are created in the same order as before so auto-generated layer
    # names (input_1, input_2, ...) stay the same.
    features_in = Input(shape=(max_len, 40, ))
    tokens_in = Input(shape=(max_len,), dtype=tf.string)

    features_proj = Dense(n_tags, activation='softmax')(features_in)
    elmo_vectors = Lambda(ElmoEmbedding, output_shape=(None, 1024))(tokens_in)

    merged = Concatenate()([features_proj, elmo_vectors])
    normalized = BatchNormalization()(merged)
    encoder = Bidirectional(
        LSTM(units=512, return_sequences=True, recurrent_dropout=0.2, dropout=0.2)
    )
    encoded = encoder(normalized)
    tag_scores = TimeDistributed(Dense(n_tags, activation='softmax'))(encoded)

    return Model([tokens_in, features_in], tag_scores)

In [19]:
def plot_learning_curves(hist, curve1, curve2):
    """Draw two series of ``hist`` on one 6x6 inch figure and show it.

    Args:
        hist: Mapping-like object (e.g. a DataFrame) indexed by curve name.
        curve1: Key of the first series to plot.
        curve2: Key of the second series to plot.
    """
    plt.figure(figsize=(6,6))
    for curve_name in (curve1, curve2):
        plt.plot(hist[curve_name])
    plt.show()

In [None]:
# Sentences with max_len or more tokens are dropped; shorter ones are padded to max_len.
max_len = 300
# ElmoEmbedding builds its sequence_len tensor from batch_size, so every batch
# fed to the model has to contain exactly this many sentences.
batch_size = 32
#batch_size = 1

plt.style.use("ggplot")


def _whole_batches(items):
    """Drop trailing items so that len(items) is a multiple of batch_size."""
    return items[:(len(items) // batch_size) * batch_size]


print("Loading the data...")
train, test, data = read_data()

################################## TRAIN ################################

print("Creating sets of words and tags...")
words, n_words, tags, n_tags = create_lists(train, "Token", "BIO")

print("Creating sentence list...")
sents = group_sentences(train, "Sent_id", "BIO")
# Reuse the helper instead of repeating its filter inline.
sentences = remove_sents_over_threshold(sents, max_len)

print("Creating word and tag maps...")
tag2idx = {t: i for i, t in enumerate(tags)}

print("Preparing and padding training data...")
X1, X2, y = prepare_and_pad(sentences, max_len, tag2idx)

print("Splitting data...")
# A single call splits all three aligned collections with the same shuffle.
X1_train, X1_valid, X2_train, X2_valid, y_train, y_valid = train_test_split(
    X1, X2, y, test_size=0.2, random_state=2021)
X1_train = _whole_batches(X1_train)
X2_train = _whole_batches(X2_train)
X1_valid = _whole_batches(X1_valid)
X2_valid = _whole_batches(X2_valid)

y_train = _whole_batches(y_train)
y_valid = _whole_batches(y_valid)
# sparse_categorical_crossentropy is fed targets with a trailing axis of size 1.
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
y_valid = y_valid.reshape(y_valid.shape[0], y_valid.shape[1], 1)

print("Setting parameters...")
tf.disable_eager_execution()
elmo_model = hub.Module("C:/Users/Kiki/Projects/ner_movies/Scripts/module_elmo3", trainable=True)
sess = tf.Session()
K.set_session(sess)
sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

print("Building the model...")
model = build_model(max_len, n_words, n_tags)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

history = model.fit([np.array(X1_train), np.array(X2_train).reshape((len(X2_train), max_len, 40))],
                    y_train,
                    validation_data=([np.array(X1_valid), np.array(X2_valid).reshape((len(X2_valid), max_len, 40))], y_valid),
                    batch_size=batch_size, epochs=2, verbose=1)
#saver.save(sess, 'utilities/my_test_model', global_step=1000)
hist = pd.DataFrame(history.history)

#print("Load a previously saved model...")
#model = load_model("../input/trainedmodel/finalmodel.h5")
#reconstructed = tf.keras.models.load_model('../input/tranedmodel/finalmodel.h5')

print("Plotting learning curves...")
# Depending on the Keras version the accuracy metric is recorded as "acc" or
# "accuracy"; use whichever column is present instead of hard-coding one.
acc_key = "acc" if "acc" in hist else "accuracy"
plot_learning_curves(hist, acc_key, "val_" + acc_key)
plot_learning_curves(hist, "loss", "val_loss")

Loading the data...
Creating sets of words and tags...
Creating sentence list...
Creating word and tag maps...
Preparing and padding training data...
Splitting data...
Setting parameters...
Building the model...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


{}
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300, 40)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 300, 17)      697         input_1[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (32, None, 1024)     0           input_2[0][0]                    
___________________________________________________________________________________________

In [None]:
# Build the held-out inputs the same way as the training inputs; X1_te, X2_te
# and y_te were not defined anywhere before this cell.
# NOTE: tag2idx is built from the training tags only, so a tag that occurs
# only in the test file raises KeyError inside prepare_and_pad.
test_sents = [s for s in group_sentences(test, "Sent_id", "BIO") if len(s) < max_len]
X1_te, X2_te, y_te = prepare_and_pad(test_sents, max_len, tag2idx)

# The ELMo layer is built for batches of exactly batch_size sentences, so
# drop the incomplete trailing batch and predict with the same batch size.
n_te = (len(X1_te) // batch_size) * batch_size
X1_te, X2_te, y_te = X1_te[:n_te], X2_te[:n_te], y_te[:n_te]

y_pred = model.predict([np.array(X1_te), np.array(X2_te).reshape((len(X2_te), max_len, 40))],
                       batch_size=batch_size)
p = np.argmax(y_pred, axis=-1)

# Flatten sentence-level tag ids into token-level lists for the report.
y_orig = [tag for sent in y_te for tag in sent]
y_preds = [tag for sent in p for tag in sent]

report = classification_report( y_orig, y_preds )
print(report) 

In [None]:
# Display the tag -> integer index mapping that was built from the training tags.
tag2idx