In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Install the exact TensorFlow / Keras versions requested here, plus keras-contrib
# (the package the CRF layer is imported from further down) straight from its Git repository.
# NOTE(review): the keras-contrib URL names no tag or commit, so the installed revision can drift.
%pip install 'tensorflow==1.13.1' 'keras==2.2.4' git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
from random import randint
from future.utils import iteritems

import numpy as np
import pandas as pd

In [None]:
# Turkish
!curl -o /kaggle/working/cc.tr.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz
!gunzip /kaggle/working/cc.tr.300.bin.gz

In [None]:
# Italian
!curl -o /kaggle/working/cc.it.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz
!gunzip /kaggle/working/cc.it.300.bin.gz

In [None]:
import fasttext

# Load the extracted Turkish fastText binary into `ft`.
# NOTE(review): the corpus read by this notebook is "it_corpus_second_run.csv" —
# confirm that the Turkish (not the Italian) model is the one intended here.
turkish_model_path = '/kaggle/working/cc.tr.300.bin'
ft = fasttext.load_model(turkish_model_path)

In [None]:
# Columns whose cell text is evaluated back into Python objects while the CSV is parsed.
# NOTE(review): `eval` executes whatever expression the file contains. If this CSV can ever
# come from an untrusted source, use `ast.literal_eval` as the converter instead.
literal_columns = ('idiom_indices', 'idiom_words', 'lemmas', 'words')
data = pd.read_csv(
    "../input/dodiom-dataset/it_corpus_second_run.csv",
    converters={column: eval for column in literal_columns},
)

# Row filters, applied one after another:
#   1. at most 16 entries in `words`
#   2. likes + dislikes + reports is non-negative
#   3. rating is non-negative
short_enough = data.words.map(len) <= 16
data = data[short_enough]
feedback_total = data.likes + data.dislikes + data.reports
data = data[feedback_total >= 0]
data = data[data.rating >= 0.0]

# Renumber the surviving rows from 0 (the old labels are kept in an "index" column),
# then display how many rows are left.
data = data.reset_index()
len(data)

In [None]:
# Build the tagged corpus from `data`:
#   sentences - one list of (lemma, IOB tag) pairs per row
#   words     - the set of every lemma seen (the raw vocabulary)
sentences = []
words = set()

for lemmas, idiom_indices, category in zip(data["lemmas"], data["idiom_indices"], data["category"]):
    # The tag suffix depends only on the row, so it is decided once per sentence.
    suffix = "idiom" if category == "idiom" else "nonidiom"

    # Slice instead of indexing [0]: a row with no idiom indices then produces an
    # all-'O' sentence rather than raising IndexError.
    first_index = idiom_indices[:1]
    later_indices = idiom_indices[1:]

    sentence = []
    for position, lemma in enumerate(lemmas):
        if position in first_index:
            tag = "B-" + suffix      # first token of the expression
        elif position in later_indices:
            tag = "I-" + suffix      # any further token of the expression
        else:
            tag = "O"                # outside the expression
        sentence.append((lemma, tag))

    words.update(lemmas)
    sentences.append(sentence)

In [None]:
# Display one randomly chosen tagged sentence as a sanity check.
# random.randint includes BOTH endpoints, so the upper bound has to be len - 1;
# using len(sentences) would occasionally raise IndexError.
sentences[randint(0, len(sentences) - 1)]

In [None]:
# Freeze the vocabulary into an ordered list.
# - Sorting makes the word -> index mapping identical on every kernel restart; iterating a
#   set of strings gives a different order from one Python process to the next, which would
#   make saved model weights unusable with a rebuilt vocabulary.
# - Removing the padding token before re-appending it keeps this cell idempotent: running it
#   twice no longer adds "</s>" a second time.
PAD_TOKEN = "</s>"
words = sorted(set(words) - {PAD_TOKEN})
words.append(PAD_TOKEN)  # padding token is always last, i.e. index n_words - 1
n_words = len(words)

# Tag inventory; the position in this list is the class id.
tags = ['B-idiom', 'B-nonidiom', 'I-idiom', 'I-nonidiom', 'O']
n_tags = len(tags)

# Forward and reverse lookup tables for words and tags.
word2idx = {w: i for i, w in enumerate(words)}
idx2word = dict(enumerate(words))
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = dict(enumerate(tags))

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is the same on every run of the notebook.
SPLIT_SEED = 42

# Every sequence is padded at the end ("post") up to the length of the longest sentence.
maxlen = max(len(s) for s in sentences)

# Alternative input representation (disabled): fastText vectors instead of vocabulary indices.
#X = [[ft.get_word_vector(w[0]) for w in s] for s in sentences]
#X = pad_sequences(maxlen=maxlen, sequences=X, padding="post",value=ft.get_word_vector("</s>"))

# Inputs: each word replaced by its vocabulary index; padding positions get index n_words - 1.
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=maxlen, sequences=X, padding="post", value=n_words - 1)

# Targets: each tag replaced by its class id, padded with the id of "O", then one-hot encoded.
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

# Split train and test data (80% / 20%), reproducibly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Display the shape of the padded training inputs (number of sentences, maxlen).
X_train.shape

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF

# Width of the word embeddings; also used to size the recurrent layers below.
word_embedding_size = 150

# One integer word index per padded token position.
# Deliberately named `inputs`: binding the name `input` would overwrite Python's built-in
# input() for the rest of the kernel session.
inputs = Input(shape=X_train[0].shape)

# Embedding Layer: word index -> trainable dense vector
embedded = Embedding(input_dim=n_words,
                     output_dim=word_embedding_size,
                     input_length=maxlen)(inputs)

# BI-LSTM Layer, followed by a second (unidirectional) LSTM.
# Both return the full sequence because a tag is predicted for every token.
bilstm_out = Bidirectional(LSTM(units=word_embedding_size,
                                return_sequences=True,
                                dropout=0.5,
                                recurrent_dropout=0.5,
                                kernel_initializer=k.initializers.he_normal()))(embedded)
lstm_out = LSTM(units=word_embedding_size * 2,
                return_sequences=True,
                dropout=0.5,
                recurrent_dropout=0.5,
                kernel_initializer=k.initializers.he_normal())(bilstm_out)

# TimeDistributed Layer: per-token projection down to one value per tag
tag_scores = TimeDistributed(Dense(n_tags, activation="relu"))(lstm_out)

# CRF Layer; the layer object is kept in `crf` because its loss and accuracy
# functions are needed when the model is compiled.
crf = CRF(n_tags)
out = crf(tag_scores)  # output

model = Model(inputs, out)

In [None]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import tensorflow as tf

# Adam optimiser with an explicit learning rate and moment-decay coefficients.
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# The CRF layer supplies both the training loss and the accuracy metric.
model.compile(
    optimizer=adam,
    loss=crf.loss_function,
    metrics=[crf.accuracy],
)

# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
%%time

# Saving the best model only
filepath="ner-bi-lstm-td-model-{val_crf_viterbi_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_crf_viterbi_accuracy', verbose=0, save_best_only=True, mode='max')
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)
callbacks_list = [checkpoint, earlystopping]

# Fit the best model
history = model.fit(X_train, np.array(y_train), 
                    validation_data=(X_test, np.array(y_test)),
                    batch_size=256,
                    epochs=100,
                    verbose=1,
                    callbacks=callbacks_list)

In [None]:
# Use matplotlib's 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')

def plot_history(history):
    """Draw training vs. validation curves side by side.

    The left panel shows 'crf_viterbi_accuracy' / 'val_crf_viterbi_accuracy' and the
    right panel shows 'loss' / 'val_loss', all read from ``history.history`` and
    plotted against the epoch number (starting at 1).

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by ``model.fit``).
    """
    logs = history.history
    # Read all four series up front so a missing key fails before any figure is created.
    train_acc, val_acc = logs['crf_viterbi_accuracy'], logs['val_crf_viterbi_accuracy']
    train_loss, val_loss = logs['loss'], logs['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (training series, validation series, training label, validation label, panel title)
    panels = (
        (train_acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
        (train_loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for slot, (train_series, val_series, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, val_series, 'r', label=val_label)
        plt.title(title)
        plt.legend()

# Draw the accuracy and loss curves recorded in `history` by the training run.
plot_history(history)

In [None]:
# Run the trained model on the held-out inputs; `pred` is indexed by sentence below.
pred = model.predict(X_test)
# Number of held-out sentences that were scored.
print(len(X_test))

In [None]:
def onehot_to_IOB(arr):
    """Translate a 5-element one-hot vector into its IOB tag string.

    Args:
        arr: array-like compared element-wise against each known one-hot pattern.

    Returns:
        One of "B-idiom", "B-nonidiom", "I-idiom", "I-nonidiom" or "O".

    Raises:
        Exception: with message "Invalid one-hot" when ``arr`` matches none of
            the five patterns (wrong length, all zeros, several ones, ...).
    """
    # (one-hot pattern, tag) pairs, checked in order.
    patterns = (
        ((1, 0, 0, 0, 0), "B-idiom"),
        ((0, 1, 0, 0, 0), "B-nonidiom"),
        ((0, 0, 1, 0, 0), "I-idiom"),
        ((0, 0, 0, 1, 0), "I-nonidiom"),
        ((0, 0, 0, 0, 1), "O"),
    )
    for pattern, label in patterns:
        if np.array_equal(arr, pattern):
            return label
    raise Exception("Invalid one-hot")

In [None]:
from tabulate import tabulate
from random import randint

# Pick one held-out sentence at random.
# random.randint includes BOTH endpoints, so the upper bound has to be len - 1;
# using len(X_test) would occasionally raise IndexError.
rand_item = randint(0, len(X_test) - 1)
print(f"Item: {rand_item}")

# One table row per token position: the word, its target tag and the predicted tag.
result = [
    [idx2word[word_idx], onehot_to_IOB(target), onehot_to_IOB(predicted)]
    for word_idx, target, predicted in zip(X_test[rand_item], y_test[rand_item], pred[rand_item])
]
print(tabulate(result, headers=["word", "target", "prediction"]))