# Named Entity Recognition with Keras
In this notebook, I build a Named Entity Recognition (NER) model with Keras to evaluate student writing, using the dataset from the Kaggle competition [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021). In this project the modeling part is straightforward; the challenging part is converting the dataset into a Named Entity Recognition format that a Keras NER model can handle. I also perform some exploratory data analysis to find insights.
## Import Packages

In [None]:
import json
import pandas as pd
import numpy as np
import time
import tensorflow as tf
import seaborn as sns
from tensorflow import keras
from sklearn.model_selection import train_test_split
from collections import defaultdict
import matplotlib.pyplot as plt

## Common Parameters

In [None]:
# Shared hyperparameters and paths used by the cells below.
vocab_size = 10000 # Vocabulary size (passed to the Vectorizer and to the Embedding layer)
sequence_length = 1024 # Number of tokens every essay is padded or truncated to
batch_size = 128 # Batch size
unk_token = "<unk>" # Unknown token
vectorizer_path = "vectorizer.json" # File the fitted vectorizer is saved to / loaded from
# Use output dataset for inference
output_dataset_path = "../input/name-entity-recognition-with-keras-output/"
model_path = "model.h5" # File the model weights are saved to / loaded from
embed_size = 64 # Output dimension of the Embedding layer
hidden_size = 64 # Units of each LSTM layer
modes = ["training", "inference"] # There is a training mode and an inference mode
mode = modes[1] # "inference": later cells load the saved vectorizer and weights instead of fitting them
epochs = 10 # Number of epochs passed to model.fit in training mode
dropout = 0.2 # Dropout rate for the Model.

## Import Datasets

In [None]:
# Load the training CSV and preview its first rows.
train = pd.read_csv("../input/feedback-prize-2021/train.csv")
train.head()

In [None]:
# Load the sample submission CSV and preview its first rows.
submission = pd.read_csv("../input/feedback-prize-2021/sample_submission.csv")
submission.head()

## EDA & Preprocessing

### Add File Path to train and submission Files

In [None]:
# Attach the path of each training essay's text file, derived from its id.
train["file_path"] = [
    "../input/feedback-prize-2021/train/" + essay_id + ".txt"
    for essay_id in train["id"]
]
train.head()

In [None]:
# Attach the path of each test essay's text file, derived from its id.
submission["file_path"] = [
    "../input/feedback-prize-2021/test/" + essay_id + ".txt"
    for essay_id in submission["id"]
]
submission.head()

## Distribution of Labels

In [None]:
# Bar chart of how many rows carry each discourse_type value.
train["discourse_type"].value_counts().plot(kind="bar")

In [None]:
# Label vocabulary: positions 0 and 1 hold the special "<PAD>" and "<None>"
# labels, followed by the discourse types in sorted order.
discourse_types = np.array(["<PAD>", "<None>", *sorted(train["discourse_type"].unique())])
# Reverse lookup from label name to its integer id.
discourse_types_index = {name: position for position, name in enumerate(discourse_types)}
discourse_types, discourse_types_index

## Distribution of discourse_type_num

In [None]:
# Bar chart of how many rows carry each discourse_type_num value.
train["discourse_type_num"].value_counts().plot(kind="bar")

### Number of Unique files

In [None]:
# Number of distinct values in the id column (one id per essay file).
len(train["id"].unique())

### Tokenization

I build a tokenizer and check whether the tokens selected by predictionstring match the tokenized discourse_text, allowing the two token sequences to be shifted left or right against each other. A good tokenizer matches more cases with no shift, or with only a small one, so it may give better predictions on the test set.

In [None]:
def get_range(item):
    """Return the first and last index listed in ``item["predictionstring"]``.

    Args:
        item: Mapping (for example a DataFrame row) whose "predictionstring"
            entry is a whitespace-separated string of integer indices.

    Returns:
        A tuple ``(first, last)`` of ints.

    Raises:
        ValueError: If the string holds no indices or a non-integer token.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace; split(" ") would produce "" entries that int() rejects.
    locations = [int(location) for location in item["predictionstring"].split()]
    if not locations:
        raise ValueError("predictionstring contains no indices")
    return (locations[0], locations[-1])

In [None]:
# Global tally of every (lower-cased) character seen by tokenize().
character_counter = defaultdict(int)
# Non-alphanumeric characters that are still allowed inside a token.
allow_set = set("'&%-_/$+ÂÃÅËÓâåóþ@|~¢£¢£")


def tokenize(text):
    """Split *text* into lower-cased tokens.

    A character belongs to a token when it is alphanumeric or listed in
    ``allow_set``. In addition, any character sitting directly between two
    digits is kept (so "3.14" stays one token, and so does "1 2"), and a "."
    sitting directly between two letters is kept (so "e.g" stays one token).
    Every other character ends the current token.

    Side effect: each lower-cased character is counted in ``character_counter``.

    Returns:
        The list of tokens in order of appearance.
    """
    words = []
    pending = []
    final_pos = len(text) - 1
    for pos, raw in enumerate(text):
        ch = raw.lower()
        character_counter[ch] += 1
        keep = ch.isalnum() or ch in allow_set
        # Neighbour-based rules only apply to interior characters.
        if 1 <= pos < final_pos:
            before, after = text[pos - 1], text[pos + 1]
            if before.isdigit() and after.isdigit():
                keep = True
            elif ch == "." and before.isalpha() and after.isalpha():
                keep = True
        if keep:
            pending.append(ch)
        # Flush on a separator or at the end of the text.
        if pending and (not keep or pos == final_pos):
            words.append("".join(pending))
            pending = []
    return words

In [None]:
%%time
# Align every annotated discourse span with the tokenized essay and build one
# per-token label list per essay (annotation_list, parallel to token_list).
begin = time.time()
last_id = ""
contents = []
wrong_samples = []
token_list = []
annotation_list = []
num_samples = len(train)
unmaptch_count = 0 # Number of spans extracted via predictionstring that could not be matched to discourse_text
match_count = 0 # Number of spans extracted via predictionstring that match discourse_text (with or without shifting)
completely_match_count = 0 # Number of spans whose alignment shift (delta) is 0; NOTE(review): also counted when the full comparison below fails - confirm intended
mismatch_count = 0 # Number of spans whose leading tokens aligned but a later token differed
for i in range(len(train)):
    item = train.iloc[i]
    identifier = item["id"] 
    discourse_type_id = discourse_types_index[item["discourse_type"]]
    # A new essay file is read whenever the id differs from the previous row's id.
    # Assumes rows of the same essay are adjacent in train - TODO confirm.
    if identifier != last_id:
        last_id = identifier
        with open(item["file_path"]) as f:
            content = "".join(f.readlines())
            contents.append(content)
            tokens = tokenize(content)
            token_list.append(tokens)
            # Every token starts with label id 1, which is "<None>" in discourse_types.
            annotations = [1] * len(tokens)
            annotation_list.append(annotations)
    annotation_range = get_range(item)
    # Tokens of the essay selected by the (inclusive) predictionstring range.
    extracted = tokens[annotation_range[0]:annotation_range[1]+1]
    discourse = tokenize(item["discourse_text"])
    delta = None
    num_tokens_to_compare = min(len(discourse), 3)
    
    # Compare the text extracted via predictionstring with discourse_text, shifting discourse_text left or right if needed; only a few leading words are compared for performance
    # First try: skip up to 9 leading tokens of discourse_text (delta = j >= 0).
    for j in range(10):
        if len(extracted) < num_tokens_to_compare or len(discourse) <= j + num_tokens_to_compare:
            break
        if extracted[0:num_tokens_to_compare] == discourse[j:num_tokens_to_compare+j]:
            delta = j
            break
    # Second try: skip up to 9 leading tokens of the extracted span (delta = -j <= 0).
    if delta == None:
        for j in range(10):
            # NOTE(review): this guard uses `and` while the loop above uses `or` - confirm which is intended.
            if len(discourse) < num_tokens_to_compare and len(extracted) <= j + num_tokens_to_compare:
                break
            if discourse[0:num_tokens_to_compare] == extracted[j:num_tokens_to_compare+j]:
                delta = -j
                break
    if delta == None:
        unmaptch_count += 1
    else:
        not_match = False
        # Verify the whole span token by token at the shifted position.
        # NOTE(review): annotation_range[0] - delta can be negative when delta > annotation_range[0], which would index tokens from the end - confirm.
        for j in range(annotation_range[0] - delta, min(min(annotation_range[1] - delta + 1, len(tokens)), len(discourse) + annotation_range[0] - delta)): 
            if tokens[j] != discourse[j - annotation_range[0] + delta]:
                mismatch_count += 1
                not_match = True
                break
        if not not_match:
            # Full match: label every token of the span with the discourse type id.
            for j in range(annotation_range[0] - delta, min(min(annotation_range[1] - delta + 1, len(tokens)), len(discourse) + annotation_range[0] - delta)): 
                annotation_list[-1][j] = discourse_type_id
            match_count += 1
        else:
            unmaptch_count += 1
        if delta == 0:
            completely_match_count += 1 
print("Unmatch count:%d Match Count: %d Completedly Match count: %d"%(unmaptch_count, match_count, completely_match_count))
print("Mismatch count:", mismatch_count)
# Show the tokens and labels of the first essay as a sanity check.
print(token_list[0])
print(annotation_list[0])

### Filter samples without annotations

In [None]:
# Keep only essays that have at least one token labelled with a real discourse
# type. Label ids 0 ("<PAD>") and 1 ("<None>") carry no annotation, and every
# token starts out as 1, so a non-zero-sum test cannot detect unannotated
# essays; the labels have to be compared against 1 instead.
useful_tokens = []
useful_annotations = []
for doc_tokens, doc_annotations in zip(token_list, annotation_list):
    if any(label > 1 for label in doc_annotations):
        useful_tokens.append(doc_tokens)
        useful_annotations.append(doc_annotations)
token_list = useful_tokens
annotation_list = useful_annotations

### Distribution of Word Counts

In [None]:
# Count how often each token occurs across all essays.
word_counter = defaultdict(int)
for word in (token for tokens in token_list for token in tokens):
    word_counter[word] += 1

In [None]:
# Tabulate the word frequencies collected in word_counter (one row per word).
word_count = pd.DataFrame({"key": word_counter.keys(), "count": word_counter.values()})

# Plot the first 30 rows.
# NOTE(review): word_count is not sorted before slicing, so these are the first
# 30 words in dict order rather than the 30 most frequent - confirm intended.
sns.barplot(x="key", y="count", data=word_count[:30])

In [None]:
# Summary statistics of the per-word occurrence counts.
word_count.describe()

#### Number of words

In [None]:
# Number of distinct tokens (word_count has one row per key of word_counter).
len(word_count)

#### Words appearing only once

In [None]:
# Number of distinct tokens that occur exactly once.
(word_count["count"] == 1).sum()

### Distribution of Character Counts

In [None]:
# Tabulate the characters tallied by tokenize(), most frequent first.
character_count = pd.DataFrame(
    {"key": character_counter.keys(), "count": character_counter.values()}
).sort_values(by="count", ascending=False)
character_count.head(30)

In [None]:
# Bar chart of the first 30 rows of character_count.
sns.barplot(x="key", y="count", data=character_count[:30])

### Unique Characters

In [None]:
# Print every distinct value of the "key" column of character_count.
print(list(character_count['key'].unique()))

### Distribution of Sentence Length

In [None]:
# Histogram of essay lengths: how many essays have each token count.
length_counts = defaultdict(int)
for doc_tokens in token_list:
    length_counts[len(doc_tokens)] += 1
sentence_length = pd.DataFrame(
    {"sentence_length": length_counts.keys(), "count": length_counts.values()}
)
sentence_length.head()

In [None]:
# Summary statistics of the length histogram table.
sentence_length.describe()

#### Number of sentences that have at least 1000 tokens

In [None]:
# Sum the "count" column over all rows whose sentence_length is >= 1000.
sentence_length[(sentence_length["sentence_length"] >= 1000)]["count"].sum()

### Vectorization

In [None]:
class Vectorizer:
    """Map token lists to lists of integer ids, padded to a fixed length.

    Id 0 is reserved for padding and id 1 for the unknown token. Vocabulary
    words receive ids starting at 2, in order of first appearance during
    ``fit_transform``.

    Args:
        vocab_size: Total number of ids, *including* the two reserved ones, so
            every produced id is strictly smaller than ``vocab_size`` (the
            range an embedding table of that size can look up). ``None`` keeps
            every word.
        sequence_length: Length every sentence is padded (with 0) or truncated
            to. ``None`` leaves sentences at their natural length.
        unk_token: Key under which the unknown-word id is stored in
            ``word_index``.
    """

    PAD_INDEX = 0  # id used to fill sentences up to sequence_length
    UNK_INDEX = 1  # id given to out-of-vocabulary tokens
    NUM_RESERVED = 2  # number of ids not available to vocabulary words

    def __init__(self, vocab_size = None, sequence_length = None, unk_token = "<unk>"):
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.unk_token = unk_token

    def _window(self, tokens):
        """Return the leading tokens that fit into ``sequence_length``."""
        if self.sequence_length is None:
            return tokens
        return tokens[:self.sequence_length]

    def _pad(self, ids):
        """Right-pad *ids* with PAD_INDEX up to ``sequence_length``."""
        if self.sequence_length is None:
            return ids
        return ids + [self.PAD_INDEX] * (self.sequence_length - len(ids))

    def fit_transform(self, sentences):
        """Build the vocabulary from *sentences* and return their encoding.

        Args:
            sentences: Sequence of token lists.

        Returns:
            One list of ids per sentence (see ``transform``).
        """
        counts = {}
        for tokens in sentences:
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
        # sorted() is stable, so equally frequent words keep first-seen order.
        ranked = sorted(counts, key=counts.get, reverse=True)
        if self.vocab_size is not None:
            # Two ids are reserved (pad, unk); keeping more words than
            # vocab_size - 2 would produce ids >= vocab_size.
            ranked = ranked[:max(self.vocab_size - self.NUM_RESERVED, 0)]
        vocab = set(ranked)

        word_index = {self.unk_token: self.UNK_INDEX}
        next_index = self.UNK_INDEX + 1
        Xs = []
        for tokens in sentences:
            ids = []
            for token in self._window(tokens):
                # Ids are handed out lazily, in order of first appearance.
                if token in vocab and token not in word_index:
                    word_index[token] = next_index
                    next_index += 1
                ids.append(word_index.get(token, self.UNK_INDEX))
            Xs.append(self._pad(ids))
        self.word_index = word_index
        self.vocab = vocab
        return Xs

    def transform(self, sentences):
        """Encode *sentences* with the already fitted or loaded vocabulary.

        Args:
            sentences: Sequence of token lists.

        Returns:
            One list of ids per sentence; unknown tokens map to the id stored
            under ``unk_token``.
        """
        unk_index = self.word_index[self.unk_token]
        Xs = []
        for tokens in sentences:
            ids = [self.word_index.get(token, unk_index) for token in self._window(tokens)]
            Xs.append(self._pad(ids))
        return Xs

    def load(self, path):
        """Restore the configuration and vocabulary written by ``save``."""
        with open(path, 'r') as f:
            dic = json.load(f)
        self.vocab_size = dic['vocab_size']
        self.sequence_length = dic['sequence_length']
        self.unk_token = dic['unk_token']
        self.word_index = dic['word_index']
        # Rebuild the word set so a loaded instance exposes the same attributes
        # as a fitted one.
        self.vocab = set(self.word_index) - {self.unk_token}

    def save(self, path):
        """Write the configuration and vocabulary to *path* as JSON."""
        with open(path, 'w') as f:
            json.dump({
                "vocab_size": self.vocab_size,
                "sequence_length": self.sequence_length,
                "unk_token": self.unk_token,
                "word_index": self.word_index,
            }, f)

In [None]:
%%time
# Encode the essays as id sequences: load the saved vocabulary unless we are
# in training mode, in which case it is fitted and written to disk.
vectorizer = Vectorizer(vocab_size=vocab_size, sequence_length=sequence_length, unk_token=unk_token)
if mode != modes[0]:
    vectorizer.load(output_dataset_path + vectorizer_path)
    Xs = vectorizer.transform(token_list)
else:
    Xs = vectorizer.fit_transform(token_list)
    vectorizer.save(vectorizer_path)

# Bring every label list to sequence_length and tally how often each label id
# occurs (padding positions count towards id 0).
ys = []
annotation_count = [0] * len(discourse_types_index)
for annotation in annotation_list:
    # Pad with 0 when too short; the slice cuts off anything beyond the limit.
    fixed = (annotation + [0] * (sequence_length - len(annotation)))[:sequence_length]
    ys.append(fixed)
    for item in fixed:
        annotation_count[item] += 1

X_train, X_val, y_train, y_val = train_test_split(
    np.array(Xs),
    np.array(ys),
    test_size=0.2,
    random_state=42,
)

## Distribution of annotation

In [None]:
# One row per label: its name, its integer id and how many positions of the
# padded label sequences carry it (padding positions are counted under id 0).
annotation_count_df = pd.DataFrame({
    "key":discourse_types,
    "value": list(range(len(discourse_types))),
    "count": annotation_count
})
plt.figure(figsize=(15, 10))
sns.barplot(x="key", y="count", data=annotation_count_df)

## Create Tensorflow Dataset

In [None]:
def make_dataset(X, y, batch_size, mode="train"):
    """Build a batched, prefetched tf.data pipeline over the pairs (X, y).

    Args:
        X: Inputs, sliced along the first axis.
        y: Targets, sliced along the first axis in lockstep with X.
        batch_size: Number of examples per batch.
        mode: "train" enables shuffling; any other value keeps the input order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` tuples.
    """
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    # Cache the individual examples *before* shuffling and batching. A cache
    # placed after shuffle() replays the first epoch's order and batch
    # composition on every later epoch, which defeats the shuffle.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(512)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Training pipeline (default mode="train", which shuffles) and validation
# pipeline (mode="valid", which keeps the input order).
train_ds = make_dataset(X_train, y_train, batch_size)
val_ds = make_dataset(X_val, y_val, batch_size, mode="valid")

## Modeling

### Named Entity Recognition Model

In [None]:
# Token-level classifier: embed the ids, encode the whole essay, then decode
# one label distribution per position.
model = keras.Sequential([
    # NOTE(review): the table has vocab_size rows, so the largest id produced
    # by the vectorizer must be < vocab_size - confirm.
    keras.layers.Embedding(vocab_size, embed_size, input_length=sequence_length),
    keras.layers.SpatialDropout1D(dropout),
    # return_sequences is not set here, so this layer outputs a single vector
    # per essay rather than one vector per token.
    keras.layers.Bidirectional(keras.layers.LSTM(hidden_size, dropout=dropout, recurrent_dropout=dropout)),
    # Repeat that vector sequence_length times to feed the decoder LSTM.
    keras.layers.RepeatVector(sequence_length),
    keras.layers.Bidirectional(keras.layers.LSTM(hidden_size, return_sequences=True)),
    # Softmax over all labels (including "<PAD>" and "<None>") at every position.
    keras.layers.TimeDistributed(keras.layers.Dense(len(discourse_types), activation="softmax"))
])
model.summary()

In [None]:
# Render the layer graph, annotated with tensor shapes and dtypes.
keras.utils.plot_model(model, show_shapes=True, show_dtype=True)

## Training

In [None]:
if mode != modes[0]:
    # Inference mode: reuse the weights stored in the output dataset.
    model.load_weights(output_dataset_path + model_path)
else:
    # Training mode: fit the model and keep the best weights on disk.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.3, patience=2, min_lr=1e-7)
    early_stop = keras.callbacks.EarlyStopping(min_delta=1e-4, patience=10)
    checkpoint = keras.callbacks.ModelCheckpoint(
        model_path,
        save_best_only=True,
        save_weights_only=True,
    )
    callbacks = [early_stop, checkpoint, reduce_lr]

    # Targets are integer label ids, hence the sparse cross-entropy.
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss=loss, optimizer=optimizer)

    model.fit(
        train_ds,
        epochs=epochs,
        validation_data=val_ds,
        callbacks=callbacks,
    )

### Model Evaluation

In [None]:
from sklearn.metrics import f1_score, classification_report
def evaluate(model, dataset):
    """Print a per-label classification report and the micro F1 of *model*.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns per-position
            label scores with the labels on the last axis.
        dataset: Iterable of ``(x, y)`` batches where ``y`` holds the true
            label ids and id 0 marks padding.

    Returns:
        The micro-averaged F1 score over all non-padding positions.

    Raises:
        ValueError: If *dataset* yields no batches.
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []
    for x, y in dataset:
        # verbose=0 avoids one progress bar per batch.
        output = model.predict(x, verbose=0)
        predictions = np.reshape(np.argmax(output, axis=-1), [-1])
        true_tag_ids = np.reshape(y, [-1])

        # Padding is identified by the true label alone. Also masking on the
        # prediction would silently drop every real token the model labelled
        # as padding, hiding those errors and inflating the scores.
        mask = true_tag_ids > 0
        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    if not all_true_tag_ids:
        raise ValueError("dataset yielded no batches to evaluate")

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)
    cls_report = classification_report(all_true_tag_ids, all_predicted_tag_ids)
    print(cls_report)
    f1 =  f1_score(all_true_tag_ids, all_predicted_tag_ids, average="micro")
    print("F1 Score:", f1)
    return f1
val_f1 = evaluate(model, val_ds)

## Submission

In [None]:
%%time
# Read and tokenize every test essay, in submission order.
contents = []
token_list = []
for essay_path in submission["file_path"]:
    with open(essay_path) as f:
        content = f.read()
    contents.append(content)
    token_list.append(tokenize(content))

In [None]:
# Encode the test essays, run the model batch by batch and keep, for every
# position, the id of the highest-scoring label.
X_test = vectorizer.transform(token_list)
test_ds = tf.data.Dataset.from_tensor_slices((X_test)).batch(batch_size)
y_pred = model.predict(test_ds)
y_pred = np.argmax(y_pred, axis=-1)
print(y_pred.shape)

In [None]:
# Turn the per-token predictions into submission rows: every run of
# consecutive tokens sharing the same discourse label (id > 1) becomes one row.
predictionstrings = []
classes = []
ids = []


def _emit_span(doc_id, label, span):
    """Append one submission row for *span* unless the span is empty."""
    if span:
        ids.append(doc_id)
        predictionstrings.append(" ".join(span))
        classes.append(discourse_types[label])


for doc_idx in range(y_pred.shape[0]):
    doc_id = submission.iloc[doc_idx]["id"]
    # Ignore predictions that fall on padding beyond the essay's real tokens.
    n_tokens = min(y_pred.shape[1], len(token_list[doc_idx]))
    current_label = 0
    span = []
    for pos in range(n_tokens):
        label = y_pred[doc_idx, pos]
        if label != current_label:
            # The label changed: close the run collected so far.
            _emit_span(doc_id, current_label, span)
            span = []
            current_label = label
        # Ids 0 and 1 are "<PAD>" and "<None>" and never start or extend a span.
        if label > 1:
            span.append(str(pos))
    # Close the run that reaches the end of the essay.
    _emit_span(doc_id, current_label, span)

In [None]:
# Assemble the rows into a table, write it to submission.csv (without the
# index column) and preview it.
sub_df = pd.DataFrame({"id": ids, "class": classes, "predictionstring": predictionstrings})
sub_df.to_csv("submission.csv", index=False)
sub_df.head()