In [1]:
import nltk
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers, models
from nltk.corpus import treebank, brown, conll2000
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [2]:
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [3]:
tagged_sentences = treebank.tagged_sents(tagset='universal') + brown.tagged_sents(tagset='universal') + conll2000.tagged_sents(tagset='universal')

print(tagged_sentences[0])
print(f"Dataset size: {len(tagged_sentences)}")

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
Dataset size: 72202


In [4]:
sentences = []
tags = []

for sentence in tagged_sentences:
    sentence_words = []
    sentence_tags = []
    for word, tag in sentence:
        sentence_words.append(word)
        sentence_tags.append(tag)
    sentences.append(sentence_words)
    tags.append(sentence_tags)

print(sentences[0])
print(tags[0])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [5]:
print(len(sentences), len(tags))

72202 72202


In [6]:
train_ratio = 0.75
test_ratio = 0.15
val_ratio = 0.1

train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, tags, test_size=1 - train_ratio, random_state=1)
val_sentences, test_sentences, val_tags, test_tags = train_test_split(test_sentences, test_tags, test_size=test_ratio/(test_ratio + val_ratio), random_state=1)

In [7]:
print(len(train_sentences), len(train_tags))
print(len(val_sentences), len(val_tags))
print(len(test_sentences), len(test_tags))

54151 54151
7220 7220
10831 10831


In [8]:
sentence_tokenizer = keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
sentence_tokenizer.fit_on_texts(train_sentences)

tag_tokenizer = keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
tag_tokenizer.fit_on_texts(train_tags)

In [9]:
MAX_SEN_LEN = max([len(sentence) for sentence in train_sentences])

In [10]:
train_sentence_sequence = sentence_tokenizer.texts_to_sequences(train_sentences)
x_train = keras.preprocessing.sequence.pad_sequences(train_sentence_sequence, padding="post", maxlen=MAX_SEN_LEN)

train_tag_sequence = tag_tokenizer.texts_to_sequences(train_tags)
y_train = keras.preprocessing.sequence.pad_sequences(train_tag_sequence, padding="post", maxlen=MAX_SEN_LEN)

val_sentence_sequence = sentence_tokenizer.texts_to_sequences(val_sentences)
x_val = keras.preprocessing.sequence.pad_sequences(val_sentence_sequence, padding="post", maxlen=MAX_SEN_LEN)

val_tag_sequence = tag_tokenizer.texts_to_sequences(val_tags)
y_val = keras.preprocessing.sequence.pad_sequences(val_tag_sequence, padding="post", maxlen=MAX_SEN_LEN)

In [11]:
print(train_sentences[0], len(train_sentences[0]))
print(x_train[0], len(x_train[0]))
print(y_train[0], len(y_train[0]))

['This', 'may', 'be', 'due', 'to', 'the', 'heavy', 'interlobular', 'connective', 'tissue', 'barriers', 'present', '.'] 13
[   27    86    21   479     7     2   920 10903 20547  3327  5644   337
     4     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
  

In [12]:
y_train_one_hot = keras.utils.to_categorical(y_train, num_classes=len(tag_tokenizer.word_index) + 1)
y_val_one_hot = keras.utils.to_categorical(y_val, num_classes=len(tag_tokenizer.word_index) + 1)

In [14]:
num_tokens = len(sentence_tokenizer.word_index) + 1
num_classes = len(tag_tokenizer.word_index) + 1
embedding_dim = 128

print(num_tokens, num_classes)

52042 14


In [15]:
tf.random.set_seed(0)

input = layers.Input(shape=(MAX_SEN_LEN,))
model = layers.Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=MAX_SEN_LEN)(input)
model = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(model)
outputs = layers.TimeDistributed(layers.Dense(num_classes, activation="softmax"))(model)

model = models.Model(inputs=input, outputs=outputs)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])



In [16]:
model.summary()

In [19]:
es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.fit(x_train, y_train_one_hot, batch_size=256, epochs=10, validation_data=(x_val, y_val_one_hot), callbacks=[es_callback])

Epoch 1/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 131ms/step - accuracy: 0.9984 - loss: 0.0050 - val_accuracy: 0.9961 - val_loss: 0.0122
Epoch 2/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 132ms/step - accuracy: 0.9987 - loss: 0.0042 - val_accuracy: 0.9961 - val_loss: 0.0123
Epoch 3/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 134ms/step - accuracy: 0.9989 - loss: 0.0038 - val_accuracy: 0.9961 - val_loss: 0.0126
Epoch 4/10
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 131ms/step - accuracy: 0.9990 - loss: 0.0035 - val_accuracy: 0.9961 - val_loss: 0.0128


<keras.src.callbacks.history.History at 0x7a26121dd240>

In [20]:
test_sentence_sequence = sentence_tokenizer.texts_to_sequences(test_sentences)
x_test = keras.preprocessing.sequence.pad_sequences(test_sentence_sequence, padding="post", maxlen=MAX_SEN_LEN)

test_tag_sequence = tag_tokenizer.texts_to_sequences(test_tags)
y_test = keras.preprocessing.sequence.pad_sequences(test_tag_sequence, padding="post", maxlen=MAX_SEN_LEN)

y_test_one_hot = keras.utils.to_categorical(y_test)

In [21]:
model.evaluate(x_test, y_test_one_hot)

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.9961 - loss: 0.0132


[0.012813247740268707, 0.9961857795715332]

In [33]:
def inference(samples):
  sentence_sequence = sentence_tokenizer.texts_to_sequences(samples)
  padded_sentence_sequence = keras.preprocessing.sequence.pad_sequences(sentence_sequence, padding="post", maxlen=MAX_SEN_LEN)

  predicted_tags = model.predict(padded_sentence_sequence)

  sentence_tags = []
  for i, preds in enumerate(predicted_tags):

    tags_sequence = [np.argmax(p) for p in preds[:len(sentence_sequence[i])]]

    words = [sentence_tokenizer.index_word[w] for w in sentence_sequence[i]]
    tags = [tag_tokenizer.index_word[t] for t in tags_sequence]

    sentence_tags.append(list(zip(words, tags)))

  return sentence_tags

In [40]:
samples = [
    "My name is Darsh",
    "The quick brown fox jumps over the lazy dog"
    ]

sentence_tags = inference (samples)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


In [41]:
print(sentence_tags[1])

[('the', 'det'), ('quick', 'adj'), ('brown', 'noun'), ('fox', 'noun'), ('jumps', 'noun'), ('over', 'adp'), ('the', 'det'), ('lazy', 'adj'), ('dog', 'noun')]
