In [1]:
pip install torchtext==0.16.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install portalocker==2.8.2

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np

import torch
import torchtext

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

import spacy
nlp = spacy.load('en_core_web_sm')

2024-09-19 19:32:57.863281: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
train_dataset = torchtext.datasets.CoNLL2000Chunking(split = 'train')
test_dataset = torchtext.datasets.CoNLL2000Chunking(split = 'test')

In [5]:
train_text = []
train_tags = []

for data_idx,data in enumerate(train_dataset):
  train_text.append(data[0])
  train_tags.append(data[1])

In [6]:
test_text = []
test_tags = []

for data_idx,data in enumerate(test_dataset):
  test_text.append(data[0])
  test_tags.append(data[1])

In [7]:
class Text_Tokenization:
  def __init__(self,nlp,tokenizer):
    self.nlp = nlp
    self.tokenizer = tokenizer

  def _spacy_tokenizer(self,text):
    text = ' '.join(text)
    return [token.lemma_ for token in self.nlp(text)]

  def _preprocess_doc(self,text):
    text = self._spacy_tokenizer(text)
    return '|'.join(text)

  def tokenize(self,doc,maxlen,vocab = None):
    doc = [self._preprocess_doc(text) for text in doc]

    if vocab == None:
      self.tokenizer.fit_on_texts(doc)
      self.vocab = self.tokenizer

    tokenized_doc = self.tokenizer.texts_to_sequences(doc)
    tokenized_doc = tf.keras.utils.pad_sequences(tokenized_doc,maxlen,padding = 'post',truncating = 'post')

    return tokenized_doc

In [8]:
class Tagging_rnn(tf.keras.Model):
  def __init__(self,text_vocab_size,embedding_size,hidden_size,tags_vocab_size):
    super().__init__()
    self.text_vocab_size = text_vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.tags_vocab_size = tags_vocab_size
    
    self.embedding = tf.keras.layers.Embedding(text_vocab_size,embedding_size,mask_zero = True)
    self.rnn = tf.keras.layers.SimpleRNN(hidden_size,return_sequences = True)
    self.layer = tf.keras.layers.Dense(tags_vocab_size,activation = 'softmax')
    
  def call(self,X):
    X = self.embedding(X)
    X = self.rnn(X)
    X = self.layer(X)
    
    return X

In [9]:
class Tagging_lstm(tf.keras.Model):
  def __init__(self,text_vocab_size,embedding_size,hidden_size,tags_vocab_size):
    super().__init__()
    self.text_vocab_size = text_vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.tags_vocab_size = tags_vocab_size
    
    self.embedding = tf.keras.layers.Embedding(text_vocab_size,embedding_size,mask_zero = True)
    self.lstm = tf.keras.layers.LSTM(hidden_size,return_sequences = True)
    self.layer = tf.keras.layers.Dense(tags_vocab_size,activation = 'softmax')
    
  def call(self,X):
    X = self.embedding(X)
    X = self.lstm(X)
    X = self.layer(X)
    
    return X

In [10]:
class Train_Model:
  def __init__(self,model,lr = 0.001,batch_size = 32,epochs = 5):
    self.model = model
    self.lr = lr
    self.batch_size = batch_size
    self.epochs = epochs

  def fit(self,X,y):
    self.model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy']
    )

    print(f'Metrics on Train Data:')
    self.model.fit(X,y,batch_size = self.batch_size,epochs = self.epochs)


  def eval(self,X,y):
    print(f'\nMetrics on Test Data:')
    self.model.evaluate(X,y)

In [11]:
def Tag(sentence,model):
  pass

In [12]:
maxlen = 80

text_tokenizer = Tokenizer(oov_token = '<unk>',split = '|')
tag_tokenizer = Tokenizer(oov_token = '<unk>',split = '|')

text_tokenization = Text_Tokenization(nlp,text_tokenizer)
tags_tokenization = Text_Tokenization(nlp,tag_tokenizer)

tokenized_train_text = text_tokenization.tokenize(train_text,maxlen)
tokenized_train_tags = tags_tokenization.tokenize(train_tags,maxlen)

text_vocab = text_tokenization.vocab
tags_vocab = tags_tokenization.vocab


tokenized_test_text = text_tokenization.tokenize(test_text,maxlen,text_vocab)
tokenized_test_tags = tags_tokenization.tokenize(test_tags,maxlen,tags_vocab)

In [13]:
text_vocab_size = len(text_vocab.word_index) + 1
tags_vocab_size = len(tags_vocab.word_index) + 1

embedding_size = 164
hidden_size = 164

model_rnn = Tagging_rnn(text_vocab_size,embedding_size,hidden_size,tags_vocab_size)
model_lstm = Tagging_lstm(text_vocab_size,embedding_size,hidden_size,tags_vocab_size)

In [14]:
tokenized_train_tags = tf.keras.utils.to_categorical(tokenized_train_tags)
tokenized_test_tags = tf.keras.utils.to_categorical(tokenized_test_tags)

Trainer_rnn = Train_Model(model_rnn,epochs = 50)
Trainer_lstm = Train_Model(model_lstm,epochs = 50)

Trainer_rnn.fit(tokenized_train_text,tokenized_train_tags)
Trainer_rnn.eval(tokenized_test_text,tokenized_test_tags)

Trainer_lstm.fit(tokenized_train_text,tokenized_train_tags)
Trainer_lstm.eval(tokenized_test_text,tokenized_test_tags)

Metrics on Train Data:
Epoch 1/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 34ms/step - accuracy: 0.1102 - loss: 2.4412
Epoch 2/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.2253 - loss: 1.3526
Epoch 3/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.2533 - loss: 1.1638
Epoch 4/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.2731 - loss: 1.0540
Epoch 5/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.3059 - loss: 0.9750
Epoch 6/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 43ms/step - accuracy: 0.3219 - loss: 0.9313
Epoch 7/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 47ms/step - accuracy: 0.3476 - loss: 0.8287
Epoch 8/50
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 47ms/step - accuracy: 0.3584 - loss: 0.7907
E

KeyboardInterrupt: 