In [72]:
import glob
import os
from sklearn.preprocessing import LabelEncoder

In [73]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


### 1) Pre - Processing

In [74]:
# parameters for pre - processing
max_length = 15 # maximum sentence length
word_dim = 300 # word dimension


#### converting each sentence as a list of words with maximum of "max_length" words in a sentence. Any sentence greater than "max_length" words is splitted. 

In [75]:
total_tags = []
sentences = []
input_folder = glob.glob(r'/kaggle/input/text-pos/*')

for pos_file in input_folder:
    try:
        with open(pos_file, 'r') as file:
            # Read the contents of the file
            pos_data = file.read()
            a = pos_data.split()
            temp_words = []
            temp_tags = []
            for i in a:
                temp_words.append(i.rsplit("/",1)[0].lower())
                temp_tags.append(i.rsplit("/",1)[1])
                if i[-1] == "." or len(temp_words) == max_length:  # maximum length of sentence is 15  any sentence more than 15 length is shortened to 15.
                    sentences.append(temp_words)
                    total_tags.append(temp_tags)
                    temp_words, temp_tags = [],[]
    except FileNotFoundError:
        print(f"File '{pos_file}' not found.")

In [76]:
total_words = [i for sublist in sentences for i in sublist]
flatten_tags = [i for sublist in total_tags for i in sublist]
unique_words = set(total_words)
unique_tags = set(flatten_tags)

print("total number of sentences are", len(sentences))
print("total number of words are",len(total_words))
print("total number of tags are",len(flatten_tags))
print("total unique words are",len(unique_words))
print("total unique tags are",len(unique_tags))


total number of sentences are 8081
total number of words are 94156
total number of tags are 94156
total unique words are 10955
total unique tags are 46


In [77]:
# from transformers import BertModel, BertTokenizer

# # Load pre-trained BERT tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # Convert words into input IDs using the tokenizer
# input_ids = [tokenizer.convert_tokens_to_ids(words) for words in sentences]

# # Pad sequences to the same length
# max_length = max(len(ids) for ids in input_ids)
# padded_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]

# # Convert lists of input IDs to PyTorch tensors
# input_tensors = torch.tensor(padded_input_ids)

# # Forward pass through the BERT model to obtain contextualized embeddings
# with torch.no_grad():
#     outputs = model(input_tensors)

# # Get the contextualized word embeddings (output of the last layer)
# last_hidden_states = outputs.last_hidden_state

# # Print the shape of the contextualized word embeddings
# print("Shape of contextualized word embeddings:", last_hidden_states.shape)


#### I'm creating a dictionary where each word in my vocabulary is a key, and the corresponding value for each word is its word vector obtained from GloVe embeddings. This will help me map words to their vector representations for future use.

In [78]:
# 1. Download and load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Adjust the file path to point to your downloaded GloVe embeddings file 
glove_file = r'/kaggle/input/pos-tag-text-analytics/glove.6B.300d.txt'
# word vectors of dimesnion 300
glove_embeddings = load_glove_embeddings(glove_file)

# 3. Get word vectors for a list of words
word_vectors = {}
no_word_vectors = []
for word in unique_words:
    if word in glove_embeddings:
        # the word present in glove embeddings
        word_vectors[word] = glove_embeddings[word]
    else:
        # the word is not in glove embeddings
        no_word_vectors.append(word)

####  If a word doesn't have a GloVe vector, I'm assigning it a zero vector."


In [79]:
base_array = np.zeros(word_dim,dtype = np.float64)
for word in no_word_vectors:
    word_vectors[word] = base_array

#### Transforming a sentence, represented as a list of words, into its vector representation

In [80]:
sentence_vectors = []

# Word Vectors  --- sentences (words) ---> sentences  (word vectors)
for sentence in sentences:
    temp_sentence = []
    for word in sentence:
        temp_sentence.append(word_vectors[word])
    sentence_vectors.append(temp_sentence)



#### Converting a sentence into a list of part-of-speech tags, where each tag is represented by a numerical label.

In [81]:
# Label Encoding for tags  --- sentences (words) ---> sentences  (labels or numbers)
unique_tags.add("no_word")
le = LabelEncoder()
le.fit(list(unique_tags))
total_tags_y = [le.transform(i) for i in total_tags] 
print("Sentences represented as pos tagging (after labelling)\nExample for 3 Sentences")
total_tags_y[:3]

Sentences represented as pos tagging (after labelling)
Example for 3 Sentences


[array([36, 13, 10, 22, 13, 20, 20, 24, 45, 10, 20, 20, 20,  2,  3]),
 array([20,  5,  9, 22,  5,  1,  9,  4,  5, 19, 22, 13, 10, 22, 13]),
 array([10, 20, 39, 10, 19, 13, 19,  6])]

#### sentence into a list of part-of-speech tags, where each tag is represented by a one hot vector

In [82]:
# Convert word-level labels to one-hot encoded format
num_classes = len(unique_tags)  # Number of classes in your target variable
word_labels_one_hot = [to_categorical(labels, num_classes=num_classes) for labels in total_tags_y]

In [83]:
no_word_vector = to_categorical(le.transform(["no_word"]))
no_word_vector

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

#### pad_sequence --> any given sequence of length less than "max_length" is padded by zero padding at front (word vector dimension).

#### pad_sequence_labels --> labels also padded


In [84]:
def pad_sequence(sentences,max_length,word_dim):
    sentence_vectors_padded = []
    word_labels_one_hot_padded = []
    
    for sentence in sentences:
        to_be_added = max_length - len(sentence)
        temp_array = np.vstack((np.zeros((to_be_added,word_dim)),np.array(sentence)))
        sentence_vectors_padded.append(temp_array)

    sentence_vectors_padded = np.array(sentence_vectors_padded)
    return sentence_vectors_padded

def pad_sequence_labels(word_labels_one_hot,max_length,label_dim):
    word_labels_one_hot_padded = []
    
    for sentence in word_labels_one_hot:
        to_be_added = max_length - len(sentence)
        repeated_a = np.repeat(no_word_vector, to_be_added, axis=0).reshape(-1, no_word_vector.shape[1])
        temp_array = np.vstack((repeated_a,np.array(sentence)))
        word_labels_one_hot_padded.append(temp_array)

    word_labels_one_hot_padded = np.array(word_labels_one_hot_padded)
    return word_labels_one_hot_padded


In [85]:
sentence_vectors_padded = pad_sequence(sentence_vectors, max_length,word_dim)
word_labels_one_hot_padded = pad_sequence_labels(word_labels_one_hot,max_length,47)

In [86]:
sentence_vectors_padded.shape, word_labels_one_hot_padded.shape

((8081, 15, 300), (8081, 15, 47))

#### in the above output (number of sentnces, maximum length of sentence, word dimension)
#### second output is (number of sentences, maximum length of sentence, tag dimension)

### 2) Train Test Validation Split

In [87]:
X = np.array(sentence_vectors_padded)
y = np.array(word_labels_one_hot_padded)

# Split the dataset into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

### 3) Modelling (LSTM)

In [88]:
hidden_units = 100
num_features = word_dim

model = Sequential([
    Masking(mask_value=0.0, input_shape=(max_length, num_features)),
    LSTM(units=hidden_units, return_sequences=True),
    Dropout(0.2),  # Add dropout for regularization
    Dense(units=num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping])


  super().__init__(**kwargs)


Epoch 1/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.2770 - loss: 2.6102 - val_accuracy: 0.5821 - val_loss: 0.9906
Epoch 2/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.5890 - loss: 0.9112 - val_accuracy: 0.6366 - val_loss: 0.6674
Epoch 3/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.6415 - loss: 0.6394 - val_accuracy: 0.6573 - val_loss: 0.5387
Epoch 4/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.6566 - loss: 0.5237 - val_accuracy: 0.6698 - val_loss: 0.4690
Epoch 5/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.6777 - loss: 0.4454 - val_accuracy: 0.6780 - val_loss: 0.4212
Epoch 6/50
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.6824 - loss: 0.3904 - val_accuracy: 0.6842 - val_loss: 0.3889
Epoch 7/50
[1m152/152

In [89]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7111 - loss: 0.2521
Test Loss: 0.2591118812561035
Test Accuracy: 0.7094621658325195


In [90]:
y_test_pred = model.predict(X_test)

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


In [91]:
# to check the accuracy (word level) of model manually
def cal_accuracy(y_test, y_test_pred):
    # Get the index of the class with the highest probability for each word
    actual_tags_indices = np.argmax(y_test, axis=-1)  # Shape: (1, 15)
    # Get the index of the class with the highest probability for each word
    predicted_tags_indices = np.argmax(y_test_pred, axis=-1)  # Shape: (1, 15)

    actual_tags_indices_list = list(actual_tags_indices.flatten())
    predicted_tags_indices_list = list(predicted_tags_indices.flatten())

    i = 0
    count = 0
    while i < len(actual_tags_indices_list):
        if actual_tags_indices_list[i] == predicted_tags_indices_list[i]:
            count+=1
        i+=1
    accuracy = count/i *100
    return accuracy

In [92]:
test_accuracy = cal_accuracy(y_test, y_test_pred)

### 4) Test Results

In [93]:
print("test accuracy of model is : ", test_accuracy)

test accuracy of model is :  70.94619666048237
