In [29]:
!pip install seqeval
# SeqEval is a Python library used for evaluating sequence labeling tasks,
# particularly in natural language processing (NLP).
# SeqEval computes common evaluation metrics like precision,
# recall, and F1-score for these tasks



In [30]:
import sys
sys.path.insert(0, '/kaggle/input/aspect-based-sent-analysis')

# sys is a module
# sys.path: This is a list in Python that contains directories where 
# Python looks for modules to import.

# This line adds the directory /kaggle/input/conlleval 
# to the beginning (index 0) of the sys.path list.

In [31]:
# These are standard library module 
import io 
# Provides tools for handling I/O (input and output), such as working with streams, 
# file-like objects, or buffer handling.
import os
# Navigating directories or retrieving environment variables
import json
# SON (JavaScript Object Notation) data for serialization (converting objects to JSON) 
# and deserialization (reading JSON into objects).
import sys
# discussed above

# These are third party library module (installed Separately)
import numpy as np
# powerful library for numerical computing, especially for arrays and matrices
# Matrix operations, mathematical computations, or scientific tasks.
from seqeval.metrics import f1_score
# metrics in seqeval.metrics is essentially a class and f1_score is a function

import torch
#  Part of PyTorch, a popular framework for deep learning, 
#  enabling the creation and training of neural networks.
import torch.nn as nn
# Provides tools to build neural network layers.
import torch.optim as optim
# Implements optimization algorithms like SGD and Adam.
from torch.utils.data import Dataset, DataLoader
# Dataset: An abstract class in PyTorch that represents a dataset
# A utility that wraps a dataset and enables easier 
# batch loading, shuffling, and multiprocessing.

import matplotlib.pyplot as plt
# A plotting library for creating static, interactive, or animated visualizations.
from tqdm import tqdm
# Displays progress bars in loops and tasks to track execution.
# The full form of tqdm is "taqaddum," which means "progress" in Arabic
from conlleval import evaluate
# utilizing the evaluate function from conlleval class
import sys, io
import json


In [32]:
def preprocess_data(input_path, output_path):
    

    def create_labels(merged_tokens, merged_offsets, aspect_terms):

        labels = ["O"] * len(merged_tokens)
        for aspect in aspect_terms:
            a_from = int(aspect["from"])

            a_to = int(aspect["to"])
            token_found = False
            for i, (token, (start, end)) in enumerate(zip(merged_tokens, merged_offsets)):
                effective_end = end - 1 if token.endswith((".", ",")) else end

                if start >= a_from and effective_end <= a_to:
                    labels[i] = "B" if not token_found else "I"
                    token_found = True
                # token_found is used to handle multi-term aspect 
                # a_from and effective_end indicates the broad level of aspect term
                # whle start and end indicate words inside multi-term aspect term

        return labels
    

    def merge_tokens_with_punctuation(tokens, token_offsets):
        merged_tokens = []
        merged_offsets = []

        for i, (token, (start, end)) in enumerate(zip(tokens, token_offsets)):
            # zip(tokens, token_offsets):Combines the tokens list and 
            # the token_offsets list into pairs (tuples).
            if token in [".", ","]:
                if merged_tokens:
                    prev_token = merged_tokens[-1]
                    prev_start, _ = merged_offsets[-1]

                    merged_tokens[-1] = prev_token + token

                    merged_offsets[-1] = (prev_start, end)
                else:
                    merged_tokens.append(token)

                    merged_offsets.append((start, end))
            else:
                merged_tokens.append(token)
                merged_offsets.append((start, end))

        return merged_tokens, merged_offsets
    

    
    
    def get_token_offsets(sentence, tokens):

        token_offsets = []
        start_index = 0

        for token in tokens:
            index = sentence.find(token, start_index)
            # The method returns the index of the first occurrence of the substring.
            # If the substring is not found, it returns -1
            if index == -1:

                index = start_index

            token_offsets.append((index, index + len(token)))
            start_index = index + len(token)

        return token_offsets

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)
# Input Type: File-like object containing JSON data.
# Output Type: Python object (dict, list, etc.), depending on the JSON structure.
# if file contains list of json objects then output will bw list of dictionaries.
    
    preprocessed = []

    for entry in data:

        sentence = entry["sentence"]

        tokens = sentence.split()
        
        token_offsets = get_token_offsets(sentence, tokens)

        merged_tokens, merged_offsets = merge_tokens_with_punctuation(tokens, token_offsets)

        labels = create_labels(merged_tokens, merged_offsets, entry.get("aspect_terms", []))
        
        preprocessed.append({
            "sentence": sentence,
            "tokens": merged_tokens,
            "labels": labels,
            "aspect_terms": [a["term"] for a in entry.get("aspect_terms", [])]
        })
        
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(preprocessed, f, indent=2)
    print(f"Preprocessed data saved to {output_path}")



In [33]:
def build_vocab(dataset, min_freq=1):

    
    def create_vocab_mapping(word_freq):

        vocab = {"<PAD>": 0, "<UNK>": 1}
        for word, freq in word_freq.items():
            if freq >= min_freq:

                vocab[word] = len(vocab)
        return vocab
    
    def count_word_frequencies():

        word_freq = {}

        for entry in dataset:
            for token in entry["tokens"]:

                word_freq[token] = word_freq.get(token, 0) + 1
        return word_freq
    

    word_freq = count_word_frequencies()
    return create_vocab_mapping(word_freq)

# Label mappings
label2id = {"O": 0, "B": 1, "I": 2}
id2label = {0: "O", 1: "B", 2: "I"}

In [34]:
class AspectDataset(Dataset):
    def __init__(self, data, vocab, label2id, max_len=100):
        self.data = data
        self.vocab = vocab
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        def get_raw_tokens_and_labels():
            entry = self.data[idx]


            tokens = entry["tokens"]
            labels = entry["labels"]
            token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]

            label_ids = [self.label2id[label] for label in labels]

            return tokens, token_ids, label_ids

        def pad_and_truncate(tokens, token_ids, label_ids):

            orig_len = len(token_ids)

            if orig_len < self.max_len:
                pad_length = self.max_len - orig_len
                token_ids.extend([self.vocab["<PAD>"]] * pad_length)

                label_ids.extend([-100] * pad_length)
            else:
                token_ids = token_ids[:self.max_len]

                label_ids = label_ids[:self.max_len]

                tokens = tokens[:self.max_len]
                orig_len = self.max_len

            return tokens, token_ids, label_ids, orig_len

        tokens, token_ids, label_ids = get_raw_tokens_and_labels()

        tokens, token_ids, label_ids, orig_len = pad_and_truncate(tokens, token_ids, label_ids)

        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label_ids, dtype=torch.long), orig_len, tokens

In [35]:
def custom_collate(batch):
    
    
    token_ids = torch.stack([b[0] for b in batch])

    label_ids = torch.stack([b[1] for b in batch])

    orig_lens = [b[2] for b in batch]
    tokens = [b[3] for b in batch]  

    return token_ids, label_ids, orig_lens, tokens

In [36]:
class SequenceLabelingModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, 
                 rnn_type="RNN", embedding_matrix=None, num_layers=1, dropout=0.5):
        
        super(SequenceLabelingModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float))
        
        # For single layer, PyTorch's dropout parameter is ignored, so we set it to 0 in that case.
        rnn_dropout = dropout if num_layers > 1 else 0
        
        if rnn_type.upper() == "RNN":
            self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, 
                              batch_first=True, bidirectional=True, dropout=rnn_dropout)
        elif rnn_type.upper() == "GRU":
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, 
                              batch_first=True, bidirectional=True, dropout=rnn_dropout)
        else:
            raise ValueError("Unsupported rnn_type: choose either 'RNN' or 'GRU'")
        
        
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        embeds = self.embedding(x)          

        outputs, _ = self.rnn(embeds)  

        outputs = self.dropout(outputs)         
        logits = self.fc(outputs)               
        return logits

In [37]:
def load_embeddings(embedding_path, vocab, embedding_dim):

    def initialize_embeddings():
        return np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim)).astype(np.float32)
    
    
    def update_embeddings(embeddings):

        with open(embedding_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    values = line.rstrip().split()
                    if len(values) < embedding_dim + 1:
                        continue
                    word = values[0]

                    vector = np.asarray(values[1:], dtype="float32")
                    if word in vocab:
                        embeddings[vocab[word]] = vector
                except (ValueError, IndexError):
                    continue
        return embeddings

    embeddings = initialize_embeddings()

    return update_embeddings(embeddings)

In [38]:
def compute_predictions(model, data_loader, criterion, device):

    model.eval()
    total_loss = 0.0
    all_preds = []

    all_labels = []
    
    with torch.no_grad():
        for token_ids, label_ids, seq_len, tokens_batch in data_loader:
            token_ids = token_ids.to(device)

            label_ids = label_ids.to(device)
            logits = model(token_ids) 
            loss = criterion(logits.view(-1, logits.shape[-1]), label_ids.view(-1))

            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)  
            predictions = predictions.cpu().numpy()
            label_ids = label_ids.cpu().numpy()
            batch_size = token_ids.size(0)
            for i in range(batch_size):

                length = seq_len[i]
                pred_tags = [id2label[p] if id2label[p] == "O" else id2label[p] + "-TERM" 
                             
                           for p in predictions[i][:length]]
                true_tags = [id2label[label_ids[i][j]] if id2label[label_ids[i][j]] == "O" 
                           else id2label[label_ids[i][j]] + "-TERM" for j in range(length)]
                all_preds.append(pred_tags)

                all_labels.append(true_tags)

    
    avg_loss = total_loss / len(data_loader)

    return avg_loss, all_preds, all_labels

In [39]:
def calculate_metrics(all_labels, all_preds):

    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    
    flat_true = [tag for sentence in all_labels for tag in sentence]

    flat_pred = [tag for sentence in all_preds for tag in sentence]
    
    precision, recall, f1 = evaluate(flat_true, flat_pred)
    sys.stdout = old_stdout
    
    from seqeval.metrics import f1_score

    tag_f1 = f1_score(all_labels, all_preds)
    
    return precision, recall, f1/100, tag_f1

def evaluate_model(model, data_loader, criterion, device):

    avg_loss, all_preds, all_labels = compute_predictions(model, data_loader, criterion, device)
    precision, recall, f1, tag_f1 = calculate_metrics(all_labels, all_preds)
    
    print(f"\n\n\nChunk-level metrics:")

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")

    print(f"F1: {f1:.2f}")

    print(f"\nTag-level F1: {tag_f1:.2f}\n\n\n")
    
    return avg_loss, f1

In [40]:
def train_model(model, train_loader, val_loader, num_epochs, lr, device, model_save_path):

    def setup_training():
        model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

        criterion = nn.CrossEntropyLoss(ignore_index=-100)

        return optimizer, criterion, [], [], 0.0

    def train_epoch(optimizer, criterion):

        model.train()

        epoch_loss = 0.0
        for token_ids, label_ids, _, _ in train_loader:

            token_ids = token_ids.to(device)

            label_ids = label_ids.to(device)

            optimizer.zero_grad()

            logits = model(token_ids)

            loss = criterion(logits.view(-1, logits.shape[-1]), label_ids.view(-1))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        return epoch_loss / len(train_loader)

    def evaluate_and_save(val_f1, best_val_f1, no_improve_count):

        if val_f1 > best_val_f1:

            best_val_f1 = val_f1

            torch.save(model.state_dict(), model_save_path)
            no_improve_count = 0
        else:
            no_improve_count += 1

        return best_val_f1, no_improve_count

    optimizer, criterion, train_losses, val_losses, best_val_f1 = setup_training()

    patience = 10
    no_improve_count = 0

    for epoch in range(num_epochs):

        avg_train_loss = train_epoch(optimizer, criterion)

        train_losses.append(avg_train_loss)
        
        val_loss, val_f1 = evaluate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")
        
        best_val_f1, no_improve_count = evaluate_and_save(val_f1, best_val_f1, no_improve_count)
        
        if no_improve_count >= patience:
            print("Early stopping triggered.")
            break

    return train_losses, val_losses, best_val_f1

In [41]:
def setup_inference_model(vocab, model_embedding, model_rnn, device):

    # Determine embedding file and dimension
    if model_embedding.lower() == "glove":
        embedding_dim = 300
        embedding_file = "/kaggle/input/aspect-based-sent-analysis/glove.6B.300d.txt"

    elif model_embedding.lower() == "fasttext":
        embedding_dim = 300
        embedding_file = "/kaggle/input/aspect-based-sent-analysis/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec"

    else:
        raise ValueError("Unsupported embedding type for inference")
        
    embedding_matrix = load_embeddings(embedding_file, vocab, embedding_dim)

    model = SequenceLabelingModel(
            vocab_size=len(vocab),
            embedding_dim=embedding_dim,
            hidden_dim=128,
            num_labels=len(label2id),
            rnn_type=model_rnn,
            embedding_matrix=embedding_matrix,
            num_layers=2,
            dropout=0.5
        )

    model_save_path = "/kaggle/input/aspect-based-sent-analysis/best_model_GRU_fasttext.pth"

    if not os.path.exists(model_save_path):
        raise FileNotFoundError(f"Saved model file {model_save_path} not found.")
    
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    model.to(device)

    return model

def run_inference(args, vocab, device):
    
    # Prepare test data
    if not os.path.exists("test.task1.json"):
        preprocess_data(args.test, "test.task1.json")

    with open("test.task1.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)
    test_dataset = AspectDataset(test_data, vocab, label2id, max_len=args.max_len)

    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, collate_fn=custom_collate)


    try:
        model = setup_inference_model(vocab, args.model_embedding, args.model_rnn, device)

        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        test_loss, test_f1 = evaluate_model(model, test_loader, criterion, device)

        print(f"Test Loss: {test_loss:.4f}, Test F1: {test_f1:.4f}")
    except (ValueError, FileNotFoundError) as e:
        print(f"Error during inference: {str(e)}")


    # with open("test_predictions.txt", "w") as f:
    #  for tokens, preds in zip(tokens_batch, all_preds):
    #     for token, tag in zip(tokens, preds):
    #         f.write(f"{token}\t{tag}\n")
    #     f.write("\n")




In [42]:
def main():
   
    args = lambda: None  
    # This method is a quick and simple way to create an empty object 
    # that can have arbitrary attributes assigned to it.
    
    args.train = "/kaggle/input/aspect-based-sent-analysis/train.json"
    args.val = "/kaggle/input/aspect-based-sent-analysis/val.json"
    args.test = "/kaggle/input/aspect-based-sent-analysis/test_task1.json"
    args.epochs = 100
    args.lr = 0.0005
    args.max_len = 100
    args.batch_size = 32
    args.inference = False  
    args.model_rnn = "GRU"
    args.model_embedding = "fasttext"
    

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

   
    if not os.path.exists("train.task1.json"):
      preprocess_data(args.train, "train.task1.json")
#   if preprocessing not done then perform preprocessing  

    with open("train.task1.json", "r", encoding="utf-8") as f:
        train_data = json.load(f)

    vocab = build_vocab(train_data)

    
    with open("/kaggle/working/vocab_task1.json","w") as f:
     json.dump(vocab, f)

    print("Vocabulary size:", len(vocab))

    # If inference flag is set, run inference and exit
    if args.inference:
        run_inference(args, vocab, device)
        return

    
    if not os.path.exists("val.task1.json"):
        preprocess_data(args.val, "val.task1.json")

    with open("val.task1.json", "r", encoding="utf-8") as f:
        val_data = json.load(f)


    model_configs = [
        {"rnn_type": "RNN", "embedding": "glove", "embedding_path": "/kaggle/input/aspect-based-sent-analysis/glove.6B.300d.txt", "embedding_dim": 300},
        {"rnn_type": "RNN", "embedding": "fasttext", "embedding_path": "/kaggle/input/aspect-based-sent-analysis/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec", "embedding_dim": 300},
        {"rnn_type": "GRU", "embedding": "glove", "embedding_path": "/kaggle/input/aspect-based-sent-analysis/glove.6B.300d.txt", "embedding_dim": 300},
        {"rnn_type": "GRU", "embedding": "fasttext", "embedding_path": "/kaggle/input/aspect-based-sent-analysis/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec", "embedding_dim": 300},
    ]

    
    train_dataset = AspectDataset(train_data, vocab, label2id, max_len=args.max_len)
    val_dataset = AspectDataset(val_data, vocab, label2id, max_len=args.max_len)
    

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=custom_collate)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, collate_fn=custom_collate)


    
   

    results = {}
    for config in model_configs:
        print("\n========================================")
        print(f"Training Model: {config['rnn_type']} with {config['embedding']}")
        print("========================================")

        embedding_matrix = load_embeddings(config["embedding_path"], vocab, config["embedding_dim"])

        model = SequenceLabelingModel(
                vocab_size=len(vocab),
                embedding_dim=config["embedding_dim"],
                hidden_dim=128,
                num_labels=len(label2id),
                rnn_type=config["rnn_type"],
                embedding_matrix=embedding_matrix,
                num_layers=2,
                dropout=0.5
            )

        model_save_path = f"best_model_{config['rnn_type']}_{config['embedding']}.pth"

        train_losses, val_losses, best_f1 = train_model(
            model, train_loader, val_loader,
            num_epochs=args.epochs, lr=args.lr, device=device,
            model_save_path=model_save_path
        )
        # Plot loss curves
        plt.figure()
        plt.plot(train_losses, label="Train Loss")
        plt.plot(val_losses, label="Val Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()

        plot_filename = f"{config['rnn_type']}_{config['embedding']}_loss.png"
        plt.savefig(plot_filename)
        
        plt.close()
        print(f"Best Validation F1 for {config['rnn_type']} with {config['embedding']}: {best_f1:.4f}")
        results[f"{config['rnn_type']}_{config['embedding']}"] = best_f1

    print("\nFinal model performance on validation set:")

    for key, val in results.items():
        print(f"{key}: F1 = {val:.4f}")

In [43]:

if __name__ == "__main__": 
    main()

# __name__: This is a built-in variable in Python. When a Python script is run, 
# this variable is automatically set to "__main__" for that script.

# If the script is imported into another script,
# the __name__ variable is set to the name of the module (not "__main__").

# so The construct is used for defining a block of code that should 
# only execute when the script is run directly, and not when it is imported

# That's why on import task1 Nothing prints automatically

Vocabulary size: 6360
Preprocessed data saved to val.task1.json

Training Model: RNN with glove



Chunk-level metrics:
Precision: 79.00
Recall: 47.53
F1: 0.59

Tag-level F1: 0.59



Epoch 1/100: Train Loss: 0.3232, Val Loss: 0.1885, Val F1: 0.5935



Chunk-level metrics:
Precision: 70.99
Recall: 57.14
F1: 0.63

Tag-level F1: 0.63



Epoch 2/100: Train Loss: 0.1890, Val Loss: 0.1483, Val F1: 0.6332



Chunk-level metrics:
Precision: 76.30
Recall: 56.59
F1: 0.65

Tag-level F1: 0.65



Epoch 3/100: Train Loss: 0.1369, Val Loss: 0.1489, Val F1: 0.6498



Chunk-level metrics:
Precision: 70.96
Recall: 65.11
F1: 0.68

Tag-level F1: 0.68



Epoch 4/100: Train Loss: 0.1013, Val Loss: 0.1399, Val F1: 0.6791



Chunk-level metrics:
Precision: 75.78
Recall: 60.16
F1: 0.67

Tag-level F1: 0.67



Epoch 5/100: Train Loss: 0.0698, Val Loss: 0.1562, Val F1: 0.6708



Chunk-level metrics:
Precision: 75.32
Recall: 65.38
F1: 0.70

Tag-level F1: 0.70



Epoch 6/100: Train Loss: 0.0488, Val Loss: 0.1651, V