In [15]:
import json
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm
import os
from gensim.models import KeyedVectors
import re
import gensim.downloader as api

# Importing conlleval for evaluation
from conlleval import evaluate

In [6]:
def preprocess_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    processed_data = []
    
    for item in data:
        sentence = item['sentence']
        aspect_terms = item['aspect_terms']
        
        # Tokenize the sentence
        tokens = sentence.split()
        
        # Initialize all labels as 'O'
        labels = ['O'] * len(tokens)
        
        # Extract aspect terms
        terms = []
        
        for aspect in aspect_terms:
            term = aspect['term']
            terms.append(term)
            
            # Get the start and end positions
            start = int(aspect['from'])
            end = int(aspect['to'])
            
            # Find the tokens that correspond to this aspect term
            term_tokens = []
            term_indices = []
            
            char_index = 0
            for i, token in enumerate(tokens):
                token_start = char_index
                token_end = token_start + len(token)
                
                # Check if this token overlaps with the aspect term
                if token_end > start and token_start < end:
                    term_tokens.append(token)
                    term_indices.append(i)
                
                char_index = token_end + 1  # +1 for the space
            
            # Apply BIO tagging
            if term_indices:
                labels[term_indices[0]] = 'B'  # Beginning of aspect term
                for idx in term_indices[1:]:
                    labels[idx] = 'I'  # Inside of aspect term
        
        processed_item = {
            'sentence': sentence,
            'tokens': tokens,
            'labels': labels,
            'aspect_terms': terms
        }
        
        processed_data.append(processed_item)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=2)
    
    return processed_data

In [7]:
# Dataset class
class AspectTermDataset(Dataset):
    def __init__(self, data, word_to_idx, label_to_idx):
        self.data = data
        self.word_to_idx = word_to_idx
        self.label_to_idx = label_to_idx
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        tokens = item['tokens']
        labels = item['labels']
        
        # Convert tokens to indices
        token_indices = [self.word_to_idx.get(token.lower(), self.word_to_idx['<UNK>']) for token in tokens]
        
        # Convert labels to indices
        label_indices = [self.label_to_idx[label] for label in labels]
        
        return {
            'tokens': torch.tensor(token_indices, dtype=torch.long),
            'labels': torch.tensor(label_indices, dtype=torch.long),
            'lengths': len(tokens)
        }

In [8]:
# Collate function for batching
def collate_fn(batch):
    # Sort the batch by length in descending order
    batch = sorted(batch, key=lambda x: x['lengths'], reverse=True)
    
    # Get the length of each sequence
    lengths = [item['lengths'] for item in batch]
    
    # Get the maximum length in the batch
    max_length = max(lengths)
    
    # Pad the sequences
    tokens = torch.zeros(len(batch), max_length, dtype=torch.long)
    labels = torch.zeros(len(batch), max_length, dtype=torch.long)
    
    for i, item in enumerate(batch):
        tokens[i, :item['lengths']] = item['tokens']
        labels[i, :item['lengths']] = item['labels']
    
    return {
        'tokens': tokens,
        'labels': labels,
        'lengths': torch.tensor(lengths, dtype=torch.long)
    }

In [16]:
# Model classes
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings=None):
        super(RNNModel, self).__init__()
        
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, tokens, lengths):
        embedded = self.embedding(tokens)
        
        # Pack the sequences
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
        
        packed_output, hidden = self.rnn(packed_embedded)
        
        # Unpack the sequences
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Pass through the fully connected layer
        output = self.fc(output)
        
        return output

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings=None):
        super(GRUModel, self).__init__()
        
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, tokens, lengths):
        embedded = self.embedding(tokens)
        
        # Pack the sequences
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
        
        packed_output, hidden = self.gru(packed_embedded)
        
        # Unpack the sequences
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Pass through the fully connected layer
        output = self.fc(output)
        
        return output

In [17]:
# Load GloVe embeddings
def load_glove_embeddings(path, word_to_idx, embedding_dim=300):
    embeddings = np.zeros((len(word_to_idx), embedding_dim))
    
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word_to_idx:
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word_to_idx[word]] = vector
    
    return torch.FloatTensor(embeddings)

In [18]:
# Load FastText embeddings
def load_fasttext_embeddings(path, word_to_idx, embedding_dim=300):
    model = KeyedVectors.load_word2vec_format(path, binary=False)
    embeddings = np.zeros((len(word_to_idx), embedding_dim))
    
    for word, idx in word_to_idx.items():
        if word in model:
            embeddings[idx] = model[word]
    
    return torch.FloatTensor(embeddings)

In [20]:
# Evaluation function
def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            tokens = batch['tokens'].to(device)
            labels = batch['labels'].to(device)
            lengths = batch['lengths']
            
            outputs = model(tokens, lengths)
            
            # Reshape outputs and labels for loss calculation
            outputs_flat = outputs.view(-1, outputs.shape[-1])
            labels_flat = labels.view(-1)
            
            # Calculate loss (ignore padding)
            mask = labels_flat != 0  # Assuming 0 is the padding index
            loss = criterion(outputs_flat[mask], labels_flat[mask])
            
            total_loss += loss.item()
            
            # Get predictions
            _, predictions = torch.max(outputs, dim=2)
            
            # Collect predictions and labels (ignoring padding)
            for i in range(len(lengths)):
                length = lengths[i].item()
                pred = predictions[i, :length].cpu().numpy()
                lab = labels[i, :length].cpu().numpy()
                
                all_predictions.extend(pred)
                all_labels.extend(lab)
    
    # Calculate F1 score
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    return total_loss / len(data_loader), f1

In [21]:
# Training function
def train(model, train_loader, val_loader, optimizer, criterion, device, epochs, model_save_path):
    best_f1 = 0.0
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            tokens = batch['tokens'].to(device)
            labels = batch['labels'].to(device)
            lengths = batch['lengths']
            
            optimizer.zero_grad()
            
            outputs = model(tokens, lengths)
            
            # Reshape outputs and labels for loss calculation
            outputs = outputs.view(-1, outputs.shape[-1])
            labels = labels.view(-1)
            
            # Calculate loss (ignore padding)
            mask = labels != 0  # Assuming 0 is the padding index
            loss = criterion(outputs[mask], labels[mask])
            
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        train_loss = epoch_loss / len(train_loader)
        train_losses.append(train_loss)
        
        # Validation
        val_loss, val_f1 = evaluate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')
        
        # Save the best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), model_save_path)
            print(f'Model saved to {model_save_path}')
    
    return train_losses, val_losses

In [22]:
# Convert predictions to BIO format
def convert_to_bio(idx_to_label, predictions, lengths):
    bio_predictions = []
    
    for i, length in enumerate(lengths):
        bio_predictions.append([idx_to_label[pred] for pred in predictions[i, :length]])
    
    return bio_predictions

In [23]:
# Calculate F1 score using conlleval
def calculate_f1_conlleval(tokens, true_labels, pred_labels):
    results = []
    
    for sample_tokens, sample_true, sample_pred in zip(tokens, true_labels, pred_labels):
        for token, true, pred in zip(sample_tokens, sample_true, sample_pred):
            results.append(f"{token} {true} {pred}")
        results.append("")  # Empty line between sentences
    
    prec, rec, f1 = evaluate(results)
    return prec, rec, f1

In [24]:
# Function to plot training and validation losses
def plot_losses(train_losses, val_losses, title, save_path):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()

In [25]:
# Testing function
def test_model(model, test_data, word_to_idx, label_to_idx, idx_to_label, device):
    model.eval()
    
    test_dataset = AspectTermDataset(test_data, word_to_idx, label_to_idx)
    test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
    
    all_predictions = []
    all_tokens = []
    all_true_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            tokens = batch['tokens'].to(device)
            labels = batch['labels'].to(device)
            lengths = batch['lengths']
            
            outputs = model(tokens, lengths)
            
            # Get predictions
            _, predictions = torch.max(outputs, dim=2)
            
            # Convert predictions to BIO format
            bio_predictions = convert_to_bio(idx_to_label, predictions.cpu(), lengths)
            
            # Get true labels
            bio_true = convert_to_bio(idx_to_label, labels.cpu(), lengths)
            
            # Get tokens
            batch_tokens = []
            for i, length in enumerate(lengths):
                batch_tokens.append([test_data[i]['tokens'][j] for j in range(length)])
            
            all_predictions.extend(bio_predictions)
            all_tokens.extend(batch_tokens)
            all_true_labels.extend(bio_true)
    
    # Calculate F1 score using conlleval
    prec, rec, f1 = calculate_f1_conlleval(all_tokens, all_true_labels, all_predictions)
    
    return prec, rec, f1

In [37]:
def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    
    # Preprocess data
    print('Preprocessing data...')
    train_data = preprocess_data('train.json', 'train_task_1.json')
    val_data = preprocess_data('val.json', 'val_task_1.json')
    
    # Build vocabulary
    print('Building vocabulary...')
    word_to_idx = {'<PAD>': 0, '<UNK>': 1}
    label_to_idx = {'<PAD>': 0, 'O': 1, 'B': 2, 'I': 3}
    idx_to_label = {0: '<PAD>', 1: 'O', 2: 'B', 3: 'I'}
    
    for item in train_data:
        for token in item['tokens']:
            if token.lower() not in word_to_idx:
                word_to_idx[token.lower()] = len(word_to_idx)
    
    vocab_size = len(word_to_idx)
    output_dim = len(label_to_idx)
    
    # print("Vocabulary: ", word_to_idx)
    
    print(f'Vocabulary size: {vocab_size}')
    print(f'Number of labels: {output_dim}')
    
    # Load pretrained embeddings    
    try:
        print('Loading pretrained embeddings...')
        embedding_dim = 300
        glove_embeddings = np.zeros((len(word_to_idx), embedding_dim))
        fasttext_embeddings = np.zeros((len(word_to_idx), embedding_dim))
        
        # Load models
        print('Loading models...')
        glove_model = api.load("glove-wiki-gigaword-300")
        print('Glove model loaded')
        fasttext_model = api.load("fasttext-wiki-news-subwords-300")
        print('FastText model loaded')
        
        # Fill embedding matrices
        print('Filling embedding matrices...')
        for word, idx in word_to_idx.items():
            if word in glove_model:
                glove_embeddings[idx] = glove_model[word]
            if word in fasttext_model:
                fasttext_embeddings[idx] = fasttext_model[word]
        
        # Convert to torch tensors
        print('Converting to torch tensors...')
        glove_embeddings = torch.FloatTensor(glove_embeddings)
        fasttext_embeddings = torch.FloatTensor(fasttext_embeddings)
        
    except Exception as e:
        print(f"Error loading from gensim: {e}")
        print("Using random initializations for demonstration.")
        embedding_dim = 300
        glove_embeddings = torch.FloatTensor(np.random.normal(0, 0.01, (vocab_size, embedding_dim)))
        fasttext_embeddings = torch.FloatTensor(np.random.normal(0, 0.01, (vocab_size, embedding_dim)))
    
    # Create datasets and dataloaders
    print('Creating datasets and dataloaders...')
    train_dataset = AspectTermDataset(train_data, word_to_idx, label_to_idx)
    val_dataset = AspectTermDataset(val_data, word_to_idx, label_to_idx)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
    
    # Define models, loss, and optimizer
    embedding_dim = 300
    hidden_dim = 200
    
    # Create the directory for saving models if it doesn't exist
    os.makedirs('models', exist_ok=True)
    
    models = {
        'RNN_GloVe': RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim, glove_embeddings).to(device),
        'RNN_FastText': RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim, fasttext_embeddings).to(device),
        'GRU_GloVe': GRUModel(vocab_size, embedding_dim, hidden_dim, output_dim, glove_embeddings).to(device),
        'GRU_FastText': GRUModel(vocab_size, embedding_dim, hidden_dim, output_dim, fasttext_embeddings).to(device)
    }
    
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
    epochs = 10
    
    results = {}
    
    # Train and evaluate each model
    for name, model in models.items():
        print(f'\nTraining {name}...')
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        model_save_path = f'models/{name}_best.pt'
        
        train_losses, val_losses = train(model, train_loader, val_loader, optimizer, criterion, device, epochs, model_save_path)
        
        # Plot losses
        plot_losses(train_losses, val_losses, f'{name} Training and Validation Loss', f'plots/{name}_loss.png')
        
        # Load the best model
        model.load_state_dict(torch.load(model_save_path))
        
        # Evaluate on validation set
        _, _, val_f1 = test_model(model, val_data, word_to_idx, label_to_idx, idx_to_label, device)
        
        results[name] = {
            'val_f1': val_f1
        }
        
        print(f'{name} validation F1: {val_f1}')
    
    # Print results summary
    print('\nResults Summary:')
    for name, result in results.items():
        print(f'{name}: F1 = {result["val_f1"]:.4f}')
    
    # Find the best model
    best_model_name = max(results, key=lambda x: results[x]['val_f1'])
    print(f'\nBest model: {best_model_name} with F1 = {results[best_model_name]["val_f1"]:.4f}')
    
    # Save best model info
    with open('best_model_info.json', 'w') as f:
        json.dump({
            'model_name': best_model_name,
            'f1_score': results[best_model_name]['val_f1']
        }, f)

In [27]:
# Test function for inference
def load_and_test(test_file, model_path, model_type, word_to_idx_path, label_to_idx_path):
    # Load model configuration
    with open(word_to_idx_path, 'r') as f:
        word_to_idx = json.load(f)
    
    with open(label_to_idx_path, 'r') as f:
        label_to_idx = json.load(f)
    
    idx_to_label = {int(idx): label for label, idx in label_to_idx.items()}
    
    # Load test data
    test_data = preprocess_data(test_file, f'test_task1.json')
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize model
    vocab_size = len(word_to_idx)
    output_dim = len(label_to_idx)
    embedding_dim = 300
    hidden_dim = 200
    
    if model_type.startswith('RNN'):
        model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
    elif model_type.startswith('GRU'):
        model = GRUModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    
    # Load model weights
    model.load_state_dict(torch.load(model_path, map_location=device))
    
    # Test the model
    prec, rec, f1 = test_model(model, test_data, word_to_idx, label_to_idx, idx_to_label, device)
    
    print(f'Test Results for {model_type}:')
    print(f'Precision: {prec:.4f}')
    print(f'Recall: {rec:.4f}')
    print(f'F1 Score: {f1:.4f}')
    
    return {
        'precision': prec,
        'recall': rec,
        'f1_score': f1
    }

In [38]:
if __name__ == "__main__":
    # Create directory for plots
    os.makedirs('plots', exist_ok=True)
    
    # Run main function
    main()

Using device: cpu
Preprocessing data...
Building vocabulary...
Vocabulary size: 5763
Number of labels: 4
Loading pretrained embeddings...
Loading models...
Glove model loaded
FastText model loaded
Filling embedding matrices...
Converting to torch tensors...
Creating datasets and dataloaders...

Training RNN_GloVe...


Epoch 1/10: 100%|██████████| 77/77 [00:32<00:00,  2.36it/s]


Epoch 1/10, Train Loss: 0.2862, Val Loss: 0.1613, Val F1: 0.9399
Model saved to models/RNN_GloVe_best.pt


Epoch 2/10: 100%|██████████| 77/77 [00:23<00:00,  3.29it/s]


Epoch 2/10, Train Loss: 0.1419, Val Loss: 0.1334, Val F1: 0.9478
Model saved to models/RNN_GloVe_best.pt


Epoch 3/10: 100%|██████████| 77/77 [00:16<00:00,  4.79it/s]


Epoch 3/10, Train Loss: 0.0938, Val Loss: 0.1329, Val F1: 0.9474


Epoch 4/10: 100%|██████████| 77/77 [00:21<00:00,  3.59it/s]


Epoch 4/10, Train Loss: 0.0650, Val Loss: 0.1321, Val F1: 0.9553
Model saved to models/RNN_GloVe_best.pt


Epoch 5/10: 100%|██████████| 77/77 [00:22<00:00,  3.37it/s]


Epoch 5/10, Train Loss: 0.0372, Val Loss: 0.1482, Val F1: 0.9544


Epoch 6/10: 100%|██████████| 77/77 [00:17<00:00,  4.30it/s]


Epoch 6/10, Train Loss: 0.0210, Val Loss: 0.1681, Val F1: 0.9501


Epoch 7/10: 100%|██████████| 77/77 [00:17<00:00,  4.49it/s]


Epoch 7/10, Train Loss: 0.0112, Val Loss: 0.1720, Val F1: 0.9496


Epoch 8/10: 100%|██████████| 77/77 [00:19<00:00,  4.02it/s]


Epoch 8/10, Train Loss: 0.0085, Val Loss: 0.1928, Val F1: 0.9516


Epoch 9/10: 100%|██████████| 77/77 [00:16<00:00,  4.53it/s]


Epoch 9/10, Train Loss: 0.0033, Val Loss: 0.1998, Val F1: 0.9515


Epoch 10/10: 100%|██████████| 77/77 [00:16<00:00,  4.62it/s]


Epoch 10/10, Train Loss: 0.0015, Val Loss: 0.2187, Val F1: 0.9499


KeyError: tensor(1)

In [None]:
# Example usage of test function:
results = load_and_test('val.json', 'models/GRU_GloVe_best.pt', 'GRU_GloVe', 'word_to_idx.json', 'label_to_idx.json')