In [1]:
import os
import time
import shutil
import random
import re
from typing import Tuple
from argparse import Namespace

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Set seeds
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True  # Ensure reproducibility


In [12]:
# Configuration 
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
args = Namespace(
    emb_size=200,
    num_layers=5,
    hidden_size=50,
    vocab_size=tokenizer.vocab_size,
    max_seq_len=30,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    batch_size=16,
    lr=3e-3,
    num_epochs=100,
    patience=10,
    lr_patience=10,
    lr_factor=0.5,
    savedir='model_rnn'
)
os.makedirs(args.savedir, exist_ok=True)

### Data 

In [3]:
# Load and preprocess data
def load_data(file_path):
    try:
        return pd.read_csv(file_path, sep='\r\n', engine='python', header=None).loc[:, 0].values.tolist()
    except Exception as e:
        print(f"Error loading data: {e}")
        return []

def preprocess_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'@\S+', '', tweet)
    tweet = re.sub(r'#\S+', '', tweet)
    tweet = tweet.lower()
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

X_train = [preprocess_tweet(tweet) for tweet in load_data('./data_mex20/mex20_train.txt')]
X_val = [preprocess_tweet(tweet) for tweet in load_data('./data_mex20/mex20_val.txt')]
y_train = np.array(load_data('./data_mex20/mex20_train_labels.txt')).reshape(-1)
y_val = np.array(load_data('./data_mex20/mex20_val_labels.txt')).reshape(-1)


In [4]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.encodings = tokenizer(texts, add_special_tokens=True, return_tensors='pt',
                                   truncation=True, max_length=max_length, padding='max_length',
                                   return_attention_mask=True)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


# Create datasets    
train_dataset = TextDataset(X_train, y_train, tokenizer, args.max_seq_len)
val_dataset = TextDataset(X_val, y_val, tokenizer, args.max_seq_len)

# Create dataloaders 
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=args.batch_size)


### Model 

In [10]:
class SimpleRNNLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNNLayer, self).__init__()
        self.hidden_size = hidden_size
        
        # Linear layers for input and hidden state
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, hidden_size)
        
        # Activation function
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.tanh(self.i2h(combined))
        output = self.h2o(hidden)
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)

class TextClassifier(nn.Module):
    def __init__(self, args):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(args.vocab_size, args.emb_size)
        self.rnn = SimpleRNNLayer(args.emb_size, args.hidden_size)
        self.fc = nn.Linear(args.hidden_size, 1)
        
    def forward(self, x):
        embedded = self.embedding(x)
        batch_size, seq_len, _ = embedded.size()
        
        hidden = self.rnn.init_hidden(batch_size)
        for i in range(seq_len):
            output, hidden = self.rnn(embedded[:, i, :], hidden)
        
        output = self.fc(hidden)
        return output

In [13]:
def get_preds(raw_logit):
    return torch.sigmoid(raw_logit)

def model_eval(model, data, device, loss_fn):
    model.eval()
    val_loss, correct = 0, 0
    with torch.no_grad():
        for item in data:
            ids, labels = item['input_ids'].to(device), item['labels'].to(device)
            outputs = get_preds(model(ids))
            loss = loss_fn(outputs.view(-1), labels)
            val_loss += loss.item()
            preds = (outputs.view(-1) > 0.5).float()
            correct += (preds == labels).sum().item()
    val_loss /= len(data.dataset)
    accuracy = correct / len(data.dataset)
    return val_loss, accuracy

def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pt"):
    filename = os.path.join(checkpoint_path, filename)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(checkpoint_path, "model_best.pt"))

model = TextClassifier(args).to(args.device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience=args.lr_patience, factor=args.lr_factor)

start_time = time.time()
best_metric = 0
n_no_improve = 0
train_loss_history, train_metric_history = [], []
val_loss_history, val_metric_history = [], []

for epoch in range(args.num_epochs):
    model.train()
    train_loss_epoch, correct = 0, 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{args.num_epochs}")
    for item in loop:
        ids, labels = item['input_ids'].to(args.device), item['labels'].to(args.device)
        optimizer.zero_grad()
        outputs = get_preds(model(ids))
        loss = criterion(outputs.view(-1), labels)
        loss.backward()
        optimizer.step()
        train_loss_epoch += loss.item()
        preds = (outputs.view(-1) > 0.5).float()
        correct += (preds == labels).sum().item()
        loop.set_postfix(train_loss=train_loss_epoch/len(train_loader), train_accuracy=correct/len(train_loader.dataset))

    train_loss = train_loss_epoch / len(train_loader)
    train_accuracy = correct / len(train_loader.dataset)
    train_loss_history.append(train_loss)
    train_metric_history.append(train_accuracy)

    val_loss, val_accuracy = model_eval(model, val_loader, args.device, criterion)
    val_loss_history.append(val_loss)
    val_metric_history.append(val_accuracy)

    scheduler.step(val_loss)

    is_improvement = val_accuracy > best_metric
    if is_improvement:
        best_metric = val_accuracy
        n_no_improve = 0
    else:
        n_no_improve += 1

    save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 
                     'scheduler': scheduler.state_dict(), 'best_metric': best_metric}, is_improvement, args.savedir)

    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop.")
        break

    print(f'Epoch [{epoch+1}/{args.num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Time: {time.time() - start_time:.2f}s')

print(f"Total Training Time: {time.time() - start_time:.2f} seconds")


Epoch 1/100: 100%|██████████| 330/330 [00:23<00:00, 14.07it/s, train_accuracy=0.709, train_loss=0.604]   


Epoch [1/100], Train Loss: 0.6040, Train Acc: 0.7088, Val Loss: 0.0380, Val Acc: 0.7121, Time: 24.12s


Epoch 2/100: 100%|██████████| 330/330 [00:25<00:00, 13.14it/s, train_accuracy=0.716, train_loss=0.597]  


Epoch [2/100], Train Loss: 0.5967, Train Acc: 0.7156, Val Loss: 0.0381, Val Acc: 0.7019, Time: 49.67s


Epoch 3/100: 100%|██████████| 330/330 [00:25<00:00, 13.07it/s, train_accuracy=0.73, train_loss=0.581]   


Epoch [3/100], Train Loss: 0.5812, Train Acc: 0.7302, Val Loss: 0.0384, Val Acc: 0.7053, Time: 75.34s


Epoch 4/100: 100%|██████████| 330/330 [00:25<00:00, 13.15it/s, train_accuracy=0.746, train_loss=0.561]  


Epoch [4/100], Train Loss: 0.5607, Train Acc: 0.7463, Val Loss: 0.0392, Val Acc: 0.7019, Time: 100.88s


Epoch 5/100: 100%|██████████| 330/330 [00:25<00:00, 12.88it/s, train_accuracy=0.754, train_loss=0.544]  


Epoch [5/100], Train Loss: 0.5442, Train Acc: 0.7545, Val Loss: 0.0421, Val Acc: 0.7002, Time: 126.95s


Epoch 6/100: 100%|██████████| 330/330 [00:25<00:00, 13.14it/s, train_accuracy=0.759, train_loss=0.529]  


Epoch [6/100], Train Loss: 0.5289, Train Acc: 0.7592, Val Loss: 0.0436, Val Acc: 0.6985, Time: 152.49s


Epoch 7/100: 100%|██████████| 330/330 [00:24<00:00, 13.49it/s, train_accuracy=0.766, train_loss=0.52]   


Epoch [7/100], Train Loss: 0.5197, Train Acc: 0.7662, Val Loss: 0.0449, Val Acc: 0.6968, Time: 177.42s


Epoch 8/100: 100%|██████████| 330/330 [00:24<00:00, 13.41it/s, train_accuracy=0.769, train_loss=0.506]  


Epoch [8/100], Train Loss: 0.5061, Train Acc: 0.7694, Val Loss: 0.0458, Val Acc: 0.7002, Time: 202.47s


Epoch 9/100: 100%|██████████| 330/330 [00:37<00:00,  8.89it/s, train_accuracy=0.772, train_loss=0.499]  


Epoch [9/100], Train Loss: 0.4993, Train Acc: 0.7725, Val Loss: 0.0480, Val Acc: 0.7002, Time: 240.07s


Epoch 10/100: 100%|██████████| 330/330 [00:26<00:00, 12.44it/s, train_accuracy=0.775, train_loss=0.49]   


Epoch [10/100], Train Loss: 0.4904, Train Acc: 0.7749, Val Loss: 0.0514, Val Acc: 0.6899, Time: 267.08s


Epoch 11/100: 100%|██████████| 330/330 [00:26<00:00, 12.64it/s, train_accuracy=0.772, train_loss=0.501]  


No improvement. Breaking out of loop.
Total Training Time: 293.62 seconds
