In [1]:
# Tools 
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt
import re 

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# Pytorch 
import torch 
# from bitnet import BitLinear # Binary layer 
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer

# Scikit learn
from sklearn.metrics import accuracy_score


In [2]:
# Set seeds so that repeated runs draw the same random numbers.
seed = 1111
random.seed(seed)  # Python's built-in RNG
np.random.seed(seed)  # NumPy's global RNG
torch.manual_seed(seed)  # PyTorch RNG
# cuDNN reproducibility needs both flags: turning the autotuner off fixes
# which algorithm is selected, and `deterministic` asks cuDNN to use
# deterministic implementations of that algorithm.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
# Configuration
# Pre-trained tokenizer; its vocabulary size fixes the embedding table size below.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

# All model / data hyper-parameters live in a single namespace.
args = Namespace(
    emb_size=50,
    num_layers=2,
    n_heads=2,
    head_size=25,  # n_heads * head_size == emb_size with these values
    vocab_size=tokenizer.vocab_size,
    max_seq_len=100,
    device='cpu',
    batch_size=16,
)
print(args)

Namespace(emb_size=50, num_layers=2, n_heads=2, head_size=25, vocab_size=31002, max_seq_len=100, device='cpu', batch_size=16)


### Data 

In [4]:
def _read_single_column(path):
    """Load `path` with pandas as a header-less table.

    The separator is the CR+LF pattern handled by the python engine;
    presumably it never matches inside a line, so every line ends up in
    column 0 (TODO confirm against the data files).
    """
    return pd.read_csv(path, sep='\r\n', engine='python', header=None)


# Tweets: column 0 of each file as a plain Python list.
X_train = _read_single_column('./data_mex20/mex20_train.txt').loc[:, 0].values.tolist()
X_val = _read_single_column('./data_mex20/mex20_val.txt').loc[:, 0].values.tolist()
# Labels: flattened to 1-D NumPy arrays.
y_train = np.array(_read_single_column('./data_mex20/mex20_train_labels.txt')).reshape(-1)
y_val = np.array(_read_single_column('./data_mex20/mex20_val_labels.txt')).reshape(-1)

# Preprocess data 
def preprocess_tweet(tweet):
    """Normalize one tweet into lowercase, space-separated word tokens.

    Steps, in order: drop URLs (anything starting with ``http``), drop
    ``@mentions``, drop ``#hashtags``, lowercase, turn every non-word
    character into a space, then collapse whitespace runs and strip the ends.

    Args:
        tweet: the raw tweet text (a ``str``).

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    # These three are deleted outright, including the text glued to them.
    for pattern in (r'http\S+', r'@\S+', r'#\S+'):
        tweet = re.sub(pattern, '', tweet)

    lowered = tweet.lower()
    # Punctuation and symbols become spaces so neighbouring words stay separated.
    spaced = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced).strip()

# Clean every tweet, writing the results back into the existing list objects.
X_train[:] = [preprocess_tweet(raw_tweet) for raw_tweet in X_train]
X_val[:] = [preprocess_tweet(raw_tweet) for raw_tweet in X_val]
    
# print(X_train)
# print(X_val)

In [66]:
class TextDataset(Dataset):
    """Texts tokenized once up front, paired with their labels.

    Args:
        texts: the texts handed to ``tokenizer`` in a single call.
        labels: an indexable collection with one label per text.
        tokenizer: callable invoked as ``tokenizer(texts, **options)``; its
            result must expose ``.items()`` and an ``'input_ids'`` entry.
        max_length: length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        tokenizer_options = dict(
            add_special_tokens=True,     # add the special tokens (e.g. '[CLS]' and '[SEP]')
            return_tensors='pt',         # return PyTorch tensors
            truncation=True,             # cut texts longer than max_length
            max_length=max_length,       # maximum input length
            padding='max_length',        # pad shorter texts up to max_length
            return_attention_mask=True,  # also return the attention mask
        )
        self.encodings = tokenizer(texts, **tokenizer_options)
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with row ``idx`` of every encoding field plus ``'labels'``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx]
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` encoding."""
        input_ids = self.encodings['input_ids']
        return len(input_ids)

# Create datasets: both splits share the tokenizer and the maximum sequence length.
train_dataset = TextDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_length=args.max_seq_len)
val_dataset = TextDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_length=args.max_seq_len)

# Create dataloaders: only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False)


### Model 

In [85]:
# Transformer model 

class Attention(nn.Module):
    """One head of scaled dot-product self-attention.

    Args:
        emb_size: size of the last dimension of the input.
        head_size: size of the key / query / value projections and of the output.
        max_seq_len: accepted for interface compatibility; not used by this head.
    """

    def __init__(self, emb_size, head_size, max_seq_len):
        super().__init__()
        self.key = nn.Linear(emb_size, head_size, bias=False)
        self.query = nn.Linear(emb_size, head_size, bias=False)
        self.value = nn.Linear(emb_size, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, mask=None):
        """Attend over the time dimension of ``x``.

        Args:
            x: input of shape (B, T, emb_size).
            mask: optional mask; entries equal to 0 are excluded from attention.
                Accepted layouts:
                  * (B, T)  - per-sequence key mask (e.g. a tokenizer
                    ``attention_mask``); expanded to (B, 1, T) so the masked
                    keys are hidden from every query position.
                  * (T, T)  - query-by-key pattern shared by the whole batch.
                  * anything broadcastable to (B, T, T), used as is.
                A 2-D mask whose shape is exactly (T, T) is always read as the
                shared pattern; when B == T, pass a key mask as (B, 1, T).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        T = x.shape[1]

        # keys, queries, values
        k = self.key(x)  # (B, T, hs)
        q = self.query(x)  # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # scaled scores: (B, T, hs) @ (B, hs, T) = (B, T, T)
        scores = torch.matmul(q, k.transpose(-2, -1)) * (k.shape[-1] ** -0.5)

        if mask is not None:
            # A (B, T) key mask cannot broadcast against (B, T, T) scores;
            # insert the query axis so it applies to every query position.
            if mask.dim() == 2 and tuple(mask.shape) != (T, T):
                mask = mask.unsqueeze(1)  # (B, 1, T)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # normalize over the key axis, then regularize the attention weights
        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        # weighted sum of values: (B, T, T) @ (B, T, hs) = (B, T, hs)
        return torch.matmul(attn, v)

class MultiHeadAttention(nn.Module):
    """Several attention heads run on the same input, then projected back.

    Args:
        n_heads: number of ``Attention`` heads.
        head_size: output size of each head.
        emb_size: input size, and output size of the final projection.
        max_seq_len: forwarded unchanged to every head.
    """

    def __init__(self, n_heads, head_size, emb_size, max_seq_len):
        super().__init__()
        head_modules = []
        for _ in range(n_heads):
            head_modules.append(Attention(emb_size, head_size, max_seq_len))
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back to the embedding size.
        self.proy = nn.Linear(n_heads * head_size, emb_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, mask=None):
        """Run every head on ``x`` (with the same ``mask``), concatenate on the
        last dimension, project, and apply dropout."""
        per_head = [head(x, mask) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        projected = self.proy(concatenated)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Two-layer MLP applied on the last dimension.

    Linear (emb_size -> 4 * emb_size), ReLU, Linear (4 * emb_size -> emb_size),
    then dropout with p = 0.2.
    """

    def __init__(self, emb_size):
        super().__init__()
        hidden_size = 4 * emb_size
        stages = [
            nn.Linear(emb_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, emb_size),
            nn.Dropout(0.2),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return ``x`` passed through the MLP; the shape is unchanged."""
        out = self.net(x)
        return out

class block(nn.Module):
    """Transformer block: self-attention and feed-forward sub-layers.

    Each sub-layer sees a layer-normalized copy of its input and its output
    is added back to the un-normalized input (residual connection).
    """

    def __init__(self, emb_size, n_heads, head_size, max_seq_len):
        super().__init__()
        self.mha = MultiHeadAttention(n_heads, head_size, emb_size, max_seq_len)
        self.ln1 = nn.LayerNorm(emb_size)
        self.ln2 = nn.LayerNorm(emb_size)
        self.ff = FeedForward(emb_size)

    def forward(self, x, mask=None):
        """Apply attention (with the optional ``mask``) and then the
        feed-forward network, each wrapped in a residual connection."""
        attended = self.mha(self.ln1(x), mask)
        after_attention = x + attended

        transformed = self.ff(self.ln2(after_attention))
        return after_attention + transformed

class Transformer(nn.Module):
    """Token + position embeddings, a stack of transformer blocks, a final
    layer norm and a linear head that emits one logit per position.

    Args:
        args: namespace providing ``vocab_size``, ``emb_size``, ``max_seq_len``,
            ``n_heads``, ``head_size`` and ``num_layers``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.emb = nn.Embedding(args.vocab_size, args.emb_size)
        self.pos = nn.Embedding(args.max_seq_len, args.emb_size)
        self.ln_f = nn.LayerNorm(args.emb_size)  # final layer norm
        self.lm_head = nn.Linear(args.emb_size, 1)  # final dense layer
        self.layers = nn.Sequential(*[
            block(args.emb_size, args.n_heads, args.head_size, args.max_seq_len)
            for _ in range(args.num_layers)
        ])

        # initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear / Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, mask=None):
        """Compute one logit per input position.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed the size
                of the position embedding table.
            mask: optional attention mask handed to every layer; entries equal
                to 0 are meant to be ignored. A 2-D mask with the same shape as
                ``idx`` (e.g. a tokenizer ``attention_mask``) is treated as a
                per-sequence key mask and expanded to (B, 1, T) so that it
                broadcasts against (B, T, T) attention scores. Any other mask
                is passed through unchanged.

        Returns:
            Tensor of shape (B, T, 1).
        """
        B, T = idx.shape  # idx is a chunk of sequences (B, T)

        # Build positions on the input's device so the model keeps working
        # after being moved, regardless of what args.device says.
        positions = torch.arange(T, device=idx.device)
        x = self.emb(idx) + self.pos(positions)

        if mask is not None and mask.dim() == 2 and tuple(mask.shape) == (B, T):
            mask = mask.unsqueeze(1)  # (B, 1, T)

        # nn.Sequential.forward accepts a single input, so the mask has to be
        # threaded through the layers explicitly.
        for layer in self.layers:
            x = layer(x, mask)

        x = self.ln_f(x)
        logit = self.lm_head(x)

        return logit
