In [None]:
# Install the Hugging Face Transformers and PyTorch packages used by this notebook.
!pip install transformers torch




In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import classification_report


In [None]:
class SpanishDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw texts (each item is coerced with ``str``).
        labels: Indexable collection of class labels aligned with ``texts``.
        tokenizer: Object exposing an ``encode_plus`` method.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its encoded tensors and its label for index ``item``."""
        raw_text = str(self.texts[item])
        raw_label = self.labels[item]

        # Tokenizer options: pad/truncate to max_len and return PyTorch tensors.
        encode_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(raw_text, **encode_kwargs)

        sample = {'text': raw_text}
        # The tokenizer output is flattened to 1-D for each tensor field.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(raw_label, dtype=torch.long)
        return sample


In [None]:
import pandas as pd

# Locations of the train / dev splits on the Colab file system.
train_data_path, dev_data_path = '/content/train_es.tsv', '/content/dev_es.tsv'

# Both splits are parsed as tab-separated files.
train_data, dev_data = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_data_path, dev_data_path)
)


In [None]:

# Tokenizer for the same pretrained Spanish BERT checkpoint that the model is
# loaded from. Without this definition `tokenizer` is undefined on a fresh
# kernel and every cell that uses it fails with a NameError.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

# Wrap the training split: `text` holds the input, `HS` holds the class label.
train_dataset = SpanishDataset(
    texts=train_data.text.to_numpy(),
    labels=train_data.HS.to_numpy(),
    tokenizer=tokenizer,
    max_len=128
)

# Same wrapping for the dev split, with identical tokenizer settings.
dev_dataset = SpanishDataset(
    texts=dev_data.text.to_numpy(),
    labels=dev_data.HS.to_numpy(),
    tokenizer=tokenizer,
    max_len=128
)


In [None]:
# Load the pretrained Spanish BERT checkpoint with a two-label classification head.
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels=2,
)


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Mini-batch size used by the DataLoaders in this notebook.
batch_size = 16

# Training batches are shuffled; the dev split is iterated in its stored order.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
dev_data_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=batch_size,
)


In [None]:
from torch.optim import AdamW

# Optimise every model parameter with AdamW at a learning rate of 2e-5.
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
)


In [None]:
# Number of passes over the training loader, and the resulting number of
# optimisation steps (one per batch per epoch).
epochs = 3
total_steps = epochs * len(train_data_loader)

# Linear learning-rate schedule spanning all training steps, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Move the model onto the selected device.
model = model.to(device)


Using device: cpu


In [None]:
# Fine-tune the classifier on the training split.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Passing `labels` makes the model compute the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Back-propagate, clip the gradient norm for stability, then update
        # the weights and advance the learning-rate schedule.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # max(..., 1) guards against an empty loader.
    avg_loss = total_loss / max(len(train_data_loader), 1)
    print(f"Epoch {epoch + 1}/{epochs} - average training loss: {avg_loss:.4f}")




In [None]:
# Score the dev split: collect raw logits and gold labels batch by batch.
model.eval()
predictions, true_labels = [], []

for batch in dev_data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Inference only, so no gradient tracking is needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Store results as NumPy arrays on the CPU for later metric computation.
    predictions.append(outputs.logits.detach().cpu().numpy())
    true_labels.append(labels.to('cpu').numpy())




In [None]:
# Save the model and the tokenizer into the same directory.
# NOTE(review): 'path/to/save/model' looks like a placeholder path — confirm the intended location.
model.save_pretrained('path/to/save/model')
tokenizer.save_pretrained('path/to/save/model')


('path/to/save/model/tokenizer_config.json',
 'path/to/save/model/special_tokens_map.json',
 'path/to/save/model/vocab.txt',
 'path/to/save/model/added_tokens.json')

In [None]:
# Put the model in evaluation mode and reset the per-batch result buffers.
model.eval()
predictions = []
true_labels = []

for batch in dev_data_loader:
    # Transfer the tensors this step needs onto the compute device.
    input_ids, attention_mask, labels = (
        batch[field].to(device)
        for field in ('input_ids', 'attention_mask', 'labels')
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Keep logits and gold labels as CPU NumPy arrays, one entry per batch.
    predictions.append(outputs.logits.detach().cpu().numpy())
    true_labels.append(labels.to('cpu').numpy())


In [None]:
from sklearn.metrics import classification_report

# Join the per-batch logits and take the highest-scoring class per example.
flat_predictions = np.concatenate(predictions, axis=0).argmax(axis=1).flatten()

# Gold labels joined across batches, in the same order as the predictions.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Per-class precision / recall / F1 summary.
report = classification_report(
    flat_true_labels,
    flat_predictions,
    target_names=['Class 0', 'Class 1'],
)

print(report)


              precision    recall  f1-score   support

     Class 0       1.00      0.00      0.01       278
     Class 1       0.44      1.00      0.62       222

    accuracy                           0.45       500
   macro avg       0.72      0.50      0.31       500
weighted avg       0.75      0.45      0.28       500



In [None]:
!pip install nltk
import nltk
from nltk.corpus import wordnet
import random

# WordNet plus the Open Multilingual Wordnet; the latter supplies the Spanish
# lemmas that get_synonyms() looks up by default.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')


def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct words of ``sentence`` with a random synonym.

    Only purely alphabetic tokens are candidates, and every occurrence of a
    chosen word is replaced by the same synonym. Words for which
    ``get_synonyms`` finds nothing are skipped.

    Args:
        sentence: Text to augment; it is tokenised on whitespace.
        n: Maximum number of distinct words to replace. ``n <= 0`` performs
            no replacement.

    Returns:
        The augmented sentence with tokens joined by single spaces.
    """
    words = sentence.split()
    new_words = words.copy()
    if n <= 0:
        # Previously one word was still replaced before the limit was checked.
        return ' '.join(new_words)

    # sorted() removes the dependence on set hash ordering, so results are
    # reproducible once `random` is seeded.
    candidates = sorted({word for word in words if word.isalpha()})
    random.shuffle(candidates)
    num_replaced = 0

    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)


def get_synonyms(word, lang='spa'):
    """Return the WordNet synonyms of ``word`` in language ``lang``.

    The notebook augments Spanish text, so lookups default to the Spanish
    lemmas of the Open Multilingual Wordnet rather than the English WordNet.

    Args:
        word: Word to look up (matched case-insensitively).
        lang: Language code understood by NLTK's WordNet reader.

    Returns:
        A sorted list of distinct lower-cased synonyms, excluding ``word``
        itself. Multi-word lemmas keep their inner spaces.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(target, lang=lang):
        for lemma_name in syn.lemma_names(lang):
            candidate = lemma_name.replace("_", " ").replace("-", " ").lower()
            # Keep letters and spaces only; the old filter also dropped the
            # spaces and glued multi-word lemmas into a single token.
            candidate = "".join(ch for ch in candidate if ch.isalpha() or ch == " ")
            candidate = " ".join(candidate.split())
            if candidate and candidate != target:
                synonyms.add(candidate)
    return sorted(synonyms)




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
def random_swap(sentence, n=2):
    """Swap randomly chosen pairs of words in ``sentence``.

    Args:
        sentence: Text to augment; it is tokenised on whitespace.
        n: Requested number of swaps. The actual count is capped at half the
            number of words (integer division).

    Returns:
        The words, after swapping, joined by single spaces.
    """
    tokens = sentence.split()
    shuffled = list(tokens)
    swap_budget = min(n, len(tokens) // 2)

    for _ in range(swap_budget):
        # Draw two distinct positions and exchange the words found there.
        first, second = random.sample(range(len(shuffled)), 2)
        shuffled[first], shuffled[second] = shuffled[second], shuffled[first]

    return ' '.join(shuffled)



In [None]:
import random

def random_deletion(sentence, p=0.5):
    """Randomly drop words from ``sentence``.

    Each word is kept when a uniform draw from [0, 1) exceeds ``p``. If every
    word would be dropped, one randomly chosen original word is returned
    instead.

    Args:
        sentence: Text to augment; it is tokenised on whitespace.
        p: Threshold the per-word draw must exceed for the word to be kept.

    Returns:
        The surviving words joined by single spaces. Sentences with zero or
        one word are returned unchanged.
    """
    words = sentence.split()
    # Nothing can be deleted from an empty or single-word sentence. The empty
    # case previously reached random.choice([]) and raised IndexError.
    if len(words) <= 1:
        return sentence

    kept = [word for word in words if random.uniform(0, 1) > p]
    if not kept:
        return random.choice(words)
    return ' '.join(kept)



In [None]:
# Build the augmented training set: each original example is followed by three
# perturbed copies that inherit its label.
augmented_sentences, augmented_labels = [], []

for text, label in zip(train_data.text, train_data.HS):
    # The list is evaluated left to right: original first, then synonym
    # replacement, random deletion and random swap.
    variants = [
        text,
        synonym_replacement(text, n=1),
        random_deletion(text, p=0.5),
        random_swap(text, n=2),
    ]
    augmented_sentences.extend(variants)
    augmented_labels.extend([label] * len(variants))

# Collect the augmented examples in a DataFrame with the same column names as train_data.
augmented_data = pd.DataFrame({'text': augmented_sentences, 'HS': augmented_labels})


In [None]:

# Wrap the augmented frame with the same tokenizer and sequence length as the
# other datasets.
augmented_dataset = SpanishDataset(
    texts=augmented_data['text'].to_numpy(),
    labels=augmented_data['HS'].to_numpy(),
    tokenizer=tokenizer,
    max_len=128,
)

# Shuffled loader over the augmented examples.
augmented_data_loader = DataLoader(
    dataset=augmented_dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [None]:
 # The scheduler defined earlier is sized for `train_data_loader`, so build an
 # optimizer / scheduler pair sized for the augmented loader instead.
 # NOTE(review): this keeps fine-tuning the same `model` object; reload the
 # pretrained checkpoint first if an independent comparison is intended.
 optimizer = AdamW(model.parameters(), lr=2e-5)
 scheduler = get_linear_schedule_with_warmup(
     optimizer,
     num_warmup_steps=0,
     num_training_steps=len(augmented_data_loader) * epochs,
 )

 for epoch in range(epochs):
     model.train()
     total_loss = 0
     for batch in augmented_data_loader:

         input_ids = batch['input_ids'].to(device)
         attention_mask = batch['attention_mask'].to(device)
         labels = batch['labels'].to(device)

         # Clear gradients accumulated by the previous step.
         optimizer.zero_grad()

         # Passing `labels` makes the model compute the classification loss.
         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
         loss = outputs.loss
         total_loss += loss.item()

         # Back-propagate, clip the gradient norm for stability, then update
         # the weights and advance the learning-rate schedule.
         loss.backward()
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
         optimizer.step()
         scheduler.step()

     # max(..., 1) guards against an empty loader.
     avg_loss = total_loss / max(len(augmented_data_loader), 1)
     print(f"Epoch {epoch + 1}/{epochs} - average training loss (augmented): {avg_loss:.4f}")




In [None]:
# Switch the model to evaluation mode before scoring the dev set.
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Per-batch buffers for the dev-set logits and gold labels.
validation_predictions = []
validation_true_labels = []

for batch in dev_data_loader:
    # Move batch to device
    input_ids, attention_mask, labels = (
        batch[field].to(device)
        for field in ('input_ids', 'attention_mask', 'labels')
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Keep logits and gold labels as CPU NumPy arrays, one entry per batch.
    validation_predictions.append(outputs.logits.detach().cpu().numpy())
    validation_true_labels.append(labels.to('cpu').numpy())


In [None]:
import numpy as np

# Join the per-batch logits and take the highest-scoring class per example.
flat_predictions = np.concatenate(validation_predictions, axis=0).argmax(axis=1).flatten()

# Gold labels joined across batches, in the same order as the predictions.
flat_true_labels = np.concatenate(validation_true_labels, axis=0)


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary for the dev set.
report = classification_report(
    flat_true_labels,
    flat_predictions,
    target_names=['Class 0', 'Class 1'],  # Adjust target_names as per your labels
)
print(report)


              precision    recall  f1-score   support

     Class 0       1.00      0.00      0.01       278
     Class 1       0.44      1.00      0.62       222

    accuracy                           0.45       500
   macro avg       0.72      0.50      0.31       500
weighted avg       0.75      0.45      0.28       500

