In [None]:
# Install the Python dependencies with %pip so they land in the kernel's own
# environment (a `sudo pip` can target a different interpreter).
# TODO: pin versions once the working set is known.
%pip install -q datasets transformers
# swig is installed before jamspell, as in the original cell; -y keeps apt-get
# from waiting on a confirmation prompt that a notebook cannot answer.
!sudo apt-get install -y swig
%pip install -q jamspell
# Fetch and unpack the English JamSpell model archive. -nc skips the download
# when en.tar.gz already exists instead of saving a second copy as en.tar.gz.1.
!wget -nc https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
!tar -xvf en.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 

In [None]:
# General-purpose and numeric tooling.
import string
import pandas as pd
import numpy as np

# Text preprocessing: regular expressions, NLTK and the JamSpell spell checker.
import re
import nltk
# Download the 'stopwords', 'wordnet' and 'omw-1.4' NLTK resources at import time.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import jamspell
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Modelling: PyTorch and Hugging Face Transformers.
import torch
from torch import nn
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed

# Evaluation helpers and progress reporting.
from sklearn.metrics import classification_report, f1_score
from scipy.special import softmax
from tqdm import tqdm

# Hugging Face Datasets.
import datasets
from datasets import load_dataset

In [None]:
# Load the 'sentiment' configuration of the 'tweet_eval' dataset.
dataset = load_dataset(path='tweet_eval', name='sentiment')

In [None]:
# Spell checker backed by the JamSpell language model stored in 'en.bin'.
corrector = jamspell.TSpellCorrector()
# LoadLangModel signals failure through its return value rather than raising;
# stop here so later corrections never run against an unloaded model.
if not corrector.LoadLangModel('en.bin'):
  raise RuntimeError("JamSpell language model 'en.bin' could not be loaded")
lemmatizer = WordNetLemmatizer()
# Characters to strip from tweets: everything in string.punctuation
# except the comma, question mark, exclamation mark and full stop.
punct = ''.join(ch for ch in string.punctuation if ch not in ',?!.')

def light_clean(x):
  """Lightly normalise one tweet and return the cleaned string.

  Steps, in order: replace URLs with a space, delete @mentions, strip every
  character listed in the module-level ``punct``, drop any '#', and finally
  run the module-level JamSpell ``corrector`` over the result.
  """
  without_urls = re.sub(r"http\S+", ' ', x)
  without_mentions = re.sub(r'@[\w]+', '', without_urls)
  # Deletion table: maps every character of `punct` to nothing.
  punct_table = str.maketrans('', '', punct)
  stripped = without_mentions.translate(punct_table).replace('#', '')
  return corrector.FixFragment(stripped)

In [None]:
# Element-wise wrapper so light_clean can be applied to a whole batch of texts.
clean1_vect = np.vectorize(pyfunc=light_clean)

In [None]:
def _add_clean_text(batch):
  """Build the extra 'text_clean' column for one batch of examples."""
  return {"text_clean": clean1_vect(batch['text'])}

# Apply the cleaning to the training, validation and test sets.
data = dataset.map(_add_clean_text, batched=True)

In [None]:
# Identifier of the pretrained checkpoint used throughout the notebook.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [None]:
# Create the fast tokenizer that matches the pretrained checkpoint.
# NOTE(review): `max_length` here may have been intended as `model_max_length`
# -- confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
    max_length=512,
)

In [None]:
# Load the pretrained sequence-classification model for MODEL.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL,
)

In [None]:
# Tokenize the cleaned text (column 'text_clean') rather than the raw tweets,
# so the cleaning computed earlier actually reaches the model.
# Sequences are truncated to at most 512 tokens.
data = data.map(
    lambda batch: tokenizer(batch['text_clean'], truncation=True, max_length=512),
    batched=True,
)

In [None]:
# Pull the three tokenized splits out of `data`, ready to hand to the model.
train_dataset, val_dataset, test_dataset = (
    data[split] for split in ('train', 'validation', 'test')
)

In [None]:
# Class distribution of the training split -> the data is unbalanced.
train_df = pd.DataFrame(dataset['train'])
label_share = train_df['label'].value_counts(normalize=True)
# Shown ordered by label id rather than by frequency.
label_share.sort_index()

In [None]:
# Weight of each class = 1 - its relative frequency in the training split,
# ordered by label id, so rarer classes receive larger weights.
label_counts = train_df['label'].value_counts().sort_index()
class_weights = (1 - (label_counts / len(train_df))).values
# Convert to a float32 tensor placed on the GPU.
class_weights = torch.from_numpy(class_weights).float().to("cuda")

In [None]:
# Rename the target column of the training split from 'label' to 'labels'.
train_dataset = train_dataset.rename_column(
    original_column_name='label',
    new_column_name='labels',
)

In [None]:
class WeightLossTrainer(Trainer):
  """Trainer whose loss is a class-weighted cross entropy.

  The per-class weights are read from the module-level ``class_weights`` tensor.
  """

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the weighted cross-entropy loss for one batch.

    Args:
      model: the model being trained; it is called as ``model(**inputs)``.
      inputs: batch mapping; its ``labels`` entry is used as the target.
      return_outputs: when true, return ``(loss, outputs)`` instead of the
        loss alone.
      num_items_in_batch: accepted so the override stays callable by Trainer
        versions that pass this keyword to ``compute_loss``; it is not used.

    Returns:
      The loss, or the tuple ``(loss, outputs)`` when ``return_outputs`` is set.
    """
    # Forward pass; the logits are taken from the model output.
    outputs = model(**inputs)
    logits = outputs.get('logits')
    labels = inputs.get('labels')
    # Move the weights next to the logits so the loss does not depend on the
    # device the global tensor happened to be created on.
    loss_func = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_func(logits, labels)
    return (loss, outputs) if return_outputs else loss

In [None]:
# Model instance that will be fine-tuned, configured for 3 labels.
modelFineTuned = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
)

In [None]:
# Interactive Hugging Face Hub login from inside the notebook.
# Presumably required because the training arguments set push_to_hub=True -- confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 64
# Logging interval in steps: the number of full batches in the training split.
logging_steps = len(data['train']) // batch_size

training_args = TrainingArguments(
    output_dir='roberta-sentiment-analysis-finetune',
    # Optimisation.
    learning_rate=1e-5,
    weight_decay=0.001,
    warmup_steps=50,
    num_train_epochs=10,
    fp16=True,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=logging_steps,
    # Publishing.
    push_to_hub=True,
)

In [None]:
# Use the class-weighted trainer so the weights computed for the unbalanced
# training data are applied to the loss. A plain Trainer would ignore both
# WeightLossTrainer and class_weights.
trainer = WeightLossTrainer(
    model=modelFineTuned,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning with the model, data and arguments given to the trainer.
trainer.train()

In [None]:
# Predict on the test split; the result exposes the raw scores and the labels.
prediction_output = trainer.predict(test_dataset)
test_preds_raw = prediction_output.predictions
test_labels = prediction_output.label_ids
# Predicted class = index of the highest score along the last axis.
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))

In [None]:
# Upload the trained model to the Hub. The string is the commit message
# (the first parameter of Trainer.push_to_hub), not the repository name.
trainer.push_to_hub(commit_message="roberta-sentiment-analysis-finetune")