In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import sys
sys.path.append('../')

from utils.metrics import RestMexMetrics
from utils.config import setConfig

# Project-level setup: setConfig() comes from utils.config and its return value
# is kept as `device` (presumably a torch device — TODO confirm).
# NOTE(review): `device` is not referenced again in the visible cells, so the
# models below appear to run wherever from_pretrained places them.
device = setConfig()
# Helper object from utils.metrics used later to turn per-class F1 into a score.
metrics = RestMexMetrics()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the original and the augmented training sets and stack them into a
# single frame with a fresh 0..n-1 index.
df = pd.read_csv(r'../data/train/train.csv')
audf = pd.read_csv(r'../data/augmented/train.csv')

# Cast every text column to str and the target to int in one pass.
# NOTE(review): astype(str) turns missing values into the literal text "nan".
data = pd.concat([df, audf], ignore_index=True).astype({
    'Title': str,
    'Review': str,
    'Town': str,
    'Region': str,
    'Type': str,
    'Polarity': int,
})

In [9]:

# Hold out 15% of the combined data for evaluation; the fixed seed keeps the
# split reproducible across runs.
train, test = train_test_split(data, test_size=0.15, random_state=42)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Model input: title and review wrapped in marker tokens and joined into one
# string per row.
# NOTE(review): the closing markers repeat the opening tag (no "</...>");
# presumably this matches the format used at fine-tuning time — confirm.
titles, reviews = test['Title'], test['Review']
X_test = '<title>' + titles + '<title> <review>' + reviews + '<review>'
y_test = test['Polarity']

Train shape: (176843, 6)
Test shape: (31208, 6)


## Usando Tabularisai (modelo base, sin fine-tuning)

In [19]:
def predict_sentiment(text, model, tokenizer):
    """Predict the 1-5 polarity class for one text or a batch of texts.

    Args:
        text: A single string, or an iterable of strings to score as one
            padded batch.
        model: Sequence-classification model; called with the tokenizer
            output as keyword arguments and expected to return an object
            with a ``logits`` tensor.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors (called with ``return_tensors="pt"``, truncation and
            padding enabled, ``max_length=512``).

    Returns:
        For a string input, an ``int`` in 1..num_classes (class index + 1;
        1 = very negative ... 5 = very positive for the 5-class models used
        here). For an iterable input, a list with one such ``int`` per text
        (an empty iterable yields ``[]``).
    """
    single = isinstance(text, str)
    if not single:
        text = list(text)
        if not text:
            # Nothing to score; tokenizers generally reject an empty batch.
            return []

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Keep the inputs on the same device as the model so the call also works
    # when the model has been moved to GPU/MPS. Objects without a `device`
    # attribute are called with the tensors as-is.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # softmax is monotonic, so the argmax of the raw logits is the same class.
    labels = [int(idx) + 1 for idx in torch.argmax(outputs.logits, dim=-1).tolist()]
    return labels[0] if single else labels

In [20]:
# Baseline: the off-the-shelf multilingual sentiment checkpoint, with the
# tokenizer and the classification head loaded from the same identifier.
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

In [None]:
# Score every held-out text one at a time with the currently loaded model;
# tqdm reports progress over the list of inputs.
y_test_pred = [
    predict_sentiment(text, model, tokenizer)
    for text in tqdm(X_test.tolist())
]

100%|██████████| 31208/31208 [20:25<00:00, 25.46it/s]


In [8]:
# Per-class evaluation of the predictions against the held-out labels.
# classification_report orders its rows by sorted label value and expects
# target_names in that same order. Series.unique() returns first-appearance
# order, so the class list is sorted explicitly; otherwise each F1 score ends
# up filed under the wrong polarity class.
polarity_labels = sorted(int(label) for label in y_test.unique())
report = classification_report(
    y_test,
    y_test_pred,
    labels=polarity_labels,
    target_names=polarity_labels,  # integer names keep the F1 dict keyed by int class
    output_dict=True,
)
report = pd.DataFrame(report)
# Keep only the per-class columns (drops accuracy / macro avg / weighted avg).
f1 = report[polarity_labels].loc['f1-score'].to_dict()

ResP_k = metrics.TypeScore(f1)

ResP_k

0.43071790705049995

## Ahora usando el modelo con fine-tuning

In [14]:
# Sanity check: list the files of a fine-tuned checkpoint directory.
# NOTE(review): this lists `tabularisai_ft`, while the model-loading cell
# below reads from `tabularisai_distilbert` — confirm which one is intended.
!ls /Users/roicort/GitHub/REST-MEX25/models/tabularisai_ft/

config.json       model.safetensors training_args.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Swap in the fine-tuned checkpoint. AutoModelForSequenceClassification is
# already imported in the first cell, and `tokenizer` from the base-model cell
# is reused as-is.
# The checkpoint is addressed relative to the notebook, like the ../data reads,
# instead of through a machine-specific absolute path.
# NOTE(review): assumes `models/` sits next to `data/` at the repository root.
path = '../models/tabularisai_distilbert/'
model = AutoModelForSequenceClassification.from_pretrained(path)

In [23]:
# Re-run inference over the same held-out texts, now with the fine-tuned
# model bound to `model`; the previous predictions are replaced.
y_test_pred = [
    predict_sentiment(text, model, tokenizer)
    for text in tqdm(X_test.tolist())
]

100%|██████████| 31208/31208 [29:56<00:00, 17.37it/s]


In [24]:
# Per-class evaluation of the fine-tuned model's predictions.
# classification_report orders its rows by sorted label value and expects
# target_names in that same order. Series.unique() returns first-appearance
# order, so the class list is sorted explicitly; otherwise each F1 score ends
# up filed under the wrong polarity class.
polarity_labels = sorted(int(label) for label in y_test.unique())
report = classification_report(
    y_test,
    y_test_pred,
    labels=polarity_labels,
    target_names=polarity_labels,  # integer names keep the F1 dict keyed by int class
    output_dict=True,
)
report = pd.DataFrame(report)
# Keep only the per-class columns (drops accuracy / macro avg / weighted avg).
f1 = report[polarity_labels].loc['f1-score'].to_dict()

ResP_k = metrics.TypeScore(f1)

ResP_k

0.5881071968321517

In [None]:
# Component scores fed into the combined score. ResP_k repeats the value
# displayed by the fine-tuned evaluation cell.
# NOTE(review): ResMT_k and ResT_k are hard-coded here; presumably they come
# from other experiments — confirm their source.
ResMT_k, ResT_k, ResP_k = 0.8472, 0.9563, 0.5881071968321517

Sentiment_k = RestMexMetrics.RestMexScore(
    ResP_k,
    ResT_k,
    ResMT_k,
)
print(f"Sentiment(k): {Sentiment_k:.4f}")

Sentiment(k): 0.7790
