In [1]:
# Transformers installation
! pip install transformers datasets



In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import numpy as np
from collections import Counter
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Load the full dataset; later cells read its "text" and "label" columns.
DATASET_CSV = 'dataset_final.csv'
data = pd.read_csv(DATASET_CSV)

In [None]:
# Define the tokenizer and the model (both loaded from the same checkpoint).
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "finiteautomata/beto-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # three-way classification head
)

In [None]:
# ----- 1. Data preprocessing -----
from sklearn.model_selection import train_test_split

# Plain Python lists of raw texts and their labels.
X = data["text"].tolist()
y = data["label"].tolist()

print("Shape of (X, y) (", len(X), ',', len(y), ")")

# First split: 80% train, 20% held out.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.8, random_state=42
)
# Second split: the held-out part is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=24
)

print("Shape of (X_train, y_train) (", len(X_train), ',', len(y_train), ")")
print("Shape of (X_val, y_val) (", len(X_val), ',', len(y_val), ")")
print("Shape of (X_test, y_test) (", len(X_test), ',', len(y_test), ")")

# Every split is tokenized in its own call, with the same options.
tokenize_kwargs = {"padding": True, "truncation": True, "max_length": 512}
X_train_tokenized = tokenizer(X_train, **tokenize_kwargs)
X_val_tokenized = tokenizer(X_val, **tokenize_kwargs)
X_test_tokenized = tokenizer(X_test, **tokenize_kwargs)

In [None]:
# Torch dataset
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings with optional labels.

    NOTE: this class reuses the name ``Dataset`` that the notebook also
    imports from ``datasets``; once this cell runs, ``Dataset`` refers to
    this class.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. Must contain the key ``"input_ids"``, which defines
            the dataset length.
        labels: Optional per-example labels (list, numpy array, tensor, ...).
            ``None`` or an empty sequence means the dataset is unlabeled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if labeled)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:`: the truth
        # value of a multi-element numpy array or tensor is ambiguous and
        # raises, while lists keep their previous behavior (empty -> no labels).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])



In [None]:
# ----- 3. Prediction -----#
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import Trainer

# The test set is wrapped without labels; metrics are computed below
# against y_test directly.
test_dataset = Dataset(X_test_tokenized)
test_trainer = Trainer(model)

# Make prediction. Only the raw model outputs are needed; reading the
# `predictions` field avoids unpacking into `_`, which IPython also uses
# for the last cell output.
raw_pred = test_trainer.predict(test_dataset).predictions

# Preprocess raw predictions: pick the highest-scoring class per example.
y_pred = np.argmax(raw_pred, axis=1)

# One shared list of class ids so the report and the confusion matrix
# always describe the same classes, even if one of them is absent from
# this particular test split.
label_ids = [0, 1, 2]
# NOTE(review): this order must match both the label ids used in the
# dataset and the model's own id-to-label mapping -- TODO confirm against
# `model.config.id2label`.
target_names = ['Neg', 'Pos', 'Neu']

print(classification_report(y_test, y_pred, labels=label_ids, target_names=target_names))

cm = confusion_matrix(y_test, y_pred, labels=label_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
# Persist the test texts with their true and predicted labels, one row each.
PREDICTIONS_CSV = 'beto-sentiment-analysis_predictions.csv'
prediction_columns = ['Text', 'Label', 'Prediction']

df_list = list(zip(X_test, y_test, y_pred))
df = pd.DataFrame(df_list, columns=prediction_columns)
df.to_csv(PREDICTIONS_CSV, index=False)