In [41]:
import pandas as pd
import numpy as np 
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import torch
import tensorflow as tf
import keras_nlp
from sklearn.preprocessing import LabelEncoder
from ordered_set import OrderedSet

import sentiment_analysis_training as sat

In [42]:
print(tf.__version__)
print(keras_nlp.__version__)

2.20.0
0.25.1


In [43]:
train_df_es = pd.read_csv('./Kaggle/task2-train-dev/train.tsv', sep='\t')
validation_df_es = pd.read_csv('./Kaggle/task2-train-dev/dev.tsv', sep='\t')

print(train_df_es.shape, validation_df_es.shape)
train_df_es.head()

(5886, 3) (857, 3)


Unnamed: 0,id,tweet,label
0,1,El Atl√©tico resignado a perder HASHTAG üòî http...,sadness
1,2,Leer proporciona una mejor visi√≥n del mundo ü§ì ...,joy
2,3,Amo a Arya Stark por encima de todas las cosas...,joy
3,4,Gracias HASHTAG es incre√≠ble que una ni√±a logr...,others
4,5,Solo siento que hayamos perdido 24 esca√±os de ...,sadness


In [44]:
train_df_es['label '].unique()

array(['sadness ', 'joy ', 'others ', 'surprise ', 'disgust ', 'anger ',
       'fear '], dtype=object)

In [45]:
goemotions_to_fer = {
    "joy": "happy",
    "others": "neutral",
    "anger": "angry", 
    "surprise": "surprise",
    "sadness": "sad",
    "disgust": "disgust",
    "fear": "fear "

}

In [46]:
def transformar_etiquetas(df, columna, diccionario):
    df['sentiment_final'] = df[columna].str.strip().map(diccionario).fillna("neutral")
    return df

In [53]:
train_df_es = transformar_etiquetas(train_df_es, 'label ', goemotions_to_fer)

validation_df_es = transformar_etiquetas(validation_df_es, 'label ', goemotions_to_fer)


# train_df_es['sentiment_final'] = train_df_es['fer_labels'].apply(lambda labs: map_to_fer(labs, goemotions_to_fer))
# validation_df_es['sentiment_final'] = validation_df_es['label '].apply(lambda labs: map_to_fer(labs, goemotions_to_fer))


In [48]:
train_df_es.head()

Unnamed: 0,id,tweet,label,sentiment_final
0,1,El Atl√©tico resignado a perder HASHTAG üòî http...,sadness,sad
1,2,Leer proporciona una mejor visi√≥n del mundo ü§ì ...,joy,happy
2,3,Amo a Arya Stark por encima de todas las cosas...,joy,happy
3,4,Gracias HASHTAG es incre√≠ble que una ni√±a logr...,others,neutral
4,5,Solo siento que hayamos perdido 24 esca√±os de ...,sadness,sad


In [49]:
# all_labels = train_df_es['sentiment_final'].explode().value_counts()
# print(all_labels)

In [54]:
df_es = pd.concat([train_df_es, validation_df_es], ignore_index=True)

In [55]:
df_es = df_es.rename(columns={"label ": "sentiment_final", "tweet": "text"})
df_es.head()

Unnamed: 0,id,text,sentiment_final,sentiment_final.1
0,1,El Atl√©tico resignado a perder HASHTAG üòî http...,sadness,sad
1,2,Leer proporciona una mejor visi√≥n del mundo ü§ì ...,joy,happy
2,3,Amo a Arya Stark por encima de todas las cosas...,joy,happy
3,4,Gracias HASHTAG es incre√≠ble que una ni√±a logr...,others,neutral
4,5,Solo siento que hayamos perdido 24 esca√±os de ...,sadness,sad


### **Uso de Pytorch**

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {device}")

In [None]:
label_encoder = LabelEncoder()
df_es['label_encoded'] = label_encoder.fit_transform(df_es['sentiment_final'])

# N√∫mero de clases
num_labels = len(label_encoder.classes_)
print(f"N√∫mero de emociones/clases: {num_labels}")
print(f"Clases: {label_encoder.classes_}")

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_es['text'].values,
    df_es['label_encoded'].values,
    test_size=0.2,
    random_state=42,
    stratify=df_es['label_encoded']
)

In [None]:
class EmotionDataset(Dataset):
    """Dataset personalizado para clasificaci√≥n de emociones en espa√±ol"""
    
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenizar
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

train_dataset = EmotionDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer
)

val_dataset = EmotionDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer
)

In [None]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2
)

In [None]:
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',
    num_labels=num_labels
)

model.to(device)

In [None]:

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
def train_epoch(model, data_loader, optimizer, device):
    """Entrena el modelo por una √©poca"""
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    progress_bar = tqdm(data_loader, desc="Entrenamiento")
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # M√©tricas
        total_loss += loss.item()
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)
        
        # Actualizar progress bar
        progress_bar.set_postfix({
            'loss': loss.item(),
            'acc': correct_predictions / total_samples
        })
    
    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total_samples
    
    return avg_loss, accuracy

In [None]:
def eval_epoch(model, data_loader, device):
    """Eval√∫a el modelo en el conjunto de validaci√≥n"""
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    progress_bar = tqdm(data_loader, desc="Validaci√≥n")
    
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            logits = outputs.logits
            
            # M√©tricas
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.size(0)
            
            # Actualizar progress bar
            progress_bar.set_postfix({
                'loss': loss.item(),
                'acc': correct_predictions / total_samples
            })
    
    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total_samples
    
    return avg_loss, accuracy


In [None]:
num_epochs = 3
best_val_accuracy = 0

print("\n" + "="*70)
print("INICIANDO ENTRENAMIENTO CON BETO (BERT EN ESPA√ëOL)")
print("="*70 + "\n")

for epoch in range(num_epochs):
    print(f"\n√âpoca {epoch + 1}/{num_epochs}")
    print("-" * 70)
    
    # Entrenamiento
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"\nTrain Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
    
    # Validaci√≥n
    val_loss, val_acc = eval_epoch(model, val_loader, device)
    print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.4f}")
    
    # Guardar mejor modelo
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), './best_beto_model_pytorch.pt')
        print(f"‚úÖ Nuevo mejor modelo guardado (accuracy: {val_acc:.4f})")

In [None]:
model.save_pretrained('./beto_emotion_classifier_pytorch')
tokenizer.save_pretrained('./beto_emotion_classifier_pytorch')

# Guardar el label encoder
import pickle
with open('./beto_emotion_classifier_pytorch/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("\n" + "="*70)
print("‚úÖ ENTRENAMIENTO COMPLETADO Y MODELO BETO GUARDADO")
print(f"Mejor accuracy en validaci√≥n: {best_val_accuracy:.4f}")
print("="*70)

In [None]:
def predict_emotion(text, model, tokenizer, label_encoder, device):
    """Funci√≥n para predecir la emoci√≥n de un texto en espa√±ol"""
    model.eval()
    
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
    
    emotion = label_encoder.inverse_transform([predicted_class])[0]
    confidence = probabilities[0][predicted_class].item()
    
    return emotion, confidence

### **Uso de Tensorflow**

In [None]:

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder = LabelEncoder()
df_es['label_encoded'] = label_encoder.fit_transform(df_es['sentiment_final'])

# N√∫mero de clases
num_labels = len(label_encoder.classes_)
print(f"N√∫mero de emociones/clases: {num_labels}")
print(f"Clases: {label_encoder.classes_}")

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_es['text'].values,
    df_es['label_encoded'].values,
    test_size=0.2,
    random_state=42,
    stratify=df_es['label_encoded']
)

In [None]:
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Tokenizar datos de entrenamiento
train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='tf'
)

In [None]:
val_encodings = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='tf'
)

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask']
    },
    train_labels
)).shuffle(1000).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask']
    },
    val_labels
)).batch(16)

In [None]:
model = TFBertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',
    num_labels=num_labels
)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

In [None]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    verbose=1
)

In [None]:
model.save_pretrained('./beto_emotion_classifier_tf')
tokenizer.save_pretrained('./beto_emotion_classifier_tf')

# Guardar el label encoder
import pickle
with open('./beto_emotion_classifier_tf/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("\n‚úÖ Modelo BETO entrenado y guardado exitosamente!")

In [None]:
def predict_emotion(text):
    """Funci√≥n para predecir la emoci√≥n de un texto en espa√±ol"""
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='tf'
    )
    
    outputs = model(inputs)
    predictions = tf.nn.softmax(outputs.logits, axis=-1)
    predicted_class = tf.argmax(predictions, axis=-1).numpy()[0]
    
    emotion = label_encoder.inverse_transform([predicted_class])[0]
    confidence = predictions.numpy()[0][predicted_class]
    
    return emotion, confidence