# Preprocesamiento de Texto con spaCy


In [105]:
import spacy

# Cargar el modelo de spaCy para preprocesamiento
nlp = spacy.load("es_core_news_sm")

# Función de preprocesamiento
def preprocess_text(text:str) -> str:
    """
    Preprocesa el texto eliminando stopwords y puntuación, y lematiza las palabras.

    Args:
        text (str): Texto a preprocesar.

    Returns:
        str: Texto preprocesado.
    """
    
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Texto de ejemplo
text = "Este es un ejemplo de texto para preprocesar."
cleaned_text = preprocess_text(text)
print("Texto limpio:", cleaned_text)


Texto limpio: ejemplo texto preprocesar


# Obtener Embeddings con DistilBERT


In [106]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Configurar el dispositivo (GPU si está disponible)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizador y modelo de DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Función para obtener embeddings de DistilBERT
def get_embeddings(text:str, model:DistilBertModel, tokenizer:DistilBertTokenizer) -> torch.Tensor:
    """
    Genera los embeddings para el texto dado utilizando el modelo DistilBERT.

    Args:
        text (str): Texto para el cual generar los embeddings.
        model (DistilBertModel): Modelo DistilBERT preentrenado.
        tokenizer (DistilBertTokenizer): Tokenizador DistilBERT preentrenado.

    Returns:
        torch.Tensor: Embeddings generados por el modelo DistilBERT.
    """
    encoded_input = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**encoded_input)
    return outputs.last_hidden_state

# Obtener embeddings para el texto preprocesado
embeddings = get_embeddings(cleaned_text, model, tokenizer)
print("Embeddings:", embeddings)



Embeddings: tensor([[[-0.3835, -0.0783, -0.0687,  ..., -0.0525,  0.2811,  0.5119],
         [-0.3267, -0.2704,  0.0471,  ...,  0.1809,  0.3451,  0.5138],
         [-0.5358, -0.4345,  0.1403,  ...,  0.0510, -0.0980,  0.6037],
         ...,
         [ 0.2616, -0.1185,  0.2342,  ...,  0.0457, -0.0370,  0.2346],
         [-0.3324, -0.0216, -0.1785,  ...,  0.0930, -0.0885,  0.2521],
         [ 0.7774,  0.0768, -0.3888,  ...,  0.2486, -0.7300, -0.1771]]],
       device='cuda:0')


In [107]:
def compute_metrics(eval_pred):
    """
    Calcula las métricas de evaluación para el modelo.

    Args:
        eval_pred (tuple): Una tupla que contiene logits y etiquetas reales.

    Returns:
        dict: Un diccionario con la métrica calculada (precisión en este caso).
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    metric = load_metric("accuracy", trust_remote_code=True)
    accuracy = metric.compute(predictions=predictions, references=labels)
    return accuracy


# Carga y Evaluación del Modelo sin Fine-Tuning:

In [108]:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Cargar el modelo preentrenado sin fine-tuning
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Ajusta el número de etiquetas según tu caso
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Definir los argumentos de entrenamiento (necesarios para el Trainer)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=8,
    do_train=False,
    do_eval=True,
)

# Crear un Trainer para evaluar el modelo preentrenado
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Incluir la función de métrica personalizada
)

# Evaluar el modelo preentrenado
baseline_results = trainer.evaluate()
print("Resultados del modelo sin fine-tuning:", baseline_results)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
100%|██████████| 250/250 [00:03<00:00, 82.34it/s]

Resultados del modelo sin fine-tuning: {'eval_loss': 0.7018254399299622, 'eval_accuracy': 0.4995, 'eval_runtime': 3.1329, 'eval_samples_per_second': 638.384, 'eval_steps_per_second': 79.798}





#  Importación de Librerías y Configuración del Logger


In [109]:

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import logging
import warnings

# Configurar el logger para redirigir la salida a un archivo y establecer el nivel de logging a WARNING
logging.basicConfig(filename='training_log.txt', level=logging.WARNING)
logger = logging.getLogger()

# Silenciar advertencias específicas de PyTorch
warnings.filterwarnings("ignore", category=UserWarning, module='torch')



# Cargar y Preprocesar el Dataset de IMDb en Español


In [110]:
# Cargar el dataset de IMDb en español
data = pd.read_csv('IMDB_Dataset_SPANISH.csv')

# Subsamplear el dataset a 20000 muestras
data = data.sample(n=20000, random_state=42)

# Mapear etiquetas de texto a valores numéricos
label_mapping = {"positivo": 1, "negativo": 0}
data['sentimiento'] = data['sentimiento'].map(label_mapping)

# Dividir en entrenamiento y validación
train_texts, val_texts, train_labels, val_labels = train_test_split(data['review_es'], data['sentimiento'], test_size=0.2, random_state=42)




# Tokenización


In [111]:
# Tokenización
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


# Creación del Dataset

In [112]:
import torch

# Definir la clase Dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Seleccionar el dispositivo (GPU si está disponible)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Crear los datasets de entrenamiento y validación
train_dataset = Dataset(train_encodings, train_labels.tolist())
val_dataset = Dataset(val_encodings, val_labels.tolist())


#  Cargar el Modelo Preentrenado

In [113]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Configuración y Entrenamiento


In [114]:

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,  # Puedes ajustar según los resultados
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.1,  # Aumentar weight decay para evitar sobreajuste
    logging_dir='./logs',
    logging_steps=1000,
    fp16=True,
    evaluation_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    disable_tqdm=False
)





# Implementación de un Callback Personalizado

In [115]:
class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # Solo imprime logs cada cierto número de pasos
        if state.global_step % 1000 == 0:
            print(logs)

# Entrenamiento del Modelo

In [116]:
# Definir el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Añadir la función de métrica personalizada
    callbacks=[CustomCallback()]  # Mantén esto si tienes callbacks personalizados
)

# Entrenar el modelo
trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
 50%|█████     | 1000/2000 [01:07<01:07, 14.85it/s]

{'loss': 0.5446, 'grad_norm': 2.492856740951538, 'learning_rate': 3.336666666666667e-05, 'epoch': 1.0}
{'loss': 0.5446, 'grad_norm': 2.492856740951538, 'learning_rate': 3.336666666666667e-05, 'epoch': 1.0}



 50%|█████     | 1000/2000 [01:13<01:07, 14.85it/s]

{'eval_loss': 0.4171757698059082, 'eval_accuracy': 0.82175, 'eval_runtime': 5.3641, 'eval_samples_per_second': 745.695, 'eval_steps_per_second': 46.606, 'epoch': 1.0}
{'eval_loss': 0.4171757698059082, 'eval_accuracy': 0.82175, 'eval_runtime': 5.3641, 'eval_samples_per_second': 745.695, 'eval_steps_per_second': 46.606, 'epoch': 1.0}


100%|██████████| 2000/2000 [02:21<00:00, 14.89it/s]

{'loss': 0.3138, 'grad_norm': 5.647911071777344, 'learning_rate': 3.3333333333333334e-08, 'epoch': 2.0}
{'loss': 0.3138, 'grad_norm': 5.647911071777344, 'learning_rate': 3.3333333333333334e-08, 'epoch': 2.0}



100%|██████████| 2000/2000 [02:27<00:00, 14.89it/s]

{'eval_loss': 0.35327568650245667, 'eval_accuracy': 0.85425, 'eval_runtime': 6.306, 'eval_samples_per_second': 634.321, 'eval_steps_per_second': 39.645, 'epoch': 2.0}
{'eval_loss': 0.35327568650245667, 'eval_accuracy': 0.85425, 'eval_runtime': 6.306, 'eval_samples_per_second': 634.321, 'eval_steps_per_second': 39.645, 'epoch': 2.0}


100%|██████████| 2000/2000 [02:28<00:00, 13.51it/s]

{'train_runtime': 148.0249, 'train_samples_per_second': 216.18, 'train_steps_per_second': 13.511, 'total_flos': 4238956756992000.0, 'train_loss': 0.42916552734375, 'epoch': 2.0}
{'train_runtime': 148.0249, 'train_samples_per_second': 216.18, 'train_steps_per_second': 13.511, 'train_loss': 0.42916552734375, 'epoch': 2.0}





TrainOutput(global_step=2000, training_loss=0.42916552734375, metrics={'train_runtime': 148.0249, 'train_samples_per_second': 216.18, 'train_steps_per_second': 13.511, 'total_flos': 4238956756992000.0, 'train_loss': 0.42916552734375, 'epoch': 2.0})

# Evaluación y Guardado del Modelo Entrenado


In [117]:
# Evaluación del modelo
eval_result = trainer.evaluate()
print(f"Resultados de la evaluación: {eval_result}")

# Guardar el modelo entrenado
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')


100%|██████████| 250/250 [00:07<00:00, 34.59it/s]

{'eval_loss': 0.35327568650245667, 'eval_accuracy': 0.85425, 'eval_runtime': 7.2679, 'eval_samples_per_second': 550.368, 'eval_steps_per_second': 34.398, 'epoch': 2.0}
Resultados de la evaluación: {'eval_loss': 0.35327568650245667, 'eval_accuracy': 0.85425, 'eval_runtime': 7.2679, 'eval_samples_per_second': 550.368, 'eval_steps_per_second': 34.398, 'epoch': 2.0}





('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\vocab.txt',
 './sentiment_model\\added_tokens.json')

### Predicción de Sentimientos con el Modelo Ajustado



In [127]:
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Seleccionar el dispositivo (GPU si está disponible)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Cargar el modelo desde el checkpoint más reciente
model_path = './results/checkpoint-2000'  # Asegúrate de cambiar a la ruta correcta
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

# Cargar el tokenizador original (no desde el checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Crear un pipeline de clasificación
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Realizar predicciones
reviews = ["Me encantó la película, fue maravillosa.", "No me gustó para nada, muy aburrida."]
predictions = classifier(reviews)

# Imprimir predicciones
print(predictions)



[{'label': 'LABEL_1', 'score': 0.9876910448074341}, {'label': 'LABEL_0', 'score': 0.9902077317237854}]


In [119]:
import shutil

# shutil.rmtree('./results/checkpoint-1000')
# shutil.rmtree('./results/checkpoint-3000')
# shutil.rmtree('./logs')
# shutil.rmtree('./results/runs')