In [2]:
# Instalación de librerías necesarias, incluyendo rouge_score
!pip install transformers==4.52.3 datasets evaluate rouge_score --quiet

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import evaluate
from google.colab import drive

# Montar Google Drive para acceder a archivos CSV
drive.mount('/content/drive')

# Cambia esta ruta a la carpeta donde están tus CSV en Google Drive
base_path = '/content/drive/MyDrive/Archivos_csv/'  # <-- Cambia aquí si es necesario

train_path = base_path + 'train.csv'
val_path = base_path + 'validation.csv'
test_path = base_path + 'test.csv'

def crear_muestra_pequena(ruta_archivo, n=1000):
    print(f"Cargando {ruta_archivo}...")
    df = pd.read_csv(ruta_archivo)
    if len(df) > n:
        df_small = df.sample(n=n, random_state=42)
        print(f"Muestra creada con {n} filas.")
        return df_small
    else:
        print(f"Archivo pequeño, se usa completo con {len(df)} filas.")
        return df

train_df = crear_muestra_pequena(train_path, 1000)
val_df = crear_muestra_pequena(val_path, 1000)
test_df = crear_muestra_pequena(test_path, 1000)

train_df = train_df.dropna(subset=['article', 'highlights'])
val_df = val_df.dropna(subset=['article', 'highlights'])
test_df = test_df.dropna(subset=['article', 'highlights'])

train_dataset = Dataset.from_pandas(train_df[['article', 'highlights']])
val_dataset = Dataset.from_pandas(val_df[['article', 'highlights']])
test_dataset = Dataset.from_pandas(test_df[['article', 'highlights']])

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(f"Conjuntos cargados: train={len(dataset['train'])}, val={len(dataset['validation'])}, test={len(dataset['test'])}")

model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    model_inputs = tokenizer(examples["article"], max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16, remove_columns=['article', 'highlights'])

rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [len(tokenizer.encode(pred, skip_special_tokens=True)) for pred in decoded_preds]
    result["gen_len"] = sum(prediction_lens) / len(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_bart_homework8",
    num_train_epochs=1,  # 5 épocas
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_bart_homework8",
    logging_steps=1000,  # Logs menos frecuentes
    eval_steps=1000,     # Evalúa menos frecuente
    save_steps=1000,     # Guarda menos frecuente
    save_total_limit=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    report_to=[]  # Desactivar wandb para menos overhead
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Iniciando entrenamiento...")
trainer.train()
print("Entrenamiento terminado.")

eval_metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(f"Métricas en test: {eval_metrics}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cargando /content/drive/MyDrive/Archivos_csv/train.csv...
Muestra creada con 1000 filas.
Cargando /content/drive/MyDrive/Archivos_csv/validation.csv...
Muestra creada con 1000 filas.
Cargando /content/drive/MyDrive/Archivos_csv/test.csv...
Muestra creada con 1000 filas.
Conjuntos cargados: train=1000, val=1000, test=1000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Iniciando entrenamiento...


Step,Training Loss




Entrenamiento terminado.


Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens': True} not recognized.
Keyword arguments {'skip_special_tokens'

Métricas en test: {'eval_loss': 2.3656327724456787, 'eval_rouge1': 0.2514, 'eval_rouge2': 0.105, 'eval_rougeL': 0.2007, 'eval_rougeLsum': 0.2315, 'eval_gen_len': 19.997, 'eval_runtime': 1741.2537, 'eval_samples_per_second': 0.574, 'eval_steps_per_second': 0.144, 'epoch': 1.0}


In [14]:
!pip install gradio --quiet

from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import gradio as gr
import re

# Ruta al checkpoint entrenado
checkpoint_path = "/content/results_bart_homework8/checkpoint-250"  # Cambia según tu estructura

# Cargar modelo y tokenizer
model = BartForConditionalGeneration.from_pretrained(checkpoint_path)
tokenizer = BartTokenizer.from_pretrained(checkpoint_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def clean_text(text):
    # Limpiar saltos de línea y espacios múltiples
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def resumir_texto(texto):
    texto = clean_text(texto)  # Limpiar texto de entrada

    inputs = tokenizer(
        texto,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length"
    ).to(device)

    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=6,
        max_length=200,
        min_length=40,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        early_stopping=True
    )

    resumen = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    resumen = clean_text(resumen)  # Limpiar resumen generado
    return resumen

iface = gr.Interface(
    fn=resumir_texto,
    inputs=gr.Textbox(lines=10, placeholder="Escribe o pega aquí la noticia que quieres resumir..."),
    outputs="text",
    title="Resumen de Noticias en Tiempo Real",
    description="Ingresa cualquier texto o noticia y obtén su resumen instantáneamente."
)

iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://abaa48aea0fe30f4ad.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


