In [1]:
!pip install datasets transformers
!pip install seqeval
! pip install -U datasets evaluate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [3]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value, ClassLabel
from collections import defaultdict
from pathlib import Path

In [10]:
def leer_archivo_bio(archivo_bio):
    """Lee un archivo .bio y devuelve un diccionario con tokens y etiquetas."""
    datos = defaultdict(list)
    with open(archivo_bio, 'r', encoding='utf-8') as f:
        lineas = f.readlines()

    tokens = []
    labels = []
    for num_linea, linea in enumerate(lineas, start=1):
        linea = linea.strip()
        if linea:
            partes = linea.split()
            if len(partes) != 2:
                raise ValueError(f"Error en línea {num_linea}: '{linea}'. Se esperaban 2 elementos.")
            palabra, etiqueta = partes
            tokens.append(palabra)
            labels.append(etiqueta)
        else:
            if tokens and labels:
                datos["tokens"].append(tokens)
                datos["ner_tags"].append(labels)
                tokens = []
                labels = []

    if tokens and labels:
        datos["tokens"].append(tokens)
        datos["ner_tags"].append(labels)

    return datos


def cargar_datasets_bio(rutas_archivos):
    """Carga archivos .bio y devuelve un DatasetDict."""
    datasets = {}
    for nombre, ruta in rutas_archivos.items():
        datos = leer_archivo_bio(ruta)
        datasets[nombre] = Dataset.from_dict(datos)

    return DatasetDict(datasets)



In [16]:
# PASO 1: Despues de cargar los datos, primero se detecta todas las etiquetas únicas
def detectar_etiquetas_unicas(rutas_archivos):
    """Detecta automáticamente todas las etiquetas únicas en los archivos."""
    todas_etiquetas = set()

    for ruta in rutas_archivos.values():
        with open(ruta, 'r', encoding='utf-8') as f:
            for linea in f:
                linea = linea.strip()
                if linea:
                    partes = linea.split()
                    if len(partes) == 2:
                        _, etiqueta = partes
                        todas_etiquetas.add(etiqueta)

    # Ordenamos las etiquetas para que 'O' sea la última
    etiquetas_ordenadas = sorted(todas_etiquetas - {'O'}) + ['O']
    return etiquetas_ordenadas

In [15]:
# Se definen los nombres de las rutas (paths) de los archivos .bio
rutas_archivos = {
    "train": "training.bio",
    "test": "testing_cleaned.bio",
    "valid": "validation_cleaned.bio"
}


In [18]:

# Detectar automáticamente todas las etiquetas
LABELS = detectar_etiquetas_unicas(rutas_archivos)
print("Etiquetas detectadas:", LABELS)

# Cargar los datasets
dataset_dict = cargar_datasets_bio(rutas_archivos)

# Definir la estructura de features con las etiquetas detectadas
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=LABELS))
})

# Aplicar el casting a cada split
for split in dataset_dict:
    dataset_dict[split] = dataset_dict[split].cast(features)

# Mostrar información del dataset
print("\nDataset cargado correctamente:")
print(dataset_dict)

# Mostrar un ejemplo del conjunto de entrenamiento
print("\nEjemplo del train:")
print(dataset_dict["train"][0])

# Mostrar las características del dataset
print("\nCaracterísticas del dataset:")
print(dataset_dict["train"].features)

Etiquetas detectadas: [',', '0', 'B-BIOMARCADOR', 'B-CANCER', 'B-CIRUGIA', 'B-DOSIS', 'B-EDAD', 'B-FECHA', 'B-GLEASON', 'B-MEDICAMENTO', 'B-TNM', 'B-TRATAMIENTO', 'I-BIOMARCADOR', 'I-CANCER', 'I-CIRUGIA', 'I-DOSIS', 'I-EDAD', 'I-FECHA', 'I-GLEASON', 'I-MEDICAMENTO', 'I-TNM', 'I-TRATAMIENTO', 'O']


Casting the dataset:   0%|          | 0/3106 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/991 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/929 [00:00<?, ? examples/s]


Dataset cargado correctamente:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3106
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 991
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 929
    })
})

Ejemplo del train:
{'tokens': ['Paciente', 'de', '72', 'años', ',', 'con', 'antecedentes', 'médicos', 'de', 'HTA', '.'], 'ner_tags': [22, 22, 6, 16, 22, 22, 22, 22, 22, 22, 22]}

Características del dataset:
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=[',', '0', 'B-BIOMARCADOR', 'B-CANCER', 'B-CIRUGIA', 'B-DOSIS', 'B-EDAD', 'B-FECHA', 'B-GLEASON', 'B-MEDICAMENTO', 'B-TNM', 'B-TRATAMIENTO', 'I-BIOMARCADOR', 'I-CANCER', 'I-CIRUGIA', 'I-DOSIS', 'I-EDAD', 'I-FECHA', 'I-GLEASON', 'I-MEDICAMENTO', 'I-TNM', 'I-TRATAMIENTO', 'O'], id=None), length=-1, id=None)}


In [19]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3106
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 991
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 929
    })
})

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3106 [00:00<?, ? examples/s]

Map:   0%|          | 0/991 [00:00<?, ? examples/s]

Map:   0%|          | 0/929 [00:00<?, ? examples/s]

In [21]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "bert-base-uncased"
batch_size = 8

In [22]:
label_list = dataset_dict["train"].features[f"{task}_tags"].feature.names
label_list

[',',
 '0',
 'B-BIOMARCADOR',
 'B-CANCER',
 'B-CIRUGIA',
 'B-DOSIS',
 'B-EDAD',
 'B-FECHA',
 'B-GLEASON',
 'B-MEDICAMENTO',
 'B-TNM',
 'B-TRATAMIENTO',
 'I-BIOMARCADOR',
 'I-CANCER',
 'I-CIRUGIA',
 'I-DOSIS',
 'I-EDAD',
 'I-FECHA',
 'I-GLEASON',
 'I-MEDICAMENTO',
 'I-TNM',
 'I-TRATAMIENTO',
 'O']

In [23]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [26]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    #evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    #push_to_hub=True,
)

In [27]:
try:
    from datasets import load_metric  # Para versiones antiguas
    metric = load_metric("seqeval")
except ImportError:
    from evaluate import load  # Para versiones nuevas
    metric = load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [29]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Validación (eval_dataset): Se usa durante el entrenamiento para:

Ajustar hiperparámetros

Detener el entrenamiento temprano (early stopping)

Monitorizar el progreso

In [30]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [31]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moswaldo-solarte[0m ([33moswaldo-solarte-universidadad-del-valle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.3412
1000,0.0668
1500,0.0386


TrainOutput(global_step=1945, training_loss=0.12048356318535106, metrics={'train_runtime': 961.5464, 'train_samples_per_second': 16.151, 'train_steps_per_second': 2.023, 'total_flos': 878239151238612.0, 'train_loss': 0.12048356318535106, 'epoch': 5.0})

Buenas prácticas:

No uses test para tomar decisiones: Solo para la evaluación final

Usa validación para ajustes: Early stopping, learning rate, etc.

Guarda test para el final: Como si fuera datos "reales" que el modelo nunca ha visto

In [32]:
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("\n" + "="*50)
print(f"Resultados finales en conjunto de test:")
print(f"F1-score: {test_metrics['eval_f1']:.3f}")
print(f"Precisión: {test_metrics['eval_precision']:.3f}")
print(f"Recall: {test_metrics['eval_recall']:.3f}")
print("="*50)




Resultados finales en conjunto de test:
F1-score: 0.964
Precisión: 0.960
Recall: 0.968


In [None]:
#trainer.push_to_hub()

In [33]:
label_names =  dataset_dict["train"].features["ner_tags"].feature.names
label_names

[',',
 '0',
 'B-BIOMARCADOR',
 'B-CANCER',
 'B-CIRUGIA',
 'B-DOSIS',
 'B-EDAD',
 'B-FECHA',
 'B-GLEASON',
 'B-MEDICAMENTO',
 'B-TNM',
 'B-TRATAMIENTO',
 'I-BIOMARCADOR',
 'I-CANCER',
 'I-CIRUGIA',
 'I-DOSIS',
 'I-EDAD',
 'I-FECHA',
 'I-GLEASON',
 'I-MEDICAMENTO',
 'I-TNM',
 'I-TRATAMIENTO',
 'O']

In [34]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'BIOMARCADOR': {'precision': np.float64(0.9805825242718447),
  'recall': np.float64(0.9711538461538461),
  'f1': np.float64(0.9758454106280192),
  'number': np.int64(208)},
 'CANCER': {'precision': np.float64(0.9438502673796791),
  'recall': np.float64(0.9697802197802198),
  'f1': np.float64(0.9566395663956639),
  'number': np.int64(364)},
 'CIRUGIA': {'precision': np.float64(0.8235294117647058),
  'recall': np.float64(0.8615384615384616),
  'f1': np.float64(0.8421052631578948),
  'number': np.int64(65)},
 'DOSIS': {'precision': np.float64(0.9109947643979057),
  'recall': np.float64(0.9206349206349206),
  'f1': np.float64(0.9157894736842104),
  'number': np.int64(189)},
 'EDAD': {'precision': np.float64(0.975609756097561),
  'recall': np.float64(1.0),
  'f1': np.float64(0.9876543209876543),
  'number': np.int64(80)},
 'FECHA': {'precision': np.float64(0.9974937343358395),
  'recall': np.float64(0.9925187032418953),
  'f1': np.float64(0.995),
  'number': np.int64(401)},
 'GLEASON': {'p