**Analítica de datos en salud**

Presentado por:

* 2400452 - Jennifer Benavides Castillo
* 2400479 - Cristhian David Cruz Millán
* 2400794 - Sergio Alejandro Fierro Ospitia
* 2400478 - Edwin Andrés Lasso Rosero

# Entregable 1:
Extraer y preparar un conjunto de datos clínico anotado en formato BIO para entrenamiento de modelos de reconocimiento de entidades nombradas (NER) de cáncer de pulmón.

## Sirve para:

* Procesar archivos clínicos con anotaciones manuales de entidades (por ejemplo, enfermedades, tratamientos, fechas, etc.).

* Convertir dichos archivos a estructuras compatibles con bibliotecas de procesamiento de lenguaje natural como datasets de Hugging Face.

* Validar la integridad de las anotaciones (estructura de columnas, formatos BIO correctos).

* Crear objetos Dataset para entrenar o evaluar modelos NER con arquitectura Transformers.

## Se puede utilizar en:

* Entrenamiento de modelos NLP para extracción automática de entidades clínicas.

* Estandarización de datasets clínicos para investigación médica.

* Preparación de datos para estudios de minería de texto en salud y aplicaciones de inteligencia artificial en medicina.



In [None]:
!pip install datasets transformers
!pip install seqeval
! pip install -U datasets evaluate
!pip install -U huggingface_hub



In [None]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value, ClassLabel
from collections import defaultdict
from pathlib import Path

In [None]:
import csv

def leer_archivo_bio(archivo_bio):
    """Read a BIO-style CSV (ID, token, label) and group its rows into sentences.

    Consecutive rows that share the same ID form one sentence; a change of
    ID closes the current sentence and opens a new one.

    Args:
        archivo_bio: Path of the CSV file. Its first row is treated as a
            header and discarded.

    Returns:
        A ``defaultdict(list)`` with two parallel lists, ``"tokens"`` and
        ``"ner_tags"``, each holding one inner list per sentence. Both keys
        are always present, even when the file contains no data rows.

    Raises:
        ValueError: If a non-blank row does not have exactly 3 columns.
    """
    datos = defaultdict(list)
    # Create both columns up front so an empty file still yields the schema.
    datos.update(tokens=[], ner_tags=[])

    # newline='' is what the csv module expects from the underlying file.
    with open(archivo_bio, 'r', encoding='utf-8', newline='') as f:
        lector = csv.reader(f)
        next(lector, None)  # Drop the header; tolerate a completely empty file.
        tokens = []
        labels = []
        id_anterior = None

        for fila in lector:
            if not fila:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            if len(fila) != 3:
                # line_num is the physical line just read, so the message stays
                # accurate even when earlier lines were blank or multi-line.
                raise ValueError(f"Error en línea {lector.line_num}: {fila}. Se esperaban 3 columnas (ID, palabra, etiqueta).")

            id_actual, palabra, etiqueta = fila

            # A new ID means the previous sentence is complete.
            if id_anterior is not None and id_actual != id_anterior:
                datos["tokens"].append(tokens)
                datos["ner_tags"].append(labels)
                tokens = []
                labels = []
            id_anterior = id_actual

            tokens.append(palabra)
            labels.append(etiqueta)

        # Flush the last sentence, which has no following ID change.
        if tokens:
            datos["tokens"].append(tokens)
            datos["ner_tags"].append(labels)

    return datos

def cargar_datasets_bio(rutas_archivos):
    """Build a DatasetDict with one Dataset per entry of ``rutas_archivos``.

    Each value is a file path parsed with ``leer_archivo_bio``; each key
    becomes the name of the corresponding split.
    """
    return DatasetDict({
        split: Dataset.from_dict(leer_archivo_bio(archivo))
        for split, archivo in rutas_archivos.items()
    })

In [None]:
# PASO 1: Después de cargar los datos, primero se detectan todas las etiquetas únicas
def detectar_etiquetas_unicas(rutas_archivos):
    """Collect every distinct label found in the third column of the given CSVs.

    Args:
        rutas_archivos: Mapping whose values are CSV file paths (the keys are
            not used). The first row of each file is skipped as a header.

    Returns:
        The labels sorted alphabetically, with ``'O'`` always placed last.
        ``'O'`` is appended even if no file contains it.

    Rows that do not have exactly 3 columns are ignored rather than reported.
    """
    todas_etiquetas = set()

    for ruta in rutas_archivos.values():
        # newline='' is what the csv module expects from the underlying file.
        with open(ruta, 'r', encoding='utf-8', newline='') as f:
            lector = csv.reader(f)
            next(lector, None)  # Skip the header; an empty file adds no labels.
            todas_etiquetas.update(fila[2] for fila in lector if len(fila) == 3)

    # Keep 'O' out of the sort so that it always ends up as the last label.
    return sorted(todas_etiquetas - {'O'}) + ['O']


In [None]:
def corregir_formato_bio(ruta_archivo):
    """Rewrite a BIO CSV in place, turning ``B_``/``I_`` label prefixes into ``B-``/``I-``.

    Only the third column of 3-column rows is touched, and only its leading
    prefix: underscores inside the entity name (e.g. ``LAB_RESULT``) are
    preserved. Rows with any other number of columns are written back
    unchanged.

    Args:
        ruta_archivo: Path of the CSV file to fix. The first row is kept as
            the header. An empty file is left untouched.
    """
    filas_corregidas = []

    with open(ruta_archivo, 'r', encoding='utf-8', newline='') as f:
        lector = csv.reader(f)
        encabezado = next(lector, None)
        if encabezado is None:
            return  # Empty file: nothing to correct and nothing to rewrite.

        for fila in lector:
            if len(fila) == 3:
                etiqueta = fila[2]
                cuerpo = etiqueta.lstrip()
                # Replace just the prefix separator; a blanket str.replace
                # would also rewrite "B_"/"I_" inside names such as LAB_RESULT.
                if cuerpo.startswith(("B_", "I_")):
                    margen = etiqueta[:len(etiqueta) - len(cuerpo)]
                    fila[2] = f"{margen}{cuerpo[0]}-{cuerpo[2:]}"
            filas_corregidas.append(fila)

    # Write the file back with the corrected labels.
    with open(ruta_archivo, 'w', newline='', encoding='utf-8') as f:
        escritor = csv.writer(f)
        escritor.writerow(encabezado)  # Restore the header row
        escritor.writerows(filas_corregidas)

# Normalise the label prefixes of the three split files, in place
for ruta_split in (
    "/content/sentences_train.csv",
    "/content/sentences_test.csv",
    "/content/sentences_dev.csv",
):
    corregir_formato_bio(ruta_split)

print("Etiquetas corregidas exitosamente en los tres archivos.")


Etiquetas corregidas exitosamente en los tres archivos.


In [None]:
# Paths of the BIO-annotated CSV files, keyed by split name.
# Note that the "valid" split is read from the *_dev.csv file.
rutas_archivos = {
    "train": "/content/sentences_train.csv",
    "test": "/content/sentences_test.csv",
    "valid": "/content/sentences_dev.csv"
}


In [None]:
# Detect every label present in the three files; 'O' comes last in the list
LABELS = detectar_etiquetas_unicas(rutas_archivos)
print("Etiquetas detectadas:", LABELS)

# Load the splits; at this point ner_tags still holds the raw label strings
dataset_dict = cargar_datasets_bio(rutas_archivos)


# Declare the schema: ner_tags becomes a ClassLabel sequence, so each tag is
# encoded as its index in LABELS
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=LABELS))
})

# Cast every split to that schema
for split in dataset_dict:
    dataset_dict[split] = dataset_dict[split].cast(features)

# Show a summary of the dataset
print("\nDataset cargado correctamente:")
print(dataset_dict)

# Show one example from the training split
print("\nEjemplo del train:")
print(dataset_dict["train"][0])

# Show the resulting feature schema
print("\nCaracterísticas del dataset:")
print(dataset_dict["train"].features)

Etiquetas detectadas: ['B-CANCER_CONCEPT', 'B-CHEMOTHERAPY', 'B-DATE', 'B-DRUG', 'B-FAMILY', 'B-FREQ', 'B-IMPLICIT_DATE', 'B-INTERVAL', 'B-METRIC', 'B-OCURRENCE_EVENT', 'B-QUANTITY', 'B-RADIOTHERAPY', 'B-SMOKER_STATUS', 'B-STAGE', 'B-SURGERY', 'B-TNM', 'I-CANCER_CONCEPT', 'I-DATE', 'I-DRUG', 'I-FAMILY', 'I-FREQ', 'I-IMPLICIT_DATE', 'I-INTERVAL', 'I-METRIC', 'I-OCURRENCE_EVENT', 'I-SMOKER_STATUS', 'I-STAGE', 'I-SURGERY', 'I-TNM', 'O']


Casting the dataset:   0%|          | 0/19154 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4947 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5453 [00:00<?, ? examples/s]


Dataset cargado correctamente:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 19154
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4947
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5453
    })
})

Ejemplo del train:
{'tokens': ['Abuela'], 'ner_tags': [4]}

Características del dataset:
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['B-CANCER_CONCEPT', 'B-CHEMOTHERAPY', 'B-DATE', 'B-DRUG', 'B-FAMILY', 'B-FREQ', 'B-IMPLICIT_DATE', 'B-INTERVAL', 'B-METRIC', 'B-OCURRENCE_EVENT', 'B-QUANTITY', 'B-RADIOTHERAPY', 'B-SMOKER_STATUS', 'B-STAGE', 'B-SURGERY', 'B-TNM', 'I-CANCER_CONCEPT', 'I-DATE', 'I-DRUG', 'I-FAMILY', 'I-FREQ', 'I-IMPLICIT_DATE', 'I-INTERVAL', 'I-METRIC', 'I-OCURRENCE_EVENT', 'I-SMOKER_STATUS', 'I-STAGE', 'I-SURGERY', 'I-TNM', 'O'], id=None), length=-1, id=None)}


In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 19154
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4947
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5453
    })
})

In [None]:
x = dataset_dict["train"].features["ner_tags"].feature.names

print(x)

['B-CANCER_CONCEPT', 'B-CHEMOTHERAPY', 'B-DATE', 'B-DRUG', 'B-FAMILY', 'B-FREQ', 'B-IMPLICIT_DATE', 'B-INTERVAL', 'B-METRIC', 'B-OCURRENCE_EVENT', 'B-QUANTITY', 'B-RADIOTHERAPY', 'B-SMOKER_STATUS', 'B-STAGE', 'B-SURGERY', 'B-TNM', 'I-CANCER_CONCEPT', 'I-DATE', 'I-DRUG', 'I-FAMILY', 'I-FREQ', 'I-IMPLICIT_DATE', 'I-INTERVAL', 'I-METRIC', 'I-OCURRENCE_EVENT', 'I-SMOKER_STATUS', 'I-STAGE', 'I-SURGERY', 'I-TNM', 'O']


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    For every sentence, the first sub-token of each word receives that word's
    tag from ``examples["ner_tags"]``. Positions with no word id (special
    tokens) and the remaining sub-tokens of a word receive -100.

    Returns:
        The tokenizer output for ``examples["tokens"]`` with an extra
        ``"labels"`` entry holding one aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word carries the word's tag.
            starts_word = current_word is not None and current_word != last_word
            row.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/19154 [00:00<?, ? examples/s]

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/5453 [00:00<?, ? examples/s]

In [None]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "bert-base-uncased"
batch_size = 8

In [None]:
label_list = dataset_dict["train"].features[f"{task}_tags"].feature.names
label_list

['B-CANCER_CONCEPT',
 'B-CHEMOTHERAPY',
 'B-DATE',
 'B-DRUG',
 'B-FAMILY',
 'B-FREQ',
 'B-IMPLICIT_DATE',
 'B-INTERVAL',
 'B-METRIC',
 'B-OCURRENCE_EVENT',
 'B-QUANTITY',
 'B-RADIOTHERAPY',
 'B-SMOKER_STATUS',
 'B-STAGE',
 'B-SURGERY',
 'B-TNM',
 'I-CANCER_CONCEPT',
 'I-DATE',
 'I-DRUG',
 'I-FAMILY',
 'I-FREQ',
 'I-IMPLICIT_DATE',
 'I-INTERVAL',
 'I-METRIC',
 'I-OCURRENCE_EVENT',
 'I-SMOKER_STATUS',
 'I-STAGE',
 'I-SURGERY',
 'I-TNM',
 'O']

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Store the tag names in the model config so the saved/pushed checkpoint
# carries the real entity labels together with their class indices.
id2label = dict(enumerate(label_list))
label2id = {etiqueta: indice for indice, etiqueta in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
import os

# Use the last path component of the checkpoint id as the base of the run name.
model_bert_base = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_bert_base}-finetuned-{task}-lung",
    eval_strategy="epoch",  # named evaluation_strategy in older transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
    # Credentials must never be committed with the notebook. Read the token
    # from the HF_TOKEN environment variable; when it is unset this is None
    # and the Hub client is expected to fall back to the locally stored login.
    hub_token=os.environ.get("HF_TOKEN"),
)
# NOTE(review): no save_strategy / load_best_model_at_end is configured here,
# so presumably the last-epoch weights are the ones evaluated and pushed;
# confirm that this is intended.

In [None]:
# Load the seqeval metric, trying the loader in `datasets` first and falling
# back to the `evaluate` package when that import is not available.
try:
    from datasets import load_metric  # For older versions
    metric = load_metric("seqeval")
except ImportError:
    from evaluate import load  # For newer versions
    metric = load("seqeval")

In [None]:
import numpy as np

def compute_metrics(p):
    """Turn a (predictions, labels) pair into overall seqeval scores.

    ``p`` unpacks into the raw prediction scores and the reference label ids.
    The predicted class is the argmax over axis 2. Positions whose reference
    label is -100 are dropped, and the remaining ids are mapped to tag names
    through ``label_list`` before being scored with ``metric``.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    pred_tags = []
    gold_tags = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real reference label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        pred_tags.append([label_list[pred] for pred, _ in kept])
        gold_tags.append([label_list[gold] for _, gold in kept])

    summary = metric.compute(predictions=pred_tags, references=gold_tags)
    return {
        "precision": summary["overall_precision"],
        "recall": summary["overall_recall"],
        "f1": summary["overall_f1"],
        "accuracy": summary["overall_accuracy"],
    }

Validación (eval_dataset): Se usa durante el entrenamiento para:

Ajustar hiperparámetros

Detener el entrenamiento temprano (early stopping)

Monitorizar el progreso

In [None]:
# Wire the model, training arguments, data and metric function together.
# The "valid" split is the one evaluated during training.
# NOTE(review): the captured cell output suggests this call emits a warning,
# possibly about the `tokenizer` argument; confirm against the installed
# transformers version.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjennifer-benavides[0m ([33mjennifer-benavides-universidad-del-valle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1494,0.098779,0.929193,0.929193,0.929193,0.974055
2,0.0953,0.092315,0.932691,0.955427,0.943922,0.979143
3,0.0659,0.095306,0.925725,0.951944,0.938652,0.976736
4,0.0531,0.098728,0.937393,0.957632,0.947405,0.979555
5,0.0454,0.103652,0.93655,0.9509,0.94367,0.979234
6,0.04,0.110561,0.934635,0.956007,0.9452,0.979578
7,0.0315,0.110713,0.934957,0.949391,0.942118,0.978639
8,0.0295,0.114519,0.933106,0.95206,0.942488,0.978524
9,0.0229,0.121346,0.93423,0.951364,0.942719,0.978432
10,0.0226,0.122552,0.933828,0.950087,0.941887,0.97857


TrainOutput(global_step=23950, training_loss=0.06487054183736972, metrics={'train_runtime': 3802.927, 'train_samples_per_second': 50.366, 'train_steps_per_second': 6.298, 'total_flos': 5513628084726960.0, 'train_loss': 0.06487054183736972, 'epoch': 10.0})

Buenas prácticas:

No uses test para tomar decisiones: Solo para la evaluación final

Usa validación para ajustes: Early stopping, learning rate, etc.

Guarda test para el final: Como si fuera datos "reales" que el modelo nunca ha visto

In [None]:
# Final evaluation on the held-out test split, printed between two rulers.
test_metrics = trainer.evaluate(tokenized_datasets["test"])
separador = "=" * 50
print("\n" + separador)
print("Resultados finales en conjunto de test:")
print(
    f"F1-score: {test_metrics['eval_f1']:.3f}",
    f"Precisión: {test_metrics['eval_precision']:.3f}",
    f"Recall: {test_metrics['eval_recall']:.3f}",
    sep="\n",
)
print(separador)


Resultados finales en conjunto de test:
F1-score: 0.934
Precisión: 0.921
Recall: 0.948


In [None]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1749247600.4c1b94190950.10931.0:   0%|          | 0.00/21.3k [00:00<?, ?B/s]

events.out.tfevents.1749251429.4c1b94190950.10931.1:   0%|          | 0.00/569 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jenniferbc/bert-base-uncased-finetuned-ner-lung/commit/c052d5a50aa6dd55d5efe832d142c7758278be5b', commit_message='End of training', commit_description='', oid='c052d5a50aa6dd55d5efe832d142c7758278be5b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jenniferbc/bert-base-uncased-finetuned-ner-lung', endpoint='https://huggingface.co', repo_type='model', repo_id='jenniferbc/bert-base-uncased-finetuned-ner-lung'), pr_revision=None, pr_num=None)

In [None]:
label_names =  dataset_dict["train"].features["ner_tags"].feature.names
label_names

['B-CANCER_CONCEPT',
 'B-CHEMOTHERAPY',
 'B-DATE',
 'B-DRUG',
 'B-FAMILY',
 'B-FREQ',
 'B-IMPLICIT_DATE',
 'B-INTERVAL',
 'B-METRIC',
 'B-OCURRENCE_EVENT',
 'B-QUANTITY',
 'B-RADIOTHERAPY',
 'B-SMOKER_STATUS',
 'B-STAGE',
 'B-SURGERY',
 'B-TNM',
 'I-CANCER_CONCEPT',
 'I-DATE',
 'I-DRUG',
 'I-FAMILY',
 'I-FREQ',
 'I-IMPLICIT_DATE',
 'I-INTERVAL',
 'I-METRIC',
 'I-OCURRENCE_EVENT',
 'I-SMOKER_STATUS',
 'I-STAGE',
 'I-SURGERY',
 'I-TNM',
 'O']

In [None]:
# Predict on the test split and take the highest-scoring class per position.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Drop the positions labelled -100 and convert the remaining predicted and
# reference ids to tag names.
true_predictions = []
true_labels = []
for fila_pred, fila_ref in zip(predictions, labels):
    validos = [(id_pred, id_ref) for id_pred, id_ref in zip(fila_pred, fila_ref) if id_ref != -100]
    true_predictions.append([label_names[id_pred] for id_pred, _ in validos])
    true_labels.append([label_names[id_ref] for _, id_ref in validos])

# Full seqeval report, displayed as the cell result.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'CANCER_CONCEPT': {'precision': np.float64(0.8994910941475827),
  'recall': np.float64(0.9241830065359478),
  'f1': np.float64(0.9116698903932947),
  'number': np.int64(765)},
 'CHEMOTHERAPY': {'precision': np.float64(0.9794871794871794),
  'recall': np.float64(1.0),
  'f1': np.float64(0.9896373056994818),
  'number': np.int64(191)},
 'DATE': {'precision': np.float64(0.9759797724399494),
  'recall': np.float64(0.9834394904458599),
  'f1': np.float64(0.9796954314720813),
  'number': np.int64(785)},
 'DRUG': {'precision': np.float64(0.9177126917712691),
  'recall': np.float64(0.9748148148148148),
  'f1': np.float64(0.9454022988505747),
  'number': np.int64(675)},
 'FAMILY': {'precision': np.float64(0.9738562091503268),
  'recall': np.float64(0.9933333333333333),
  'f1': np.float64(0.9834983498349835),
  'number': np.int64(150)},
 'FREQ': {'precision': np.float64(0.896551724137931),
  'recall': np.float64(0.9837837837837838),
  'f1': np.float64(0.9381443298969072),
  'number': np.int64(1

SUBIR MODELO A HUGGING FACE HUB