In [2]:
# Install the third-party packages this notebook relies on into the kernel's
# environment (%pip targets the running kernel rather than the system Python).
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# releases; restart the kernel after installing if pip asks for it.
%pip install transformers datasets scikit-learn pandas xgboost torch

Collecting torch
  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Using cached torch-2.7.1-cp312-cp312-win_amd64.whl (216.1 MB)
Installing collected packages: torch
Successfully installed torch-2.7.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import torch

from google.colab import drive
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import DataLoader, Subset, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

In [None]:
class EsquizofreniaDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with one label per sample.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-sample values; every value is indexed with the sample position.
        labels: Sized, indexable collection of labels. Its length defines the
            size of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field of sample ``idx`` plus its label.

        The label is wrapped in a tensor and stored under the ``"labels"`` key,
        after all encoding fields.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset file is reachable.
drive.mount('/content/drive')
# Location of the diagnosis CSV on the mounted drive.
ruta_dataset = '/content/drive/MyDrive/ALBA/diagnosticos_F20_F20.89_con_descripcion.csv'
# The file is pipe-separated, hence sep="|".
df = pd.read_csv(ruta_dataset, sep="|")
# Hugging Face checkpoint identifier; reused below for the tokenizer, the
# classifier and the results path.
model_type = "dccuchile/bert-base-spanish-wwm-cased"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Strip stray whitespace from the column names
df.columns = df.columns.str.strip()

# Join all diagnosis columns (names starting with "Diag") into a single text
# string per row. Missing cells become "" and astype(str) guards against
# non-string cells (e.g. numeric codes), which would make " ".join raise TypeError.
diag_columns = [col for col in df.columns if col.startswith("Diag")]
df["text"] = df[diag_columns].fillna("").astype(str).agg(" ".join, axis=1)

print("Unique values in DIAG PSQ:", df["DIAG PSQ"].unique())

# Binary target: any "DIAG PSQ" value not listed here maps to NaN and is
# dropped in the next step.
df["label"] = df["DIAG PSQ"].map({
    "Esquizofrenia": 1,
    "Otros tipos de esquizofrenia": 0
})

# Remove rows with NaN labels; .copy() makes the filtered frame independent so
# the column assignment below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=["label"]).copy()
print(f"Dataset size after removing NaN labels: {len(df)}")

# The NaNs introduced by map() forced a float column; cast back to int so the
# labels are integer class ids.
df["label"] = df["label"].astype(int)

Unique values in DIAG PSQ: ['Otros tipos de esquizofrenia' 'Esquizofrenia' 'No encontrado']
Dataset size after removing NaN labels: 2277


In [None]:
# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Pull texts and labels out of the frame as plain Python lists
texts = df["text"].tolist()
labels = df["label"].tolist()

# Tokenize every text in one call: truncate to at most 512 tokens, pad, and
# return PyTorch tensors
tokenizer_options = dict(
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
)
encodings = tokenizer(texts, **tokenizer_options)

# Directory where the model and tokenizer are saved
results_root = "/content/drive/MyDrive/ALBA/resultados/"
save_path = results_root + model_type

In [None]:
# One seed for the train/test split, the random initialisation of the new
# classifier head and the DataLoader shuffling order, so a re-run from this
# cell onward is repeatable (GPU kernels may still add small nondeterminism).
SEED = 42
torch.manual_seed(SEED)

dataset = EsquizofreniaDataset(encodings, labels)

# Stratified 80/20 split over sample indices so both parts keep the class ratio
train_indices, test_indices = train_test_split(
    list(range(len(dataset))),
    test_size=0.2,
    stratify=labels,
    random_state=SEED
)

# Build the subsets
train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)

# Wrap in DataLoaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)



model = BertForSequenceClassification.from_pretrained(
    model_type,
    num_labels=2  # 2 classes: F20 (1) and F20.89 (0)
)

# Save model and tokenizer.
# NOTE: at this point the model has not been trained yet, so this stores the
# pretrained encoder with a freshly initialised classification head.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('/content/drive/MyDrive/ALBA/resultados/dccuchile/bert-base-spanish-wwm-cased/tokenizer_config.json',
 '/content/drive/MyDrive/ALBA/resultados/dccuchile/bert-base-spanish-wwm-cased/special_tokens_map.json',
 '/content/drive/MyDrive/ALBA/resultados/dccuchile/bert-base-spanish-wwm-cased/vocab.txt',
 '/content/drive/MyDrive/ALBA/resultados/dccuchile/bert-base-spanish-wwm-cased/added_tokens.json',
 '/content/drive/MyDrive/ALBA/resultados/dccuchile/bert-base-spanish-wwm-cased/tokenizer.json')

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Simple fine-tuning loop
model.train()
for epoch in range(8):  # adjust the number of epochs as needed
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        # Move every tensor of the batch to the training device
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch includes "labels", so the model returns the loss itself
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    # Mean batch loss over the epoch
    print(f"Epoch {epoch+1} loss: {total_loss / len(train_loader)}")

# Persist the fine-tuned weights. The only other save in the notebook runs
# before training, so without this step the checkpoint on disk would still
# hold the untrained classifier.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path);

Epoch 1: 100%|██████████| 228/228 [01:11<00:00,  3.20it/s, loss=0.387]


Epoch 1 loss: 0.5768945789650867


Epoch 2: 100%|██████████| 228/228 [01:12<00:00,  3.14it/s, loss=0.333]


Epoch 2 loss: 0.47276206791662334


Epoch 3: 100%|██████████| 228/228 [01:13<00:00,  3.09it/s, loss=0.448]


Epoch 3 loss: 0.38968559030214683


Epoch 4: 100%|██████████| 228/228 [01:14<00:00,  3.06it/s, loss=0.439]


Epoch 4 loss: 0.328384897426555


Epoch 5: 100%|██████████| 228/228 [01:14<00:00,  3.05it/s, loss=0.0366]


Epoch 5 loss: 0.2957181418314576


Epoch 6: 100%|██████████| 228/228 [01:14<00:00,  3.05it/s, loss=0.592]


Epoch 6 loss: 0.2549937566917945


Epoch 7: 100%|██████████| 228/228 [01:14<00:00,  3.04it/s, loss=0.238]


Epoch 7 loss: 0.2407357290340179


Epoch 8: 100%|██████████| 228/228 [01:14<00:00,  3.04it/s, loss=0.79]

Epoch 8 loss: 0.22701568664878755





In [None]:
# Switch to inference behaviour before scoring the held-out set
model.eval()
all_preds, all_labels = [], []

# No gradients are needed for evaluation
with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1)
        all_labels.extend(batch["labels"].cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Results: per-class metrics (label 0 -> "F20.89", label 1 -> "F20") and the
# confusion matrix
report = classification_report(all_labels, all_preds, target_names=["F20.89", "F20"])
print(report)
print(confusion_matrix(all_labels, all_preds))

              precision    recall  f1-score   support

      F20.89       0.80      0.93      0.86       307
         F20       0.79      0.52      0.63       149

    accuracy                           0.80       456
   macro avg       0.79      0.73      0.75       456
weighted avg       0.80      0.80      0.79       456

[[286  21]
 [ 71  78]]
