In [None]:
# Mount Google Drive when running on Colab; anywhere else fall back to local
# paths. Only the import sits inside `try`, so an ImportError raised while
# mounting the drive is not mistaken for "not running on Colab".
try:
    from google.colab import drive
except ImportError:
    USING_COLAB = False
else:
    drive.mount('/content/drive')
    USING_COLAB = True

In [None]:
# Verify by listing the files in the drive
!ls /content/drive/My\ Drive/GTSI/Codigos_ods/data

 dataset.csv   investigacion			   vinculacion
 integradora  'OSDG Community Dataset (OSDG-CD)'


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Report the installed transformers release; the TrainingArguments helper
# defined further down adapts to the installed version.
import transformers
print("Versión de transformers:", transformers.__version__)

Versión de transformers: 4.52.3


In [None]:
# !pip install hf_xet

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (list, pandas Series, numpy array, ...).
        labels: Sequence of integer class ids, aligned with `texts`.
        tokenizer: Callable used as
            `tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')`, returning a mapping with `input_ids` and
            `attention_mask`.
        max_length: Length every example is truncated/padded to.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialize both sequences as lists so __getitem__ always indexes by
        # position. A pandas Series is indexed by label, and after a shuffled
        # split its index is no longer 0..n-1, which would raise KeyError or
        # return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells (e.g. NaN read from a CSV).
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer output is flattened so each item is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def prepare_data(df, column="value"):
    """Encode the label column as consecutive integer class ids.

    The input DataFrame is left untouched; a copy carrying an extra
    `label_encoded` column is returned, so re-running the calling cell always
    starts from the same raw frame.

    Args:
        df: DataFrame containing the label column.
        column: Name of the column holding the raw labels.

    Returns:
        Tuple `(df_with_labels, label_encoder)` where `label_encoder` is the
        fitted `LabelEncoder`, needed later to map predicted ids back to the
        original labels.
    """
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(df[column])

    # assign() returns a new frame instead of mutating the caller's object.
    return df.assign(label_encoded=encoded_labels), label_encoder

In [None]:
def get_training_args_compatible(output_dir, strategy="steps"):
    """Build a `TrainingArguments` that works across transformers releases.

    The keyword that selects the evaluation schedule is `eval_strategy` in
    recent transformers releases and `evaluation_strategy` in older ones. The
    supported name is read from the `TrainingArguments` signature, so the same
    schedule is configured whichever release is installed (instead of silently
    switching between per-epoch and per-step evaluation depending on version).

    Args:
        output_dir: Directory for checkpoints; logs go to `<output_dir>/logs`.
        strategy: "steps" (default) evaluates and saves every 500 steps;
            "epoch" evaluates and saves once per epoch. Evaluation and saving
            always share the schedule because `load_best_model_at_end=True`
            needs a checkpoint for every evaluation.

    Returns:
        The configured `TrainingArguments` instance.

    Raises:
        ValueError: If `strategy` is neither "steps" nor "epoch".
    """
    import inspect

    if strategy not in ("steps", "epoch"):
        raise ValueError(
            f"strategy must be 'steps' or 'epoch', got {strategy!r}"
        )

    # Evaluation and checkpointing interval, used when strategy == "steps".
    interval_steps = 500

    # Detect the keyword name from the signature rather than from the text of
    # a TypeError, which is fragile and hides unrelated TypeErrors.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    if 'eval_strategy' in accepted:
        strategy_kwarg = 'eval_strategy'
        version_label = 'nueva'
    else:
        strategy_kwarg = 'evaluation_strategy'
        version_label = 'antigua'

    args = {
        'output_dir': output_dir,
        'num_train_epochs': 3,
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_steps': 100,
        'save_steps': interval_steps,
        'load_best_model_at_end': True,
        'save_total_limit': 2,
        'logging_dir': f'{output_dir}/logs',
        'save_strategy': strategy,
        strategy_kwarg: strategy,
        # Lower validation loss is better, hence greater_is_better=False.
        'metric_for_best_model': 'eval_loss',
        'greater_is_better': False,
    }
    if strategy == "steps":
        # Must line up with save_steps so the best checkpoint can be restored.
        args['eval_steps'] = interval_steps

    training_args = TrainingArguments(**args)
    print(f"✓ Usando TrainingArguments (versión {version_label})")
    return training_args

In [None]:
def train_model(model_name, train_dataset, val_dataset, num_labels, output_dir,
                early_stopping_patience=3):
    """Fine-tune a pretrained transformer for sequence classification.

    Args:
        model_name: Hub id or local path of the pretrained checkpoint.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for evaluation during and after training.
        num_labels: Number of target classes for the classification head.
        output_dir: Directory passed to `get_training_args_compatible` for
            checkpoints and logs.
        early_stopping_patience: Number of evaluations without improvement
            tolerated before training stops early.

    Returns:
        Tuple `(trainer, tokenizer)`: the trained `Trainer` and the tokenizer
        that belongs to `model_name`.

    Raises:
        OSError: If the checkpoint cannot be loaded and the failure is not the
            "only TensorFlow weights available" case handled below.
    """
    # The tokenizer does not depend on the weight format, so load it once.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        print(f"✓ Modelo {model_name} cargado con pesos PyTorch nativos")

    except OSError as e:
        # Retry from TensorFlow weights only when the error message points at
        # a missing PyTorch checkpoint; anything else is re-raised unchanged.
        if "pytorch_model.bin" in str(e) and "TensorFlow" in str(e):
            print(f"⚠️  Convirtiendo desde TensorFlow para {model_name}")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels,
                from_tf=True
            )
            print(f"✓ Modelo {model_name} cargado desde TensorFlow")
        else:
            raise

    # Version-tolerant TrainingArguments
    training_args = get_training_args_compatible(output_dir)

    def compute_metrics(eval_pred):
        # eval_pred unpacks to (logits, labels); argmax picks the class id.
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {
            'accuracy': accuracy_score(labels, predictions)
        }

    # Early stopping is optional: skip it only when the callback class itself
    # is missing. Any other failure while building the Trainer must surface
    # instead of being swallowed by a bare `except`.
    callbacks = []
    try:
        callbacks.append(
            EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)
        )
    except NameError:
        print("⚠️  EarlyStoppingCallback no disponible; se entrena sin early stopping")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    print(f"Entrenando {model_name}...")
    trainer.train()

    return trainer, tokenizer

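Características principales del código:

- **Expansión de datos:** la función `expand_dataframe()` convierte cada valor separado por comas en una fila individual (nota: esta función no está definida en las celdas visibles de este notebook; verificar dónde se aplica).
- **Soporte para BERT y RoBERTa:** entrena ambos modelos automáticamente, uno tras otro.
- **Validación:** división train/validation (80/20, estratificada) con early stopping.
- **Métricas:** calcula accuracy y loss sobre el conjunto de validación.
- **Guardado:** guarda los modelos entrenados y sus tokenizers para uso posterior.
- **Predicción:** incluye una función (`predict_example`) para hacer predicciones con los modelos entrenados.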

In [None]:
def predict_example(model_path, tokenizer_path, text, label_encoder):
    """Classify a single text with a saved model.

    Args:
        model_path: Directory or hub id of the fine-tuned model.
        tokenizer_path: Directory or hub id of the matching tokenizer.
        text: Text to classify.
        label_encoder: Fitted encoder used to turn the predicted class id back
            into the original label.

    Returns:
        Tuple `(predicted_label, confidence)`, where `confidence` is the
        softmax probability of the predicted class.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    text_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    encoded = text_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: switch to eval mode and skip gradient tracking.
    classifier.eval()
    with torch.no_grad():
        logits = classifier(**encoded).logits
        probabilities = torch.softmax(logits, dim=-1)
        best_class = probabilities.argmax(dim=-1).item()

    # Map the class id back to the original label.
    decoded = label_encoder.inverse_transform([best_class])
    return decoded[0], probabilities[0][best_class].item()


In [None]:
# Load the dataset from Drive on Colab, or from the local data folder otherwise
dataset_path = (
    "/content/drive/My Drive/GTSI/Codigos_ods/data/dataset.csv"
    if USING_COLAB
    else "./data/dataset.csv"
)
df = pd.read_csv(dataset_path)
df.keys()

Index(['text', 'value'], dtype='object')

In [None]:
# Encode the labels and report the classes found by the encoder
df_processed, label_encoder = prepare_data(df)
known_classes = label_encoder.classes_
print(f"\nClases únicas: {known_classes}")
print(f"Número de clases: {len(known_classes)}")


Clases únicas: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
Número de clases: 16


In [None]:
# Stratified 80/20 train/validation split with a fixed seed
all_texts = df_processed['text'].tolist()
all_labels = df_processed['label_encoded'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df_processed['label_encoded'],
)

In [None]:
# Checkpoints to fine-tune, keyed by the short name that is reused for the
# results/ and models/ sub-directories. Other candidates, currently disabled:
# 'distilbert-base-uncased' and 'albert-base-v2'.
models_config = dict(
    bert='google-bert/bert-base-uncased',
    roberta='FacebookAI/roberta-base',
)

In [None]:
# Fine-tune every configured checkpoint, evaluate it and save it to disk.
results = {}

for model_type, model_name in models_config.items():
    print(f"\n{'='*50}")
    print(f"Entrenando {model_type.upper()}")
    print(f"{'='*50}")

    # Tokenizer used to build the datasets for this checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Datasets
    train_dataset = TextClassificationDataset(
        train_texts, train_labels, tokenizer
    )
    val_dataset = TextClassificationDataset(
        val_texts, val_labels, tokenizer
    )

    # Build every path from a single base directory so the checkpoint dir,
    # the saved model, the saved tokenizer and the printed location agree.
    # (The local model used to be written to '/models/...', i.e. the
    # filesystem root, while its tokenizer went to './models/...'.)
    if USING_COLAB:
        base_dir = '/content/drive/My Drive/GTSI/Codigos_ods'
    else:
        base_dir = '.'
    str_output_dir = f'{base_dir}/results/{model_type}'
    model_dir = f'{base_dir}/models/{model_type}'

    # Train
    trainer, trained_tokenizer = train_model(
        model_name=model_name,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        num_labels=len(label_encoder.classes_),
        output_dir=str_output_dir
    )

    # Evaluate on the validation set and keep the metrics for the comparison
    eval_results = trainer.evaluate()
    results[model_type] = eval_results

    print(f"\nResultados {model_type}:")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")

    # Save model and tokenizer side by side so they can be reloaded together
    trainer.save_model(model_dir)
    trained_tokenizer.save_pretrained(model_dir)

    print(f"Modelo {model_type} guardado en {model_dir}")


Entrenando BERT


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Modelo google-bert/bert-base-uncased cargado con pesos PyTorch nativos
✓ Usando TrainingArguments (versión antigua)
Entrenando google-bert/bert-base-uncased...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msteven-araujo[0m ([33msteven-araujo-espol[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
500,1.196,1.160895,0.673944
1000,1.1768,1.037501,0.712031
1500,0.9756,1.062042,0.709302
2000,0.9458,1.015627,0.715472
2500,0.9909,0.890114,0.74205
3000,0.9435,0.875079,0.75
3500,0.8095,0.876393,0.754983
4000,0.8616,0.846229,0.761035
4500,0.6585,0.924086,0.75795
5000,0.6012,0.869582,0.766018


In [None]:

# Side-by-side summary of the validation metrics of every trained model
separator = '=' * 50
print("\n" + separator)
print("COMPARACIÓN DE RESULTADOS")
print(separator)

for model_type, metrics in results.items():
    print(model_type.upper() + ":")
    print("  Accuracy: " + format(metrics['eval_accuracy'], '.4f'))
    print("  Loss: " + format(metrics['eval_loss'], '.4f'))
