In [1]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


# Parent folder that holds the saved model checkpoints
base_path = 'PlanTL-GOB-ES'

# Checkpoint sub-folders: paths_s is evaluated against dataset_s, paths_l against dataset_l
paths_s = [
    'bsc-bio-ehr-es',
    'bsc-bio-es',
    'roberta-base-biomedical-clinical-es',
]
paths_l = [
    'bsc-bio-ehr-es-large',
    'bsc-bio-es-large',
    'roberta-base-biomedical-clinical-es-large',
]

# Train/test CSV splits for each dataset
dataset_s = load_dataset(
    'csv',
    data_files=dict(train='train_temp_stratify.csv', test='test_temp_stratify.csv'),
)
dataset_l = load_dataset(
    'csv',
    data_files=dict(train='train_temp_large_stratify.csv', test='test_temp_large_stratify.csv'),
)



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
import pandas as pd

# Read the CSV file into a DataFrame
file_path = '../transformers/dataset_large.csv'  # Replace this with the path to your CSV file
df = pd.read_csv(file_path)

# Gather the summary: row count, dtype of every column, and frequency of each label.
# Replace 'Label' with the name of the column that holds the labels in your CSV.
num_elements = df.shape[0]
column_types = df.dtypes
label_counts = df['Label'].value_counts()

# Report the summary (each header is followed by its value on the next line)
print(f'Número de elementos en el CSV: {num_elements}')
print('Tipos de datos de cada columna:', column_types, sep='\n')
print('\nRecuento de cada etiqueta:', label_counts, sep='\n')

Número de elementos en el CSV: 407
Tipos de datos de cada columna:
Text     object
Label     int64
dtype: object

Recuento de cada etiqueta:
Label
0    305
1    102
Name: count, dtype: int64


In [7]:
import pandas as pd


def _print_class_counts(dataset, dataset_name, label_column, splits=('train', 'test')):
    """Print how many examples of class 0 and class 1 each split contains.

    Args:
        dataset: Object indexable by split name whose items can be turned into
            a ``pd.DataFrame`` (e.g. the object returned by ``load_dataset``).
        dataset_name: Label printed as the prefix of every header line.
        label_column: Name of the column that holds the class labels.
        splits: Split names to report, in order.
    """
    for split in splits:
        # Convert the split to a pandas DataFrame to make counting easy
        frame = pd.DataFrame(dataset[split])

        # Count the examples per class
        observed = frame[label_column].value_counts().to_dict()

        # value_counts() omits classes that never occur, so default them to 0
        class_counts = {label: observed.get(label, 0) for label in (0, 1)}

        print(f"{dataset_name} - {split.capitalize()} set:")
        print(f"Class 0: {class_counts[0]}")
        print(f"Class 1: {class_counts[1]}")
        print()


def count_classes_s(dataset, dataset_name, label_column='label'):
    """Print per-split class counts for a dataset labelled in the ``label`` column.

    Args:
        dataset: Object indexable by ``'train'`` and ``'test'``.
        dataset_name: Label printed as the prefix of every header line.
        label_column: Column holding the class labels (defaults to ``'label'``).
    """
    _print_class_counts(dataset, dataset_name, label_column)


def count_classes_l(dataset, dataset_name, label_column='Label'):
    """Print per-split class counts for a dataset labelled in the ``Label`` column.

    Args:
        dataset: Object indexable by ``'train'`` and ``'test'``.
        dataset_name: Label printed as the prefix of every header line.
        label_column: Column holding the class labels (defaults to ``'Label'``).
    """
    _print_class_counts(dataset, dataset_name, label_column)

# Count and display the class distribution of each dataset
count_classes_s(dataset=dataset_s, dataset_name='Dataset S')
count_classes_l(dataset=dataset_l, dataset_name='Dataset L')

Dataset S - Train set:
Class 0: 40
Class 1: 40

Dataset S - Test set:
Class 0: 10
Class 1: 10

Dataset L - Train set:
Class 0: 244
Class 1: 81

Dataset L - Test set:
Class 0: 61
Class 1: 21



In [9]:
from sklearn.model_selection import train_test_split


dataset_large = pd.read_csv('../transformers/dataset_large.csv')

# Check the class distribution of the original dataset
print("Distribución de clases en el dataset original:", dataset_large['Label'].value_counts(), sep='\n')

# Split into train and test sets, stratified on the label column (seeded for repeatability)
train_df, test_df = train_test_split(
    dataset_large,
    stratify=dataset_large['Label'],
    test_size=0.2,
    random_state=42,
)

# Check the class distribution of the resulting train and test sets
print("Distribución de clases en el dataset de entrenamiento:", train_df['Label'].value_counts(), sep='\n')
print("Distribución de clases en el dataset de prueba:", test_df['Label'].value_counts(), sep='\n')

Distribución de clases en el dataset original:
Label
0    305
1    102
Name: count, dtype: int64
Distribución de clases en el dataset de entrenamiento:
Label
0    244
1     81
Name: count, dtype: int64
Distribución de clases en el dataset de prueba:
Label
0    61
1    21
Name: count, dtype: int64


In [10]:
def metrics_s():
    """Evaluate every checkpoint in ``paths_s`` on the test split of ``dataset_s``.

    For each entry the function loads the sequence-classification model and
    tokenizer from ``f'{base_path}/{path}'``, tokenizes ``dataset_s`` using its
    ``text`` column, runs ``Trainer.predict`` on the test split (no training is
    performed) and prints accuracy, binary precision/recall/F1 and the
    confusion matrix with its four cells.

    Reads the notebook-level names ``paths_s``, ``base_path`` and ``dataset_s``.
    Returns ``None``; all results are printed.
    """
    for path in paths_s:
        model_path = f'{base_path}/{path}'
        print(f'MODELO: {model_path}')
        # Load the model from model_path (expected to be a local checkpoint folder)
        model = RobertaForSequenceClassification.from_pretrained(model_path)

        # Load the matching tokenizer from the same location
        tokenizer = RobertaTokenizer.from_pretrained(model_path)

        def tokenize_function(examples):
            """Tokenize the ``text`` column with max-length padding and truncation."""
            return tokenizer(examples['text'], padding='max_length', truncation=True)

        # Tokenize the dataset
        tokenized_datasets = dataset_s.map(tokenize_function, batched=True)

        # Show the column names as a sanity check
        print(tokenized_datasets['train'].column_names)
        print(tokenized_datasets['test'].column_names)

        # Trainer configuration used only for prediction (no training)
        training_args = TrainingArguments(
            output_dir='./results',
            per_device_eval_batch_size=16,
        )

        # Build the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            eval_dataset=tokenized_datasets['test']
        )

        # Run the model on the test split
        predictions = trainer.predict(tokenized_datasets['test'])

        # True labels and predicted classes (arg-max over the last axis of the model output)
        y_true = predictions.label_ids
        y_pred = predictions.predictions.argmax(-1)

        # Compute the metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

        # Pin the label order so the matrix is always 2x2. Without `labels`, a
        # test run in which only one class appears in y_true/y_pred yields a
        # 1x1 matrix and the four-way unpacking below raises ValueError.
        conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

        # Show the results
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1 Score: {f1:.4f}')
        print(f'Confusion Matrix:\n{conf_matrix}')

        # Unpack the confusion matrix cells (row-major order: tn, fp, fn, tp)
        tn, fp, fn, tp = conf_matrix.ravel()

        print(f'True Negatives: {tn}')
        print(f'False Positives: {fp}')
        print(f'False Negatives: {fn}')
        print(f'True Positives: {tp}')
        print()
        print()

In [11]:
def metrics_l():
    """Evaluate every checkpoint in ``paths_l`` on the test split of ``dataset_l``.

    For each entry the function loads the sequence-classification model and
    tokenizer from ``f'{base_path}/{path}'``, tokenizes ``dataset_l`` using its
    ``Text`` column, renames the ``Label`` column to ``labels``, runs
    ``Trainer.predict`` on the test split (no training is performed) and prints
    accuracy, binary precision/recall/F1 and the confusion matrix with its
    four cells.

    Reads the notebook-level names ``paths_l``, ``base_path`` and ``dataset_l``.
    Returns ``None``; all results are printed.
    """
    for path in paths_l:
        model_path = f'{base_path}/{path}'
        print(f'MODELO: {model_path}')
        # Load the model from model_path (expected to be a local checkpoint folder)
        model = RobertaForSequenceClassification.from_pretrained(model_path)

        # Load the matching tokenizer from the same location
        tokenizer = RobertaTokenizer.from_pretrained(model_path)

        def tokenize_function(examples):
            """Tokenize the ``Text`` column with max-length padding and truncation."""
            return tokenizer(examples['Text'], padding='max_length', truncation=True)

        # Tokenize the dataset
        tokenized_datasets = dataset_l.map(tokenize_function, batched=True)

        # Rename the capitalised label column to the name used for labels downstream
        tokenized_datasets = tokenized_datasets.rename_column("Label", "labels")

        # Show the column names as a sanity check
        print(tokenized_datasets['train'].column_names)
        print(tokenized_datasets['test'].column_names)

        # Trainer configuration used only for prediction (no training)
        training_args = TrainingArguments(
            output_dir='./results',
            per_device_eval_batch_size=16,
        )

        # Build the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            eval_dataset=tokenized_datasets['test']
        )

        # Run the model on the test split
        predictions = trainer.predict(tokenized_datasets['test'])

        # True labels and predicted classes (arg-max over the last axis of the model output)
        y_true = predictions.label_ids
        y_pred = predictions.predictions.argmax(-1)

        # Compute the metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

        # Pin the label order so the matrix is always 2x2. Without `labels`, a
        # test run in which only one class appears in y_true/y_pred yields a
        # 1x1 matrix and the four-way unpacking below raises ValueError.
        conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

        # Show the results
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1 Score: {f1:.4f}')
        print(f'Confusion Matrix:\n{conf_matrix}')

        # Unpack the confusion matrix cells (row-major order: tn, fp, fn, tp)
        tn, fp, fn, tp = conf_matrix.ravel()

        print(f'True Negatives: {tn}')
        print(f'False Positives: {fp}')
        print(f'False Negatives: {fn}')
        print(f'True Positives: {tp}')
        print()
        print()

In [12]:
# Evaluate every checkpoint in paths_s on the test split of the small dataset (dataset_s)
print('DATASET PEQUEÑO')
metrics_s()

DATASET PEQUEÑO
MODELO: PlanTL-GOB-ES/bsc-bio-ehr-es


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']
['text', 'label', 'input_ids', 'attention_mask']



  0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 0.8500
Precision: 0.8889
Recall: 0.8000
F1 Score: 0.8421
Confusion Matrix:
[[9 1]
 [2 8]]
True Negatives: 9
False Positives: 1
False Negatives: 2
True Positives: 8


MODELO: PlanTL-GOB-ES/bsc-bio-es


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']
['text', 'label', 'input_ids', 'attention_mask']


  0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 0.8000
Precision: 0.8000
Recall: 0.8000
F1 Score: 0.8000
Confusion Matrix:
[[8 2]
 [2 8]]
True Negatives: 8
False Positives: 2
False Negatives: 2
True Positives: 8


MODELO: PlanTL-GOB-ES/roberta-base-biomedical-clinical-es


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']
['text', 'label', 'input_ids', 'attention_mask']


  0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 0.8000
Precision: 0.8000
Recall: 0.8000
F1 Score: 0.8000
Confusion Matrix:
[[8 2]
 [2 8]]
True Negatives: 8
False Positives: 2
False Negatives: 2
True Positives: 8




In [13]:
# Evaluate every checkpoint in paths_l on the test split of the large dataset (dataset_l)
print('DATASET GRANDE')
metrics_l()

DATASET GRANDE
MODELO: PlanTL-GOB-ES/bsc-bio-ehr-es-large


Map:   0%|          | 0/325 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

['Text', 'labels', 'input_ids', 'attention_mask']
['Text', 'labels', 'input_ids', 'attention_mask']


  0%|          | 0/6 [00:00<?, ?it/s]

Accuracy: 0.9512
Precision: 0.9048
Recall: 0.9048
F1 Score: 0.9048
Confusion Matrix:
[[59  2]
 [ 2 19]]
True Negatives: 59
False Positives: 2
False Negatives: 2
True Positives: 19


MODELO: PlanTL-GOB-ES/bsc-bio-es-large


Map:   0%|          | 0/325 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

['Text', 'labels', 'input_ids', 'attention_mask']
['Text', 'labels', 'input_ids', 'attention_mask']


  0%|          | 0/6 [00:00<?, ?it/s]

Accuracy: 0.9512
Precision: 0.9474
Recall: 0.8571
F1 Score: 0.9000
Confusion Matrix:
[[60  1]
 [ 3 18]]
True Negatives: 60
False Positives: 1
False Negatives: 3
True Positives: 18


MODELO: PlanTL-GOB-ES/roberta-base-biomedical-clinical-es-large


Map:   0%|          | 0/325 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

['Text', 'labels', 'input_ids', 'attention_mask']
['Text', 'labels', 'input_ids', 'attention_mask']


  0%|          | 0/6 [00:00<?, ?it/s]

Accuracy: 0.9512
Precision: 0.9048
Recall: 0.9048
F1 Score: 0.9048
Confusion Matrix:
[[59  2]
 [ 2 19]]
True Negatives: 59
False Positives: 2
False Negatives: 2
True Positives: 19


