Fuentes: https://medium.com/nlplanet/fine-tuning-distilbert-on-senator-tweets-a6f2425ca50e

#### **Instalar Módulos**

conda install datasets=="2.20.0"

conda install transformers=="4.40.1"

conda install numpy=="1.26.4" # La última versión no funciona bien


In [1]:
# Run only once, then restart the runtime; after that these lines can stay commented out.

#!pip install datasets=="2.20.0"
#!pip install transformers=="4.40.1"
#!pip install numpy=="1.26.4"
#!pip install transformers
#!pip install --upgrade --force-reinstall pandas datasets pyarrow

In [2]:
# Data processing
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from datasets import Dataset,  DatasetDict
#from UA_MDM_LDI_II.tutoriales.utils import plot_confusion_matrix

# Modeling
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler

# Progress bar
from tqdm.auto import tqdm

# Check that CUDA is available (displayed as the cell's output)
torch.cuda.is_available()

True

**Bajamos el modelo**

In [3]:
from transformers import DistilBertTokenizerFast

# Fetch the pretrained fast tokenizer for the DistilBERT checkpoint.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

**Armado de los Datasets**

In [4]:
# Paths
# BASE_DIR is a URL, not a filesystem path, so the pieces are joined with an
# explicit "/" rather than os.path.join (whose separator depends on the OS and
# which only produced a valid URL because BASE_DIR ends with a slash).
BASE_DIR = 'https://raw.githubusercontent.com/MartinGaddi12/labo-ii/refs/heads/develop/'
_BASE_URL = BASE_DIR.rstrip("/")
PATH_TO_TRAIN = f"{_BASE_URL}/train.csv"
PATH_TO_TEMP_FILES = f"{_BASE_URL}/work/optuna_temp_artifacts"
PATH_TO_OPTUNA_ARTIFACTS = f"{_BASE_URL}/work/optuna_artifacts"

# Parameters and variables
SEED = 7          # random_state for the train/test split
TEST_SIZE = 0.2   # fraction of rows held out as the test split
BATCH_SIZE = 16   # mini-batch size of the data loaders

In [5]:
# Read the CSV, keep only rows that have a description, and expose the target
# (AdoptionSpeed) under the column name 'labels'.
pets_df = (
    pd.read_csv(PATH_TO_TRAIN, sep=",")
    .dropna(subset=['Description'])
    .assign(labels=lambda frame: frame["AdoptionSpeed"])
)

# Stratified train/test split with sklearn
train_df, test_df = train_test_split(
    pets_df,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=pets_df["AdoptionSpeed"],
)

# Wrap each split as a datasets.Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)

# Bundle both splits and encode the 'labels' column as a ClassLabel feature
dataset = DatasetDict(train=train_dataset, test=test_dataset).class_encode_column('labels')

# Every column except 'labels' is dropped when the dataset is tokenized
cols_to_remove = [name for name in dataset["train"].column_names if name != 'labels']
print(cols_to_remove)

Stringifying the column:   0%|          | 0/11984 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/11984 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/2996 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2996 [00:00<?, ? examples/s]

['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID', 'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', '__index_level_0__']


In [6]:
# ClassLabel feature describing the encoded 'labels' column of the train split
train_features = dataset["train"].features
class_label = train_features["labels"]

# Original class names, in the order of their encoded integer ids
classes = class_label.names
classes

['0', '1', '2', '3', '4']

In [7]:
# Tokenize and encode the dataset
def tokenize(batch):
    """Tokenize the ``Description`` field of a batch of examples.

    Args:
        batch: Mapping with a "Description" key holding the texts to encode
            (``Dataset.map(..., batched=True)`` passes a dict of column lists).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
        Sequences are left unpadded: padding is applied per mini-batch by the
        ``DataCollatorWithPadding`` used for the data loaders, so batches are
        only as long as their longest member.
    """
    # Reuse the tokenizer loaded once at notebook level instead of calling
    # from_pretrained again for every batch.
    return tokenizer(batch["Description"], truncation=True, max_length=512)

# Tokenize both splits with 4 worker processes, dropping the columns listed in cols_to_remove
dataset_enc = dataset.map(
    tokenize,
    batched=True,
    remove_columns=cols_to_remove,
    num_proc=4,
)

# Serve the model inputs as PyTorch tensors
model_input_columns = ['input_ids', 'attention_mask', 'labels']
dataset_enc.set_format('torch', columns=model_input_columns)

# Show which columns are left after encoding
print(dataset_enc["train"].column_names)



Map (num_proc=4):   0%|          | 0/11984 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/2996 [00:00<?, ? examples/s]



['labels', 'input_ids', 'attention_mask']


In [8]:
# Collator that pads every mini-batch with the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Options shared by the train and evaluation loaders
loader_options = dict(batch_size=BATCH_SIZE, collate_fn=data_collator)

# Only the training loader shuffles its examples
train_dataloader = DataLoader(dataset_enc["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(dataset_enc["test"], **loader_options)

In [9]:
# Number of classes comes from the encoded 'labels' feature of the dataset
num_labels = dataset["train"].features['labels'].num_classes
print(f"Number of labels: {num_labels}")

# Seed torch's RNG before the model is built: the classification head is newly
# (randomly) initialised and the training loader shuffles, so without a seed
# every run trains a different model and reports a different kappa.
# NOTE: some CUDA kernels may still be non-deterministic.
torch.manual_seed(SEED)

# Load the pretrained encoder with a fresh classification head of num_labels outputs
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                           num_labels=num_labels)

Number of labels: 5




model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Model parameters
learning_rate = 5e-5
num_epochs = 5

# Create the optimizer.
# transformers.AdamW is deprecated in favour of the PyTorch implementation;
# eps and weight_decay are set to the values the transformers class used by
# default so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Learning-rate schedule: one scheduler step per training batch
num_training_batches = len(train_dataloader)
num_training_steps = num_epochs * num_training_batches
lr_scheduler = get_scheduler(
    "linear",                   # linear decay
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)





**Miramos el Modelo**

In [11]:

# Use the GPU when CUDA is usable, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# Move the model to that device (the returned module is displayed as the cell output)
model.to(device)


cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

**Entrenamos**

In [12]:
# One progress-bar tick per optimisation step
progress_bar = tqdm(range(num_training_steps))

# Plain PyTorch training loop
model.train()
for epoch_index in range(num_epochs):
    for cpu_batch in train_dataloader:
        # Move every tensor of the batch to the model's device
        batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}

        # Forward pass; the batch contains 'labels', so the model output carries the loss
        loss = model(**batch).loss
        loss.backward()

        # Update weights, advance the LR schedule, then clear gradients for the next step
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3745 [00:00<?, ?it/s]

**Obtenemos la kappa base**

In [13]:
from sklearn.metrics import cohen_kappa_score

# Accumulators for the predicted and true class ids of the whole test split
all_predictions = []
all_labels = []

# Run the model over the evaluation loader without tracking gradients
model.eval()
with torch.no_grad():
    for cpu_batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
        logits = model(**batch).logits

        # Predicted class = index of the largest logit; bring results back to CPU/NumPy
        all_predictions.extend(logits.argmax(dim=-1).cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

# Lists -> NumPy arrays
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Quadratic Weighted Kappa between true and predicted classes
qwk = cohen_kappa_score(all_labels, all_predictions, weights='quadratic')

print(f"Quadratic Weighted Kappa: {qwk}")


Quadratic Weighted Kappa: 0.24254343783507526


**Predecimos un ejemplo de descripción**

In [14]:
# One example description taken from the test split
desc = test_df.iloc[4]['Description']
print(desc)

# Tokenize the text and move the tensors to the model's device
inputs = tokenizer(desc, padding=True, truncation=True, return_tensors="pt").to(device)

# Inference only: make sure dropout is disabled and skip autograd bookkeeping,
# so the cell gives the same answer regardless of which cell ran before it.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Convert logits to class probabilities
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
probabilities = predictions.detach().cpu().numpy()
predicted_class = np.argmax(probabilities, axis=1)

# Print floats with 8 decimals instead of scientific notation.
# NOTE: set_printoptions changes NumPy's global print settings for the rest of the session.
np.set_printoptions(suppress=True, formatter={'float_kind': '{:.8f}'.format})
print(probabilities[0])
print(predicted_class)

Has large paws and big bone but i cannot guarantee he will be large size or purebred. Puppy is a mixed breed. He was found with a long slash wound on his head but has since recovered after getting treatment form the vet. He is now certified healthy. brave, adventurous, curious and active little fellow-ton
[0.00118142 0.28571945 0.66658783 0.04516295 0.00134835]
[2]


In [15]:
# Display the predicted class ids collected for the test split during evaluation
all_predictions

array([4, 2, 3, ..., 2, 1, 2], shape=(2996,))

In [16]:
# Reload the full CSV, keep rows with a description and copy AdoptionSpeed into 'labels'
Base_comp = (
    pd.read_csv(PATH_TO_TRAIN, sep=",")
    .dropna(subset=['Description'])
    .assign(labels=lambda frame: frame["AdoptionSpeed"])
)

In [17]:
# Put the model in evaluation mode
model.eval()

# Per-row results: class probabilities, predicted class and pet id
all_probabilities_download = []
all_predictions_download = []
all_ids_download = []

# NOTE(review): Base_comp is read from PATH_TO_TRAIN, the same file the
# fine-tuning split was drawn from, so most of these predictions are for rows
# the model saw during training — confirm that is intended before reusing them.

# Walk over every (PetID, Description) pair of the full dataset, one text at a time
id_and_text = zip(Base_comp['PetID'], Base_comp['Description'])
for pet_id, description in tqdm(id_and_text, total=len(Base_comp)):
    # Tokenize the description and move it to the model's device
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt").to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax turns the logits into class probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    # Predicted class = the one with the highest probability
    predicted_class = probabilities.argmax(axis=1).item()

    # Store the results (probabilities flattened to a 1-D array)
    all_ids_download.append(pet_id)
    all_probabilities_download.append(probabilities.flatten())
    all_predictions_download.append(predicted_class)

# Assemble the results into a DataFrame
results_df = pd.DataFrame(
    dict(
        ID=all_ids_download,
        Probabilities=all_probabilities_download,
        Prediction=all_predictions_download,
    )
)

# Write the results to a CSV file
results_df.to_csv('predictions_desc.csv', index=False)

# Show the first rows of the resulting DataFrame
print(results_df.head())

  0%|          | 0/14980 [00:00<?, ?it/s]

          ID                                      Probabilities  Prediction
0  86e1089a3  [0.000752545, 0.15268189, 0.80045265, 0.045552...           2
1  6296e909a  [0.028924825, 0.31997138, 0.54787356, 0.096991...           2
2  3422e4906  [0.0008947758, 0.0027402993, 0.12768795, 0.865...           3
3  5842f1ff5  [0.0005042793, 0.061550587, 0.846091, 0.090763...           2
4  850a43f90  [0.0014846955, 0.017332021, 0.69236803, 0.2825...           2


In [22]:
from google.colab import files

# Ask the browser to download the predictions CSV (Colab only)
predictions_csv_path = '/content/predictions_desc.csv'
files.download(predictions_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
import shutil
from google.colab import files

# Folder that receives the fine-tuned model and its tokenizer
model_dir = "/content/mi_modelo"

# 1. Save the model and the tokenizer into that folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)

# 2. Compress the folder into a ZIP archive next to it
shutil.make_archive(model_dir, 'zip', model_dir)

# 3. Download the ZIP archive (Colab only)
files.download(model_dir + ".zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
# Put the model in evaluation mode
model.eval()

# Per-row results: class probabilities, predicted class and pet id
all_probabilities_download = []
all_predictions_download = []
all_ids_download = []

# Walk over every (PetID, Description) pair of the test split, one text at a time
id_and_text = zip(test_df['PetID'], test_df['Description'])
for pet_id, description in tqdm(id_and_text, total=len(test_df)):
    # Tokenize the description and move it to the model's device
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt").to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax turns the logits into class probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    # Predicted class = the one with the highest probability
    predicted_class = probabilities.argmax(axis=1).item()

    # Store the results (probabilities flattened to a 1-D array)
    all_ids_download.append(pet_id)
    all_probabilities_download.append(probabilities.flatten())
    all_predictions_download.append(predicted_class)

# Assemble the results into a DataFrame
results_df = pd.DataFrame(
    dict(
        ID=all_ids_download,
        Probabilities=all_probabilities_download,
        Prediction=all_predictions_download,
    )
)

# Write the results to a CSV file
results_df.to_csv('predictions_desc_test.csv', index=False)

# Show the first rows, then ask the browser to download the CSV (Colab only)
print(results_df.head())
files.download('/content/predictions_desc_test.csv')


  0%|          | 0/2996 [00:00<?, ?it/s]

          ID                                      Probabilities  Prediction
0  89d4cef63  [0.0019559788, 0.002046748, 0.013240888, 0.055...           4
1  cc7d90ef4  [0.0052554854, 0.09236715, 0.71967584, 0.18120...           2
2  02cd64831  [0.00067357335, 0.0034487182, 0.47987753, 0.51...           3
3  9c95b7659  [0.0040065325, 0.9177144, 0.059182093, 0.01611...           1
4  ae4ab31ec  [0.0011814154, 0.28571945, 0.6665878, 0.045162...           2


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>