In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorboard as tf
from sklearn.model_selection import train_test_split

In [None]:
#torch version --> 2.4.1+cu121
import torch
print(torch.__version__)

2.5.1+cu124


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, EarlyStoppingCallback

from torch.optim import AdamW

In [None]:
df = pd.read_csv('/content/labeled_twitter_depressive_full_clean_km2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123984 entries, 0 to 123983
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   text            123984 non-null  object 
 1   depression      123984 non-null  float64
 2   not_depression  123984 non-null  float64
 3   cluster         123984 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 3.8+ MB


In [None]:
from sklearn.preprocessing import LabelEncoder

X = df['text'].values.tolist()
y = df['cluster'].values.tolist()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [None]:
from transformers import AutoTokenizer

model_name = "tiya1012/swmh4_mtb"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tiya1012/swmh4_mtb and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

print('Train & validation texts encoded')

Train & validation texts encoded


In [None]:
test_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """
    Class to store the tweet data as PyTorch Dataset
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # an encoding can have keys such as input_ids and attention_mask
        # item is a dictionary which has the same keys as the encoding has
        # and the values are the idxth value of the corresponding key (in PyTorch's tensor format)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


print(TweetDataset.__doc__)


    Class to store the tweet data as PyTorch Dataset
    


In [None]:
train_dataset = TweetDataset(train_encodings, y_train)
test_dataset = TweetDataset(test_encodings, y_test)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

# Definir la función de pérdida ponderada
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
from transformers import Trainer

# Definir una clase personalizada para el modelo, sobrescribiendo el método 'compute_loss'
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainerCallback
import json

# Callback personalizado para guardar métricas de entrenamiento y validación
class SaveAllMetricsCallback(TrainerCallback):
    def __init__(self, output_file="metrics.json"):
        self.output_file = output_file
        self.metrics = []

    def on_log(self, args, state, control, **kwargs):
        # Guardar las métricas en cada log
        if state.log_history:
            # Captura el último registro de métricas
            last_log = state.log_history[-1]
            # Asegúrate de que las métricas de computación estén incluidas
            self.metrics.append(last_log)

    def on_train_end(self, args, state, control, **kwargs):
        # Al final del entrenamiento, guardar todas las métricas en un archivo JSON
        with open(self.output_file, "w") as f:
            json.dump(self.metrics, f, indent=4)

In [None]:
# Instanciar el callback para guardar todas las métricas
all_metrics_callback = SaveAllMetricsCallback(output_file="all_training_metrics.json")

In [None]:
#from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    balanced_acc = balanced_accuracy_score(labels, preds)
    precision = precision_score(labels, preds),
    recall = recall_score(labels, preds),
    f1 = f1_score(labels, preds)

    return {
        'accuracy': acc,
        'balanced_acc' : balanced_acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=50,
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    learning_rate=1.5e-4,
    lr_scheduler_type="linear",      # Usar scheduler lineal
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    save_total_limit=1,              # limit the total amount of checkpoints. Deletes the older checkpoints.
    dataloader_pin_memory=False,     # Whether you want to pin memory in data loaders or not. Will default to True
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    #evaluation_strategy="steps",
    #logging_steps=100,
    logging_dir='./logs',
    report_to="none",  # Desactivar wandb
)



In [None]:
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

trainer = CustomTrainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,  # The function that will be used to compute metrics at evaluation
    callbacks=[all_metrics_callback, early_stopping]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Balanced Acc,Precision,Recall,F1
1,0.381,0.374045,0.838085,0.833462,"(0.8202915228231684,)","(0.7998129967274428,)",0.809923
2,0.3125,0.381125,0.836875,0.826988,"(0.85,)","(0.7550257129499767,)",0.799703
3,0.2211,0.531665,0.831834,0.827176,"(0.8124102269462798,)","(0.7932678821879383,)",0.802725
4,0.1572,0.537011,0.827398,0.824179,"(0.799402594978064,)","(0.800748013090229,)",0.800075
5,0.3162,0.701322,0.431302,0.5,"(0.4313021736500383,)","(1.0,)",0.602671
6,0.6538,0.603603,0.61209,0.656679,"(0.5270188830855765,)","(0.9812061711079944,)",0.685725
7,0.5627,0.625797,0.588378,0.636249,"(0.5118596286575289,)","(0.9846657316503039,)",0.673574
8,0.6954,0.696635,0.431302,0.5,"(0.4313021736500383,)","(1.0,)",0.602671
9,0.6954,0.693289,0.431302,0.5,"(0.4313021736500383,)","(1.0,)",0.602671
10,0.695,0.696481,0.431302,0.5,"(0.4313021736500383,)","(1.0,)",0.602671


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=17050, training_loss=0.4809161593976958, metrics={'train_runtime': 16249.0832, 'train_samples_per_second': 305.208, 'train_steps_per_second': 4.769, 'total_flos': 4.877932970572003e+16, 'train_loss': 0.4809161593976958, 'epoch': 11.0})

In [None]:
# Evaluar el modelo
results = trainer.evaluate()

In [None]:
#trainer.save_model('./best_model2')

In [None]:
print(f"Resultados: {results}")

Resultados: {'eval_loss': 0.3740449547767639, 'eval_accuracy': 0.8380852522482558, 'eval_balanced_acc': 0.8334620224028648, 'eval_precision': (0.8202915228231684,), 'eval_recall': (0.7998129967274428,), 'eval_f1': 0.8099228329309284, 'eval_runtime': 117.9486, 'eval_samples_per_second': 210.236, 'eval_steps_per_second': 3.29, 'epoch': 11.0}


# Save Model

In [None]:
torch.save(model.state_dict(), 'best_mtb_model.pth')

#Load Model

In [None]:
from transformers import AutoTokenizer

model_name = "tiya1012/swmh4_mtb"

model_load = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
model_load.load_state_dict(torch.load('best_mtb_model.pth'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tiya1012/swmh4_mtb and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model_load.load_state_dict(torch.load('best_mtb_model.pth'))


<All keys matched successfully>

In [None]:
model_load.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
training_args_eval = TrainingArguments(
    output_dir='./results_eval',
    report_to="none",  # Desactivar wandb
)

In [None]:
trainer_eval = Trainer(
    model=model_load,
    args=training_args_eval,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Evaluar el modelo en el conjunto de evaluación
results_eval_load = trainer_eval.evaluate()

# Imprimir los resultados para comprobar que son los mismos
print(f"Resultados de evaluación: {results_eval_load}")

Resultados de evaluación: {'eval_loss': 0.36632803082466125, 'eval_model_preparation_time': 0.0067, 'eval_accuracy': 0.8380852522482558, 'eval_balanced_acc': 0.8334620224028648, 'eval_precision': (0.8202915228231684,), 'eval_recall': (0.7998129967274428,), 'eval_f1': 0.8099228329309284, 'eval_runtime': 126.4809, 'eval_samples_per_second': 196.053, 'eval_steps_per_second': 24.51}
