In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorboard as tf
from sklearn.model_selection import train_test_split

In [None]:
# Environment check: this notebook was run with torch 2.4.1+cu121.
import torch
print(torch.__version__)

2.4.1+cu121


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, EarlyStoppingCallback

from torch.optim import AdamW

In [None]:
# Load the labelled tweet dataset. The absolute /content path is specific to
# the environment this notebook was run in.
df = pd.read_csv('/content/labeled_twitter_depressive_full_clean_km2.csv')
# Quick schema check: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123984 entries, 0 to 123983
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   text            123984 non-null  object 
 1   depression      123984 non-null  float64
 2   not_depression  123984 non-null  float64
 3   cluster         123984 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 3.8+ MB


In [None]:
from sklearn.preprocessing import LabelEncoder

# Plain Python lists: raw tweet text as model input, the 'cluster' column as target.
X = df['text'].tolist()
y = df['cluster'].tolist()

In [None]:
# Hold out 20% of the samples. Stratifying on y keeps the label proportions the
# same in both splits; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
# Use the first CUDA GPU when one is visible to torch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [None]:
from transformers import AutoTokenizer

# Hub checkpoint used as the starting point for fine-tuning.
model_name = "tiya1012/swmh4_bert"

# The checkpoint ships a 4-class classifier head; requesting num_labels=2 with
# ignore_mismatched_sizes=True discards it and creates a freshly initialised
# 2-class head (see the shape-mismatch message this cell prints).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
model = model.to(device)
# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tiya1012/swmh4_bert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



In [None]:
# Tokenize each split in a single call: over-long texts are truncated and the
# sequences within a split are padded to a common length.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# NOTE(review): the message says "validation", but the second split encoded
# above is X_test; there is no separate validation split in this notebook.
print('Train & validation texts encoded')

Train & validation texts encoded


In [None]:
# Inspect which fields the tokenizer produced for each sample.
test_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """
    Class to store the tweet data as PyTorch Dataset
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of feature name -> per-sample sequences (tokenizer output)
        # labels: targets, index-aligned with the encodings
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoded feature at position idx, plus its label,
        # each converted to a tensor.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


print(TweetDataset.__doc__)


    Class to store the tweet data as PyTorch Dataset
    


In [None]:
# Pair each split's encodings with its labels so the Trainer can index samples.
train_dataset = TweetDataset(train_encodings, y_train)
test_dataset = TweetDataset(test_encodings, y_test)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

# 'balanced' weights are computed from the training labels only, so rarer
# classes contribute more to the loss.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# The weight tensor is placed on `device`, the same device the model was moved to.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
# Class-weighted cross-entropy; consumed by CustomTrainer.compute_loss.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer that replaces the model's built-in loss with the notebook-level,
    class-weighted ``loss_fn``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain a ``"labels"`` entry, which is
                removed before the forward pass so the model does not compute
                its own (unweighted) loss.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, whose Trainer passes this argument
                to ``compute_loss``; it is not used here. Without it the
                override raises ``TypeError`` on those versions.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # loss_fn is defined at notebook level (weighted nn.CrossEntropyLoss).
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainerCallback
import json

class SaveAllMetricsCallback(TrainerCallback):
    """Accumulates the latest log record on every logging event and writes the
    collected records to a JSON file when training ends."""

    def __init__(self, output_file="metrics.json"):
        # Records are buffered in memory and only written in on_train_end.
        self.metrics = []
        self.output_file = output_file

    def on_log(self, args, state, control, **kwargs):
        history = state.log_history
        if not history:
            return
        # Only the most recent entry is new at this point.
        self.metrics.append(history[-1])

    def on_train_end(self, args, state, control, **kwargs):
        with open(self.output_file, "w") as handle:
            json.dump(self.metrics, handle, indent=4)

In [None]:
# Callback instance that will write every logged record to all_training_metrics.json.
all_metrics_callback = SaveAllMetricsCallback(output_file="all_training_metrics.json")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict mapping metric name to a scalar value: ``accuracy``,
        ``balanced_acc``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Every value must be a plain scalar. Stray trailing commas previously
    # turned precision/recall into 1-tuples, which TensorBoard logging dropped.
    return {
        'accuracy': accuracy_score(labels, preds),
        'balanced_acc': balanced_accuracy_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds),
        'f1': f1_score(labels, preds)
    }

In [None]:
from transformers import Trainer, TrainingArguments

# Same settings as before, grouped by concern.
# NOTE(review): the logged run degrades after epoch 2 and collapses to a single
# predicted class by epoch 10; learning_rate=1e-4 may be too high for this
# fine-tune -- confirm before re-running.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; verify against the installed version.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=50,
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    # Batching / data loading
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_pin_memory=False,
    # Evaluate and checkpoint once per epoch, keep one checkpoint on disk, and
    # restore the highest-accuracy checkpoint when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)



In [None]:
# Early stopping with a patience of 10 evaluations on the tracked metric
# (training_args sets metric_for_best_model="accuracy").
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# NOTE(review): eval_dataset is the held-out test split, so early stopping and
# best-checkpoint selection both see the test data; the metrics reported later
# on test_dataset are therefore optimistic. Consider a separate validation split.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[all_metrics_callback, early_stopping]
)

# Runs fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Balanced Acc,Precision,Recall,F1
1,0.3689,0.357217,0.844497,0.840862,"(0.8231736130800491,)","(0.8143992519869098,)",0.818763
2,0.2987,0.35639,0.845465,0.840628,"(0.8310660877954655,)","(0.8054230949041609,)",0.818044
3,0.1906,0.487523,0.840344,0.83573,"(0.8231625407791211,)","(0.8021505376344086,)",0.812521
4,0.1291,0.56075,0.833085,0.822752,"(0.8474666101335595,)","(0.7475455820476858,)",0.794376
5,0.1027,0.607705,0.832359,0.833431,"(0.7853526536312849,)","(0.841234221598878,)",0.812334
6,0.1024,0.617135,0.83006,0.825288,"(0.8107201073928468,)","(0.7905563347358578,)",0.800511
7,0.0959,0.67469,0.826592,0.824893,"(0.7910787437414656,)","(0.812529219261337,)",0.801661
8,0.1335,0.653986,0.800097,0.782318,"(0.8486874088478367,)","(0.6529219261337074,)",0.738044
9,0.2005,0.691625,0.575231,0.507619,"(0.9764705882352941,)","(0.015521271622253389,)",0.030557
10,0.6962,0.697554,0.431302,0.5,"(0.4313021736500383,)","(1.0,)",0.602671


Trainer is attempting to log a value of "(0.8231736130800491,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8143992519869098,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8310660877954655,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8054230949041609,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8231625407791211,)" of type <class 'tuple'> for key "eval/precision" as a 

TrainOutput(global_step=18600, training_loss=0.31840455947383756, metrics={'train_runtime': 17551.7253, 'train_samples_per_second': 282.556, 'train_steps_per_second': 4.416, 'total_flos': 5.321381422442186e+16, 'train_loss': 0.31840455947383756, 'epoch': 12.0})

In [None]:
# Evaluate the trained model on the trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

Trainer is attempting to log a value of "(0.8310660877954655,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8054230949041609,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [None]:
# Show the metrics dict returned by trainer.evaluate().
print(f"Resultados: {results}")

Resultados: {'eval_loss': 0.3563902974128723, 'eval_accuracy': 0.8454651772391821, 'eval_balanced_acc': 0.8406281550254743, 'eval_precision': (0.8310660877954655,), 'eval_recall': (0.8054230949041609,), 'eval_f1': 0.8180436847103514, 'eval_runtime': 112.3029, 'eval_samples_per_second': 220.805, 'eval_steps_per_second': 3.455, 'epoch': 12.0}


# Save Model

In [None]:
# Persist only the weights (state_dict); reloading requires rebuilding the same
# architecture first.
torch.save(model.state_dict(), 'best_bert_model.pth')

# Load Model

In [None]:
from transformers import AutoTokenizer

model_name = "tiya1012/swmh4_bert"

# Rebuild the same architecture with a 2-class head; the freshly initialised
# head is overwritten by the saved weights on the next line.
model_load = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, and avoids torch.load's FutureWarning about
# arbitrary pickle execution. map_location="cpu" lets the checkpoint load on a
# machine without the GPU it was saved from; load_state_dict copies the values
# into model_load's existing parameters.
model_load.load_state_dict(torch.load('best_bert_model.pth', map_location="cpu", weights_only=True))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tiya1012/swmh4_bert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model_load.load_state_dict(torch.load('best_bert_model.pth'))


<All keys matched successfully>

In [None]:
# Put the reloaded model in evaluation mode; the module summary is displayed as
# the cell result.
model_load.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Use CustomTrainer rather than the plain Trainer so that eval_loss is computed
# with the same class-weighted loss_fn used during training. With the plain
# Trainer the loss comes from the model's built-in unweighted loss and is not
# comparable to the value reported by `trainer.evaluate()`.
trainer_eval = CustomTrainer(
    model=model_load,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Evaluate the reloaded weights on the test split to confirm they reproduce the
# metrics obtained right after training.
results_eval_load = trainer_eval.evaluate()

print(f"Resultados de evaluación: {results_eval_load}")

Trainer is attempting to log a value of "(0.8310660877954655,)" of type <class 'tuple'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "(0.8054230949041609,)" of type <class 'tuple'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Resultados de evaluación: {'eval_loss': 0.3495139181613922, 'eval_model_preparation_time': 0.0052, 'eval_accuracy': 0.8454651772391821, 'eval_balanced_acc': 0.8406281550254743, 'eval_precision': (0.8310660877954655,), 'eval_recall': (0.8054230949041609,), 'eval_f1': 0.8180436847103514, 'eval_runtime': 126.6355, 'eval_samples_per_second': 195.814, 'eval_steps_per_second': 24.48}
