<a href="https://colab.research.google.com/github/ronie-1989/Master-Thesis/blob/main/self_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Mon Oct  2 19:43:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install wandb

In [13]:
import torch
import transformers
import evaluate
import wandb
import os

from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler, AutoModel, BertModel
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
#from torcheval.metrics.functional import multiclass_confusion_matrix

import torch.nn as nn
import torch.optim as optim
import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns

In [14]:
class SBertModel(torch.nn.Module):
  def __init__(self, model_name, num_classes=2, dropout_prob=0.1):
    super(SBertModel, self).__init__()
    #self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.bert = AutoModel.from_pretrained(model_name)
    self.dropout = nn.Dropout(p=dropout_prob)
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, input_ids, attention_mask):
    #input_ids = self.tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    pooled_output = self.dropout(outputs.last_hidden_state[:, 0, :])
    logits = self.classifier(pooled_output)
    return logits

In [15]:
def save_model(model, optimizer, epoch, save_path):
  checkpoint = {"epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()
                }

  checkpoint_path = f"{save_path}_epoch_{epoch}.pt"
  torch.save(checkpoint, checkpoint_path)
  print(f"Model checkpoint at epoch {epoch}: {checkpoint_path}")

In [16]:
def evaluate_model(model, dataloader, loss_fn, device):
  test_loss = 0.0
  accuracy_metric = evaluate.load("accuracy")
  f1_metric = evaluate.load("f1")

  all_preds = []
  all_labels = []

  model.eval()

  for batch in dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}

    with torch.no_grad():
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      logits = model(input_ids, attention_mask)

      predictions = torch.argmax(logits, dim=1)
      labels = batch["labels"]

      all_preds.extend(predictions)
      all_labels.extend(labels)

      loss = loss_fn(logits, labels)
      test_loss += loss.item()


    test_loss /= len(dataloader)

    valid_accuracy = accuracy_metric.compute(predictions=all_preds, references= all_labels)
    valid_f1 = f1_metric.compute(predictions=all_preds, references= all_labels)

    return all_preds, all_labels, test_loss, valid_accuracy, valid_f1

In [17]:
def train_model(model, optimizer, loss_fn, train_dataloader, valid_dataloader, num_epochs, device, save_path):

  training_losses = []
  validation_losses = []
  validation_accuracies = []
  validation_f1_scores = []

  model.train()
  progress_bar = tqdm(range(num_epochs * len(train_dataloader)))

  for epoch in range(num_epochs):
    train_loss = 0.0

    for batch in train_dataloader:
      batch = {k:v.to(device) for k,v in batch.items()}
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      labels = batch['labels']
      outputs = model(input_ids, attention_mask)
      loss = loss_fn(outputs, labels)
      loss.backward()

      optimizer.step()
      optimizer.zero_grad()

      train_loss += loss.item()
      progress_bar.update(1)

    train_loss /= len(train_dataloader)
    training_losses.append(train_loss)

    preds, labels, valid_loss, valid_accuracy, valid_f1 = evaluate_model(model, valid_dataloader, loss_fn, device)

    validation_losses.append(valid_loss)
    validation_accuracies.append(valid_accuracy)
    validation_f1_scores.append(valid_f1)

    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "validation_loss": valid_loss,
        "validation_accuracy": valid_accuracy,
        "validation_f1": valid_f1
    })

    save_model(model, optimizer, epoch, save_path)

  return training_losses, validation_losses, validation_accuracies, validation_f1_scores

In [18]:
def main():

  # read dataset
  dataset = load_dataset('csv', data_files="/content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/Gab Hate Corpus/ghc_train.tsv", sep='\t')
  small_dataset = dataset['train'].shard(num_shards=20, index=0) # this is for debugging purposes only

  # create the train test split
  train_test_split = small_dataset.train_test_split(test_size=0.1) # try with bigger test size
  train_dataset = train_test_split['train']
  valid_dataset = train_test_split['test']

  #tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")
  tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")

  def tokenization(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)
    #return tokenizer(example['text'], padding=True, truncation=True, return_tensors='pt')

  def return_tokenized(in_dataset):
    tokenized_dataset = in_dataset.map(tokenization, batched=True)
    tokenized_dataset = tokenized_dataset.remove_columns(['text', 'cv', 'vo']).rename_column('hd', 'labels')
    tokenized_dataset.set_format('torch')
    return tokenized_dataset

  # create dataloaders
  tokenized_train = return_tokenized(train_dataset)
  train_dataloader = DataLoader(tokenized_train, batch_size=16)

  tokenized_valid = return_tokenized(valid_dataset)
  valid_dataloader = DataLoader(tokenized_valid, batch_size=16)

  num_epochs = 10
  model = SBertModel("nreimers/MiniLM-L6-H384-uncased")
  #model = SBertModel("sentence-transformers/all-MiniLM-L12-v2")
  optimizer = optim.AdamW(model.parameters(), lr=5e-5)
  loss_criterion = nn.CrossEntropyLoss()

  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  model.to(device)


  run = wandb.init(project="sbert-master_thesis", config={"optimizer": optimizer,
                                                     "epochs": num_epochs})

  print(f"wandb run name: {run.name}")


  save_path = os.path.join("/content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/", run.name)
  os.mkdir(save_path)
  save_path = "/content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/" + run.name + "/"

  train_model(model, optimizer, loss_criterion, train_dataloader, valid_dataloader, num_epochs, device, save_path)

  wandb.finish()

In [19]:
if __name__ == "__main__":
  main()

Map:   0%|          | 0/991 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

wandb run name: logical-firefly-77


  0%|          | 0/620 [00:00<?, ?it/s]

Model checkpoint at epoch 0: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_0.pt
Model checkpoint at epoch 1: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_1.pt
Model checkpoint at epoch 2: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_2.pt
Model checkpoint at epoch 3: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_3.pt
Model checkpoint at epoch 4: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_4.pt
Model checkpoint at epoch 5: /content/drive/MyDrive/RWTH Aachen Media Informatics/Semester 6/Master Thesis/checkpoint_logs/logical-firefly-77/_epoch_5.pt
Model checkpoint at epoch 6: /content/drive/MyDrive/RWTH Aachen Media Inform

VBox(children=(Label(value='0.001 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.096335…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▇▄▂▁▁▁▁▁▁
validation_loss,▂▁▃▂▆▃▆█▅▅

0,1
epoch,9.0
train_loss,0.00025
validation_loss,0.05026
