In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
from google.colab import drive
# mount drive to access data
drive.mount('/content/drive')
# load data folder into working directory
!cp -r drive/MyDrive/data .

Mounted at /content/drive


In [None]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 AdamW, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Read the preprocessed tweets for the training and validation splits.
train_df, val_df = map(pd.read_csv, ('data/train.csv', 'data/val.csv'))

In [None]:
# Keep only the label column and the tweet text; every other column is dropped.
train_df, val_df = (
    frame[['PosSentiment', 'TweetText']] for frame in (train_df, val_df)
)

In [None]:
# Wrap both pandas frames as Hugging Face `Dataset` objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [None]:
# Downsample for fine-tuning: keep the leading rows of each split (up to 3000
# for training, 625 for validation). The count is capped at the split's actual
# length so that a smaller CSV does not make `select` fail on an out-of-range
# index.
train_dataset = train_dataset.select(range(min(3000, len(train_dataset))))
val_dataset = val_dataset.select(range(min(625, len(val_dataset))))

### Prepare Input for BERT

In [None]:
# Load the tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Rename the raw CSV columns to "labels" and "text" in both splits.
for old_name, new_name in (("PosSentiment", "labels"), ("TweetText", "text")):
    train_dataset = train_dataset.rename_column(old_name, new_name)
    val_dataset = val_dataset.rename_column(old_name, new_name)

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every example ends up with the same number of tokens.

    Args:
        examples: Mapping with a "text" entry holding the raw tweet text.

    Returns:
        The tokenizer's output for the given text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

In [None]:
# Tokenize both splits batch-wise; the tokenizer outputs are added as new columns.
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

In [None]:
# Encode the training labels as a ClassLabel feature; its class names define
# the label -> id mapping the model is trained on.
train_dataset = train_dataset.class_encode_column("labels")

# Encoding the validation split on its own derives the mapping from whichever
# classes occur in that split, so a class missing from the validation sample
# would shift the ids relative to training. Encode it, then re-align it to the
# training mapping so that an id denotes the same class in both splits.
val_dataset = val_dataset.class_encode_column("labels")
label2id = {
    name: idx for idx, name in enumerate(train_dataset.features["labels"].names)
}
val_dataset = val_dataset.align_labels_with_mapping(label2id, "labels")

Stringifying the column:   0%|          | 0/3000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3000 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/625 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/625 [00:00<?, ? examples/s]

### Class-Weighted Cross-Entropy Loss

In [None]:
# Training labels as a NumPy array; used below to derive the class weights.
y = np.asarray(train_dataset['labels'])

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# One 'balanced' weight per distinct label value found in the training labels.
class_weights = compute_class_weight(
    y=y,
    classes=np.unique(y),
    class_weight='balanced',
)

In [None]:
# Convert the class weights to a float32 tensor living on the training device.
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

In [None]:
# Cross-entropy criterion that scales each class's contribution by its entry
# in `class_weights`.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)


class WeightedTrainer(Trainer):
    """Trainer that scores the model's logits with the weighted `loss_fn`.

    The labels are removed from the inputs before the forward pass, so the
    loss is always computed here rather than inside the model.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dictionary; must contain a "labels" entry, which is
                popped (the dictionary is modified in place).
            return_outputs: When true, return the model outputs next to the loss.
            num_items_in_batch: Accepted for compatibility with newer `Trainer`
                versions that pass this keyword; it is not used.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple if `return_outputs`
            is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # The first element of the model outputs is taken as the logits.
        logits = outputs[0]
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

### Load Model and Freeze Layers

In [None]:
# Download the pretrained checkpoint with a 5-way classification head and move
# it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5).to(device)

# Freeze the embeddings and the whole encoder stack ...
for frozen_module in (model.bert.embeddings, model.bert.encoder):
    for param in frozen_module.parameters():
        param.requires_grad = False

# ... then unfreeze the top three encoder layers again.
for layer_idx in (9, 10, 11):
    for param in model.bert.encoder.layer[layer_idx].parameters():
        param.requires_grad = True

# Gather the trainable parameters (pooler, classifier head and encoder layers
# 9-11). Reading them off the `requires_grad` flags keeps the optimizer in sync
# with the freezing logic above instead of maintaining a second, manual list.
params = [param for param in model.parameters() if param.requires_grad]

# Assign the trainable parameters to the optimizer. `torch.optim.AdamW`
# replaces the deprecated `transformers.AdamW`; `eps=1e-6` keeps the epsilon
# that the deprecated class used by default.
optim = torch.optim.AdamW(params, lr=5e-5, weight_decay=0.0, eps=1e-6)

# Training arguments.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    num_train_epochs=4.0,
    # Mixed precision needs a CUDA device, so only enable it when the notebook
    # actually runs on one; the CPU fallback trains in full precision.
    fp16=(device.type == "cuda"),
    logging_strategy="epoch",
)

# F1 metric used to score the validation predictions.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (the logits, or a tuple whose
            first element is the logits) and `label_ids` (reference labels).

    Returns:
        The result of `metric.compute` with `average="weighted"`.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    # The predicted class is the index of the largest logit.
    predicted_ids = np.argmax(logits, axis=-1)

    return metric.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
        average="weighted",
    )

# Assemble the trainer. Only the optimizer is supplied; the scheduler slot of
# the `optimizers` pair is left as None.
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    optimizers=(optim, None),
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,F1
1,1.4784,1.251146,0.473343
2,1.3364,1.108621,0.545279
3,1.0908,1.080136,0.598648
4,0.9577,1.034781,0.613445


TrainOutput(global_step=376, training_loss=1.2158017462872444, metrics={'train_runtime': 254.5739, 'train_samples_per_second': 47.138, 'train_steps_per_second': 1.477, 'total_flos': 3157417709568000.0, 'train_loss': 1.2158017462872444, 'epoch': 4.0})

In [None]:
# Display the model's module hierarchy as the cell output.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,