<a href="https://colab.research.google.com/github/sjoshi63/Story/blob/master/BERT_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#!pip install datasets evaluate
#!pip install -U transformers

Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m113.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.0


In [None]:
# Perform the following just once to setup Kaggle access through Python
#from google.colab import files
#files.upload()
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json


In [2]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import torch
import glob
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import pipeline


In [3]:
# Class name -> integer id. The position of each name in the tuple is its id,
# so the order below must not change.
label2id = {
    name: class_id
    for class_id, name in enumerate((
        'Anxiety',
        'Normal',
        'Depression',
        'Bipolar',
        'Suicidal',
        'Stress',
        'Personality disorder',
    ))
}

In [4]:
# Load the raw CSV into a DataFrame. The path is hard-coded to the Colab
# sample_data folder, so the file must be placed there before running this cell.
df = pd.read_csv('/content/sample_data/sentiments.csv')
# Row count before any cleaning.
len(df)

53043

In [5]:
# Drop every row that contains a missing value in any column.
df = df.dropna()
len(df)  # should be smaller than the raw row count if any rows contained nulls

52681

In [6]:
# List the distinct label strings; each of them needs a key in `label2id`.
df['status'].unique()

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [7]:
# Encode the string labels as integer class ids.
# `Series.map` silently yields NaN for any value that is not a key of `label2id`
# (which also happens when this cell is re-run on already-encoded labels), so
# validate the result before overwriting the column.
status_ids = df['status'].map(label2id)
if status_ids.isna().any():
    unknown = sorted(df.loc[status_ids.isna(), 'status'].astype(str).unique())
    raise ValueError(
        f"Unmapped 'status' values (was this cell already run?): {unknown}"
    )
df['status'] = status_ids.astype('int64')

In [8]:
# Load the pretrained tokenizer and a BERT encoder with a fresh classification head.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Derive the head size from `label2id` instead of repeating the literal 7, and
# store both mappings in the model config so predictions carry class names.
id2label = {class_id: name for name, class_id in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Show the model's module tree as the cell output.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Use the GPU when one is available; fall back to CPU so this cell does not
# raise on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [11]:
# Plain Python lists: X holds the raw statements, y the encoded class ids.
X = df['statement'].tolist()
y = df['status'].tolist()

In [12]:
# Stratified 80/20 split. The fixed random_state makes the split (and therefore
# the reported validation metrics) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize each split in a single call. padding=True pads every example to the
# longest sequence in that call (capped at max_length=512 by truncation). The
# up-front padding is kept because the Trainer is built without a padding collator.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [13]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. Must contain the key "input_ids", which defines the
            dataset length.
        labels: Optional per-example labels aligned with ``encodings``. Any
            indexable container is accepted (list, NumPy array, tensor).
            ``None`` or an empty container means items carry no "labels" key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" when available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test against None explicitly: plain truthiness (`if self.labels`) raises
        # for multi-element NumPy arrays and tensors. The length check keeps the
        # previous behaviour of skipping labels when the container is empty.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

In [14]:
# Inspect the encoding of the first training example.
X_train_tokenized[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
# Pair each tokenized split with its labels.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [16]:
# Spot-check one training item (a dict of tensors built by Dataset.__getitem__).
train_dataset[5]

{'input_ids': tensor([  101,  1045,  2572,  2200,  5305,  1998,  5458,  1010,  2119, 10597,
          1998,  8186,  1012,  1045,  2031,  2042,  8084,  2007,  1037,  2843,
          1997,  3558,  2740,  3314,  2005,  2026,  2878,  2166,  1010,  1998,
          1045,  2031,  2042,  8084,  2007,  5177,  2740,  3314,  2005,  1996,
          2627,  1020,  1011,  1021,  2086,  1010,  1045,  2572,  2061,  5458,
          1997,  2383,  2000,  3066,  2007,  2119,  3471,  1010,  2004,  2026,
          3558,  2740,  3084,  2026,  5177,  2740,  4788,  1998,  1996,  6911,
          2013,  2026,  5177,  2740,  2003,  3497,  2074,  2437,  2026,  3785,
          4788,  1010,  2049,  1037,  2196,  4566,  5402,  1998,  1045,  6524,
          2514,  2204,  1010,  2412,  1012,  1045,  2031,  2042,  8084,  2007,
          5729, 26180,  2005,  2026,  2878,  2166,  1010,  2029,  2003,  2986,
          2006,  2049,  2219,  1010,  2021,  2049,  2012,  1996,  2391,  2073,
          1045,  2031,  1037,  2524,  2

In [17]:
# Padding collator built from the tokenizer.
# Currently unused: the `data_collator=` argument in the Trainer cell is commented out.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Index 0/1/2 of the result are precision/recall/F1; the 4th entry (support) is unused.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }

In [19]:
# Minimal training configuration: only output_dir, the number of epochs (1) and
# the per-device training batch size (8) are set explicitly. The commented-out
# keyword arguments below are optional settings that are currently disabled.
training_args = TrainingArguments(
    output_dir='output',
    num_train_epochs=1,
#    learning_rate=2e-5,
    per_device_train_batch_size=8,
#    per_device_eval_batch_size=18,
#    weight_decay=0.01,
#    eval_strategy="epoch",
#    save_strategy="epoch",
#    load_best_model_at_end=True,
#    save_total_limit=3,
#    report_to="tensorboard",
#    fp16=True,
)

In [20]:
# Wire model, arguments, datasets and the metric function together.
# `data_collator` is left commented out; the inputs were already padded when
# they were tokenized (padding=True), so no per-batch padding is requested here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
#    data_collator=data_collator,
)

In [21]:
# Run fine-tuning with the settings in `training_args`.
# NOTE(review): despite its name, `history` is simply whatever trainer.train()
# returns - presumably a run summary rather than a per-step history; confirm
# against the transformers Trainer documentation before relying on it.
history = trainer.train()

Step,Training Loss
500,0.9357
1000,0.6287
1500,0.58
2000,0.5171
2500,0.5202
3000,0.4922
3500,0.4764
4000,0.4469
4500,0.436
5000,0.4138


In [22]:
# Evaluate on the validation split (the same dataset passed as `eval_dataset`
# when the Trainer was built). The metrics from compute_metrics appear in the
# result with an "eval_" prefix.
trainer.evaluate(val_dataset)

{'eval_loss': 0.4061444401741028,
 'eval_accuracy': 0.8458764354180507,
 'eval_f1': 0.8460171064319972,
 'eval_precision': 0.847032520257301,
 'eval_recall': 0.8458764354180507,
 'eval_runtime': 149.5569,
 'eval_samples_per_second': 70.455,
 'eval_steps_per_second': 8.813,
 'epoch': 1.0}