<a href="https://colab.research.google.com/github/sjoshi63/Story/blob/master/BERT_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install datasets evaluate

In [2]:
# Perform the following just once to setup Kaggle access through Python
#from google.colab import files
#files.upload()
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json#


In [3]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import torch
import glob
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import pipeline


In [4]:
label2id = {
    'Anxiety': 0,
    'Normal': 1,
    'Depression': 2,
    'Bipolar': 3,
    'Suicidal': 4,
    'Stress': 5,
    'Personality disorder': 6
    }

In [5]:
df = pd.read_csv('/content/sample_data/sentiments.csv')
len(df)

53043

In [6]:
df = df.dropna()
len(df)## should be less if after dropping rows with null columns

52681

In [7]:
df['status'].unique()

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [8]:
df['status'] = df['status'].map(label2id)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model = model.to('cuda')

In [11]:
X = list(df['statement'])
y = list(df['status'])

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings["input_ids"])

In [14]:
X_train_tokenized[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [16]:
train_dataset[5]

{'input_ids': tensor([  101,  1045,  2079,  2025,  2215,  2000,  2444,  4902,  1012,  1045,
          2572,  2385,  1998,  2673,  1999,  2026,  2166,  1010, 19237,  1012,
          2026,  3008,  2031,  2042,  9196,  2005,  2086,  1010,  2057,  2333,
          2408,  3032,  1010,  2026,  2269,  2318,  1037,  2047,  2155,  1998,
          2043,  2002,  2657,  2008,  1045,  2079,  2025,  2156,  2010,  2155,
          2004,  2026,  2155,  2002,  2741,  2033,  2125,  2000,  1037, 19294,
          1012,  1045,  2031,  2288,  7078,  2053,  2490,  1999,  2505,  1010,
          2013,  3087,  1012,  2043,  1045,  4982,  2039,  1010,  1045,  2215,
          2000,  2022,  1037,  5455,  1010,  2021,  2026,  3008,  2079,  2025,
          2066,  2008,  2801,  2012,  2035,  1012,  2027,  2036,  2377,  1996,
          6778,  4003,  2006,  2033,  2035,  1996,  2051,  1010,  1998,  2043,
          1045, 14323,  2068,  2055,  2037, 28586,  2027,  2735,  2009,  2105,
          2006,  2033,  1012,  3374,  2

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#

In [18]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to="tensorboard",
    fp16=True,
)



In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [21]:
history = trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
