# Fine-Tune a Large Language Model for Terinary Sentence Sentiment Calssification

### Install necessary libraries

In [2]:
!pip install numpy>=1.20.0 scipy>=1.7.0 matplotlib>=3.7.0 scikit-learn>=1.0.2 nltk>=3.7 pytest>=7.1 jupyter>=1.0.0 pandas>=1.5 torch==1.13.1 torchvision==0.14.1 transformers==4.32.1 datasets==2.10.1 spacy==3.5.1 --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.6.0 requires spacy<3.7.0,>=3.6.0, but you have spacy 3.5.1 which is incompatible.
torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [3]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [4]:
from collections import defaultdict, Counter
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import torch

## Import Dynaset-R1

In [5]:
dynasent_r1 = load_dataset("dynabench/dynasent", 'dynabench.dynasent.r1.all')

Downloading builder script:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Downloading and preparing dataset dynasent/dynabench.dynasent.r1.all (download: 16.26 MiB, generated: 23.94 MiB, post-processed: Unknown size, total: 40.20 MiB) to /root/.cache/huggingface/datasets/dynabench___dynasent/dynabench.dynasent.r1.all/1.1.0/ab89971d9ae1aacc59ed44d6855bf0e89167417257e2c2666f38e532148f2967...


Downloading data:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80488 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3600 [00:00<?, ? examples/s]

Dataset dynasent downloaded and prepared to /root/.cache/huggingface/datasets/dynabench___dynasent/dynabench.dynasent.r1.all/1.1.0/ab89971d9ae1aacc59ed44d6855bf0e89167417257e2c2666f38e532148f2967. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Fine-Tune a model

In [6]:
# PLEASE MAKE SURE TO INCLUDE THE FOLLOWING BETWEEN THE START AND STOP COMMENTS:
#   1) Textual description of your system.
#   2) The code for your original system.
# PLEASE MAKE SURE NOT TO DELETE OR EDIT THE START AND STOP COMMENTS

# START COMMENT: Enter your system description in this cell.
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report

class SentimentClassifier:
    def __init__(self, model_name, num_classes, max_seq_length, batch_size, num_epochs, learning_rate):
        self.model_name = model_name
        self.num_classes = num_classes
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.output_dir = f"./sentence_classifier"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load pretrained model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=num_classes).to(self.device)

        # Define training arguments
        self.training_args = TrainingArguments(
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            evaluation_strategy="epoch",
            save_total_limit=1,
            output_dir=self.output_dir,
            num_train_epochs=self.num_epochs,
            learning_rate=self.learning_rate,
            logging_dir="./logs",
        )

        # Define optimizer and scheduler
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=0,
                                                         num_training_steps=self.num_epochs)

    def trainer(self, train_dataset, val_dataset):
        # Define Trainer
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            optimizers=(self.optimizer, self.scheduler),
        )
        return trainer

    def generate_predictions(self, train_dataset, sentence):
        classifier_model = AutoModelForSequenceClassification.from_pretrained(self.output_dir).to(self.device)
        inputs = self.tokenizer(sentence, padding='max_length', truncation=True, return_tensors='pt', max_length=self.max_seq_length)
        inputs.to(self.device)
        with torch.no_grad():
            outputs = classifier_model(**inputs)
        predicted_label_id = torch.argmax(outputs.logits, dim=1)
        predicted_label = train_dataset.features["gold_label"].feature.str[predicted_label_id.item()]
        return predicted_label

    def tokenize_data(self, data):
        inputs = self.tokenizer(data['sentence'], text_pair=data['gold_label'], padding='max_length', truncation=True, return_tensors="pt", max_length=self.max_seq_length)
        #labels = data['gold_label']
        #return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': labels}
        return inputs




In [7]:
# Initialize the classifier
classifier = SentimentClassifier(model_name="google/flan-t5-small", num_classes=3, max_seq_length=128,
                                  batch_size=32, num_epochs=5, learning_rate=2e-5)


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_dataset = dynasent_r1['train'].map(classifier.tokenize_data, batched=True)
val_dataset = dynasent_r1['validation'].map(classifier.tokenize_data, batched=True)

Map:   0%|          | 0/80488 [00:00<?, ? examples/s]

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

In [9]:
trainer = classifier.trainer(train_dataset, val_dataset)

In [10]:
trainer

<transformers.trainer.Trainer at 0x799b69a58e50>

In [11]:
trainer.train()

ValueError: ignored

In [13]:
!nvidia-smi

Mon Sep  4 13:18:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    28W /  70W |   6151MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces