In [None]:
## Installing the necessary dependencies;

!pip install -qq -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl

In [None]:
### Importing the necessary libraries;

from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
from huggingface_hub import login

import pandas as pd
import numpy as np

from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token stored under the "hf_token" key via kaggle_secrets,
# so the token itself never appears in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf_token")


### TODO: hide this later using the .env file;
# Log this session in to the Hugging Face Hub with the retrieved token.
login(token=secret_value_0)

In [None]:
### Helper that reports the GPU memory in use at the moment it is called
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this helper has always inspected.

    Prints ``GPU memory occupied: <n> MB.`` where ``<n>`` is the ``used``
    field of the NVML memory info floor-divided by 1024**2. Returns ``None``.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with a shutdown, even when a query fails, so
        # repeated calls from different cells do not pile up open NVML sessions.
        nvmlShutdown()


# Baseline reading, taken before the model-loading cell further down the notebook.
print_gpu_utilization()

In [None]:
## Loading the dataset;

from datasets import load_dataset

# Fetch the "raw_review_All_Beauty" configuration of McAuley-Lab/Amazon-Reviews-2023.
# trust_remote_code=True lets the dataset repository's own loading code run.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)

# Show the first record of the "full" split.
print(dataset["full"][0])


In [None]:
## Preprocessing;
# Lift the star ratings and the review bodies out of the "full" split as one-column frames.
rating = pd.DataFrame(dataset['full']['rating'], columns=['rating'])
review = pd.DataFrame(dataset['full']['text'], columns=['review'])

# Build the labelled frame in one pass:
#   1. put rating and review side by side,
#   2. keep only 1-star and 5-star rows,
#   3. label them 'Negative' / 'Positive',
#   4. drop the raw rating, leaving the `review` and `sentiment` columns.
consol = (
    pd.concat([rating, review], axis=1)
    .loc[lambda frame: frame['rating'].isin([1, 5])]
    .reset_index(drop=True)
    .assign(sentiment=lambda frame: frame['rating'].map({1: 'Negative', 5: 'Positive'}))
    .drop(columns=['rating'])
)

In [None]:
# Convert to Hugging Face Dataset

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

## Splitting it into train and test;
test_size = 0.25
# Fixed seed: without random_state, train_test_split shuffles differently on every
# run, so the train/test membership (and every metric derived from it) would drift.
split_seed = 42

train_df, test_df = train_test_split(consol, test_size = test_size, random_state = split_seed)

# Convert individual DataFrames
# NOTE(review): after the shuffle the frames no longer have a default index, so
# from_pandas is expected to carry it along as an extra `__index_level_0__` column;
# pass preserve_index=False if that column is not wanted downstream.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
# NOTE: this rebinds `dataset`, replacing the raw reviews loaded earlier in the notebook.
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


In [None]:
import gc

# Both splits are held by `dataset` now, so drop the standalone names one by one
# and then trigger a garbage-collection pass.
del train_dataset
del test_dataset

gc.collect()

In [None]:
# Display the DatasetDict holding the 'train' and 'test' splits built above.
dataset

In [None]:
## Viewing a sample observation;
# Index the row first: dataset['train'][0] reads a single record, whereas
# dataset['train']['review'][0] materialises the whole column just to take one value.
first_example = dataset['train'][0]
first_example['review'], first_example['sentiment']

In [None]:
### Model Definition for fine tuning


## Defining the bnb config for loading the model in a quantised fashion;
# 4-bit NF4 weights, float16 compute, double quantisation switched off.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

# Map the whole model (root module "") onto device 0.
device_map = {"": 0}


## Loading the model;
model_name='microsoft/phi-2'
# `token=True` reuses the credentials stored by the earlier login() call; it is the
# replacement for the deprecated `use_auth_token` argument.
original_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                      device_map=device_map,
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      token=True)


# GPU memory in use once the quantised model has been loaded.
print_gpu_utilization()

In [None]:
# Repeats the memory check made at the end of the model-loading cell.
print_gpu_utilization()

In [None]:
### Tokeniser

# Tokenizer for microsoft/phi-2: slow (non-fast) implementation, left padding,
# with both the BOS and EOS insertion flags switched on.
model_name = 'microsoft/phi-2'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


print_gpu_utilization()

### Converting the text into the given prompt format

In [None]:
# Second tokenizer instance, used by the generation helper: BOS flag on,
# no EOS flag and no padding-side override.
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    add_bos_token=True,
    trust_remote_code=True,
)

# Reuse the end-of-sequence token as the padding token.
eval_tokenizer.pad_token = eval_tokenizer.eos_token


In [None]:
def gen(model, p, maxlen=100, sample=True, tokenizer=None):
    """Generate text for prompt ``p`` with ``model`` and return it decoded.

    Args:
        model: Object exposing ``generate(**kwargs)``; its ``device``
            attribute, when present, decides where the inputs are moved.
        p: Prompt text handed to the tokenizer.
        maxlen: Maximum number of new tokens to generate.
        sample: When True, sample with temperature 0.1 and top-p 0.95.
            When False, the sampling-only settings are left out so
            ``generate`` is not given options it would ignore.
        tokenizer: Tokenizer used for encoding and decoding. Defaults to the
            notebook-level ``eval_tokenizer``.

    Returns:
        The list produced by ``batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    tok = eval_tokenizer if tokenizer is None else tokenizer
    toks = tok(p, return_tensors="pt")

    # Follow the model's own device instead of assuming "cuda"; fall back to the
    # previous hard-coded value for objects that do not expose `.device`.
    device = getattr(model, "device", "cuda")

    generation_kwargs = dict(
        max_new_tokens=maxlen,
        do_sample=sample,
        num_return_sequences=1,
        num_beams=1,
        # Stated explicitly so generate() does not have to guess a padding id.
        pad_token_id=getattr(tok, "pad_token_id", None),
    )
    if sample:
        # Only meaningful when sampling is enabled.
        generation_kwargs.update(temperature=0.1, top_p=0.95)

    res = model.generate(**toks.to(device), **generation_kwargs).to('cpu')
    return tok.batch_decode(res, skip_special_tokens=True)

In [None]:
from transformers import set_seed
# Seed the random number generators so the sampled generation below is repeatable.
seed = 42
set_seed(seed)

index = 12

# Fetch the row first, then its fields: column-first access
# (dataset['train']['review'][index]) materialises the entire column to read one value.
# NOTE: `review` is rebound to a plain string here, shadowing the DataFrame of the
# same name created in the preprocessing cell.
example = dataset['train'][index]
review = example['review']
sentiment = example['sentiment']

formatted_prompt = f"Instruct: You are a sentiment analyser, that tries to classify the sentiment of a e-commerce review into one of these - 'Positive' or 'Negative'. Classify the following review: \n{review}\nOutput:\n"


res = gen(original_model,formatted_prompt,100,)

# Keep the text that follows the first 'Output:\n' marker (up to a second marker,
# should one appear in the decoded text).
output = res[0].split('Output:\n')[1]

# 99 dashes, the same rule the previous '-'.join(...) expression produced.
dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
# The reference value is the dataset's sentiment label, not a human-written summary.
print(f'GROUND-TRUTH SENTIMENT:\n{sentiment}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')