## Data Loading and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv")
df = df.sample(1000)
df.head()

Unnamed: 0,review,sentiment
22071,the only word i can think of to describe this ...,negative
20000,I am a huge fan of Northern Exposure. Men In T...,negative
30798,I was surprised at just how much I enjoyed thi...,positive
28896,I'm not sure why Spike Lee made this train wre...,negative
24181,For awhile I was hooked on shows like Ghost Hu...,negative


In [3]:
df.shape

(1000, 2)

In [4]:

df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
df['review'] = df['review'].str.lower()

In [6]:
df.head()

Unnamed: 0,review,sentiment
22071,the only word i can think of to describe this ...,negative
20000,i am a huge fan of northern exposure. men in t...,negative
30798,i was surprised at just how much i enjoyed thi...,positive
28896,i'm not sure why spike lee made this train wre...,negative
24181,for awhile i was hooked on shows like ghost hu...,negative


## Data Preparation for ML

In [7]:
# custom dataset -> evaluation/compute metrics -> training arguments -> trainer -> training -> testing

In [8]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [9]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length",
                              max_length=self.max_len)

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label
    }

In [10]:
# prepare tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased'
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(checkpoint,add_eos_token=True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,quantization_config=bnb_config, device_map={"":0} ,num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
X = df['review'].tolist()

label2id = {'positive': 1, 'negative': 0}
id2label = {1: 'positive', 0: 'negative'}

y = df['sentiment'].map(label2id).tolist()

dataset = CustomDataset(X, y, tokenizer)

In [12]:
dataset

<__main__.CustomDataset at 0x23e04a55950>

In [13]:
dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [14]:
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [15]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(example):
  labels = example.label_ids
  preds = example.predictions.argmax(-1)

  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)

  return {'accuracy': acc, "f1": f1}

In [16]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)
modules = find_all_linear_names(model)
print(modules)

['q_lin', 'v_lin', 'pre_classifier', 'out_lin', 'k_lin', 'lin1', 'lin2']


In [17]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=512,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [18]:
from transformers import Trainer, TrainingArguments
batch_size = 16
model_name = "distilbert_finetuned_setiment"

args = TrainingArguments(
    output_dir = "output",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = 2e-5,
    num_train_epochs = 3,
    evaluation_strategy = 'epoch'
)




In [19]:
trainer = Trainer(model=model,
                  args=args,
                  train_dataset = train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics=compute_metrics,
                  tokenizer = tokenizer)

In [20]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
print_trainable_parameters(model)

trainable params: 43253760 || all params: 88680194 || trainable%: 48.774994786321734


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
trainer.save_model("finetuned_model_dir")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_path = 'finetuned_model_dir'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Create a pipeline for sentiment analysis
sentiment_analysis = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Perform sentiment analysis on some text
text = "I really enjoyed the movie!"
result = sentiment_analysis(text)[0]
print(f"Text: {text}")
print(f"Sentiment: {result['label']}, Score: {result['score']}")

In [None]:
# Perform sentiment analysis on some text
text = "I love this movie!"
result = sentiment_analysis(text)[0]
print(f"Text: {text}")
print(f"Sentiment: {result['label']}, Score: {result['score']}")

In [None]:
text = "i love this product"
pipe = pipeline('text-classification', model_name)
pipe(text)

In [None]:
id2label

In [None]:


tok = AutoTokenizer.from_pretrained(model_name)
mod = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
def get_prediction(text):
  input_ids = tok.encode(text, return_tensors='pt')
  output = mod(input_ids)

  preds = torch.nn.functional.softmax(output.logits, dim=-1)

  prob = torch.max(preds).item()

  idx = torch.argmax(preds).item()
  sentiment = id2label[idx]

  return {'sentiment':sentiment, 'prob':prob}

In [None]:
text = "i love this product"
get_prediction(text)

In [None]:
text = "i hate this product"
get_prediction(text)