<a href="https://colab.research.google.com/github/pranshulagrawal999/Movie_Review_Sentiment_Analysis/blob/main/Fine_tuning_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
df_review = pd.read_csv("IMDB Dataset.csv")

# Prepare a balanced dataset
df_positive = df_review[df_review['sentiment'] == 'positive'][:9000]
df_negative = df_review[df_review['sentiment'] == 'negative'][:1000]
df_review_balanced = pd.concat([df_positive, df_negative])

# Convert sentiment labels to numerical format
df_review_balanced['label'] = df_review_balanced['sentiment'].map({'positive': 1, 'negative': 0})

# Split into training and testing sets
train_data, test_data = train_test_split(df_review_balanced, test_size=0.33, random_state=42)
train_dataset = Dataset.from_pandas(train_data[['review', 'label']])
test_dataset = Dataset.from_pandas(test_data[['review', 'label']])

# Load the tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the data
def tokenize_data(example):
    return tokenizer(example['review'], truncation=True, padding='max_length', max_length=256)

train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Set the format for PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Define evaluation metrics
def compute_metrics(pred):
    predictions = pred.predictions.argmax(-1)
    labels = pred.label_ids
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10,
    save_total_limit=2
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Test the model on new examples
examples = ["A good movie", "An excellent movie", "I did not like this movie at all I gave this movie away"]
encoded_examples = tokenizer(examples, truncation=True, padding=True, max_length=256, return_tensors="pt")
predictions = model(**encoded_examples)
predicted_classes = predictions.logits.argmax(-1).numpy()
print("Predictions:", predicted_classes)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/3300 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.177,0.158257,0.947576,0.958388,0.984965,0.971494
2,0.1714,0.18374,0.948485,0.963547,0.980287,0.971845


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.177,0.158257,0.947576,0.958388,0.984965,0.971494
2,0.1714,0.18374,0.948485,0.963547,0.980287,0.971845
3,0.0512,0.213437,0.946061,0.962233,0.978951,0.97052


Evaluation Results: {'eval_loss': 0.2134370505809784, 'eval_accuracy': 0.946060606060606, 'eval_precision': 0.9622331691297209, 'eval_recall': 0.978950885399265, 'eval_f1': 0.9705200397482611, 'eval_runtime': 22.8059, 'eval_samples_per_second': 144.7, 'eval_steps_per_second': 2.28, 'epoch': 3.0}


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
import torch

# Check if CUDA (GPU) is available, otherwise fallback to CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to the appropriate device
model = model.to(device)

# Test the model on new examples
examples = ["A good movie", "An excellent movie", "I did not like this movie at all I gave this movie away"]

# Tokenize and move the inputs to the same device as the model
encoded_examples = tokenizer(examples, truncation=True, padding=True, max_length=256, return_tensors="pt")
encoded_examples = {key: value.to(device) for key, value in encoded_examples.items()}

# Make predictions
predictions = model(**encoded_examples)
predicted_classes = predictions.logits.argmax(-1).cpu().numpy()  # Move predictions back to CPU for further processing

print("Predictions:", predicted_classes)


Predictions: [1 1 0]


In [None]:
!zip -r results.zip results/

  adding: results/ (stored 0%)
  adding: results/checkpoint-1257/ (stored 0%)
  adding: results/checkpoint-1257/tokenizer.json (deflated 71%)
  adding: results/checkpoint-1257/scheduler.pt (deflated 57%)
  adding: results/checkpoint-1257/model.safetensors (deflated 8%)
  adding: results/checkpoint-1257/rng_state.pth (deflated 25%)
  adding: results/checkpoint-1257/special_tokens_map.json (deflated 42%)
  adding: results/checkpoint-1257/config.json (deflated 46%)
  adding: results/checkpoint-1257/vocab.txt (deflated 53%)
  adding: results/checkpoint-1257/optimizer.pt (deflated 17%)
  adding: results/checkpoint-1257/tokenizer_config.json (deflated 76%)
  adding: results/checkpoint-1257/trainer_state.json (deflated 78%)
  adding: results/checkpoint-1257/training_args.bin (deflated 51%)
  adding: results/checkpoint-1250/ (stored 0%)
  adding: results/checkpoint-1250/tokenizer.json (deflated 71%)
  adding: results/checkpoint-1250/scheduler.pt (deflated 55%)
  adding: results/checkpoint-1250