In [None]:
!pip install openai transformers datasets torch accelerate bitsandbytes unsloth
!pip install -U trl
!pip install -q -U openai

import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from trl import SFTTrainer
from openai import OpenAI
import json

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.3/472.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Run this in the first cell
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# Load the Stanford Sentiment Treebank dataset
dataset = load_dataset("sst2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# Function to prepare data for OpenAI fine-tuning
def prepare_openai_data(examples):
    return [
        {
            "messages": [
                {"role": "user", "content": f"Classify the sentiment of this review: {example['sentence']}"},
                {"role": "assistant", "content": "Positive" if example['label'] == 1 else "Negative"}
            ]
        }
        for example in examples
    ]

# Function to prepare data for DeepSeek fine-tuning
def prepare_deepseek_data(examples):
    return [
        {
            "instruction": "Classify the sentiment of this review:",
            "input": example['sentence'],
            "output": "Positive" if example['label'] == 1 else "Negative"
        }
        for example in examples
    ]

# Prepare data for both models
openai_train_data = prepare_openai_data(dataset['train'])
openai_test_data = prepare_openai_data(dataset['test'])
deepseek_train_data = prepare_deepseek_data(dataset['train'])
deepseek_test_data = prepare_deepseek_data(dataset['test'])

# Save OpenAI data to JSONL file
with open('openai_train.jsonl', 'w') as f:
    for item in openai_train_data:
        f.write(json.dumps(item) + '\n')

In [None]:
# OpenAI Fine-tuning
import os

os.environ["OPENAI_API_KEY"] = "API_KEY"
client = OpenAI()

# Upload the file
file = client.files.create(
    file=open('openai_train.jsonl', 'rb'),
    purpose="fine-tune"
)

# Create a fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model="gpt-3.5-turbo-0125"
)

print(f"OpenAI Fine-tuning job created: {job.id}")

OpenAI Fine-tuning job created: ftjob-LTIwDMBekgmdZfjtpmaEVYkz


In [None]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, set_seed
from trl import SFTTrainer
from datasets import Dataset

# Set seed for reproducibility (matches OpenAI's seed)
set_seed(1230529224)

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit",
    max_seq_length=512,
    load_in_4bit=True
)

# Prepare the model for fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0
)

# Convert list to Hugging Face Dataset
train_dataset = Dataset.from_list([
    {
        "instruction": item['instruction'],
        "input": item['input'],
        "output": item['output']
    }
    for item in deepseek_train_data
])

# Set up training arguments (aligned to OpenAI GPT-3.5 params)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,                        # 1 Epoch to match GPT-3.5
    gradient_accumulation_steps=1,             # No accumulation to match OpenAI
    per_device_train_batch_size=44,            # Match OpenAI batch size
    learning_rate=1e-4,                        # Approx. LR multiplier 2x
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
)

# Define the formatting function (for DeepSeek input)
def formatting_func(example):
    return f"Instruction: {example['instruction']}\nInput: {example['input']}\nOutput: {example['output']}"

# Create the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
    max_seq_length=256,  # Keeping this lower improves speed
)

# Train the model
trainer.train()

print("DeepSeek fine-tuning completed")

# Save the fine-tuned DeepSeek model
trainer.save_model("./deepseek_fine_tuned")

print("Fine-tuning process completed for DeepSeek model")

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Applying formatting function to train dataset (num_proc=12):   0%|          | 0/67349 [00:00<?, ? examples/s]

Converting train dataset to ChatML (num_proc=12):   0%|          | 0/67349 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=12):   0%|          | 0/67349 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/67349 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/67349 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 67,349 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 44 | Gradient Accumulation steps = 1
\        /    Total batch size = 44 | Total steps = 1,531
 "-____-"     Number of trainable parameters = 4,358,144


Step,Training Loss
500,3.2396
1000,2.0924
1500,2.0695


DeepSeek fine-tuning completed
Fine-tuning process completed for DeepSeek model


In [None]:
# Function to get predictions from ChatGPT
def get_chatgpt_predictions(client, model_id, test_data):
    predictions = []
    for item in tqdm(test_data, desc="ChatGPT Predictions"):
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": "You are a sentiment classifier. Respond with only 'Positive' or 'Negative'."},
                {"role": "user", "content": f"Classify the sentiment of this review: {item['messages'][0]['content']}"}
            ]
        )
        predictions.append(response.choices[0].message.content.strip())
    return predictions


# Get predictions and calculate metrics
chatgpt_model_id = "ft:gpt-3.5-turbo-0125:personal::B4y3Yp73"
chatgpt_predictions = get_chatgpt_predictions(client, chatgpt_model_id, openai_test_data)

ChatGPT Predictions:   0%|          | 0/1821 [00:01<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Function to get predictions from DeepSeek
model = FastLanguageModel.for_inference(model)

def get_deepseek_predictions(model, tokenizer, test_data):
    predictions = []
    model.eval()
    with torch.no_grad():
        for item in tqdm(test_data, desc="DeepSeek Predictions"):
            inputs = tokenizer(f"Instruction: Classify the sentiment of this review:\nInput: {item['input']}\nOutput:", return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            outputs = model.generate(**inputs, max_new_tokens=10)
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            predictions.append("Positive" if "Positive" in prediction else "Negative")
    return predictions


deepseek_predictions = get_deepseek_predictions(model, tokenizer, deepseek_test_data)

DeepSeek Predictions: 100%|██████████| 1821/1821 [17:01<00:00,  1.78it/s]


In [None]:
# Function to calculate metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

def calculate_metrics(true_labels, predictions):
    # Get all possible classes from true and predicted labels
    unique_labels = list(set(true_labels) | set(predictions))

    metrics = {
        "accuracy": accuracy_score(true_labels, predictions),
        "precision": precision_score(true_labels, predictions, average='weighted', zero_division=0),
        "recall": recall_score(true_labels, predictions, average='weighted', zero_division=0),
        "f1": f1_score(true_labels, predictions, average='weighted', zero_division=0),
        "report": classification_report(true_labels, predictions, labels=unique_labels, zero_division=0)
    }

    return metrics

# chatgpt_metrics = calculate_metrics(true_labels, chatgpt_predictions)
deepseek_metrics = calculate_metrics(true_labels, deepseek_predictions)

In [None]:
# Measure latency
def measure_latency(model_func, data):
    start_time = time.time()
    model_func(data[:100])  # Use a subset of data for latency measurement
    end_time = time.time()
    return (end_time - start_time) / 100  # Average latency per sample

# chatgpt_latency = measure_latency(lambda x: get_chatgpt_predictions(client, chatgpt_model_id, x), openai_test_data)
deepseek_latency = measure_latency(lambda x: get_deepseek_predictions(model, tokenizer, x), deepseek_test_data)

DeepSeek Predictions: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s]


In [None]:
# # Print comparison results
# print("Performance Comparison:")
# print(f"{'Metric':<20}{'ChatGPT':<15}{'DeepSeek':<15}")
# print("-" * 50)
# for metric in ["accuracy", "precision", "recall", "f1"]:
#     print(f"{metric:<20}{chatgpt_metrics[metric]:.4f}{deepseek_metrics[metric]:.4f}")
# print(f"{'Latency (s/sample)':<20}{chatgpt_latency:.4f}{deepseek_latency:.4f}")

print("Performance Comparison:")
print(f"{'Metric':<20}{'DeepSeek':<15}")
print("-" * 50)
for metric in ["accuracy", "precision", "recall", "f1"]:
    print(f"{metric:<20}{deepseek_metrics[metric]:.4f}")
print(f"{'Latency (s/sample)':<20}{deepseek_latency:.4f}")

Performance Comparison:
Metric              DeepSeek       
--------------------------------------------------
accuracy            0.4843
precision           1.0000
recall              0.4843
f1                  0.6526
Latency (s/sample)  0.5694


In [None]:
from tabulate import tabulate

print("\nQualitative Analysis:")

sample_reviews = [
    "This movie was absolutely fantastic!",
    "I've never been so disappointed in a product.",
    "The service was neither good nor bad.",
    "While the concept was innovative, the execution left much to be desired."
]

comparison_results = []

for review in sample_reviews:
    # Get ChatGPT Response
    # chatgpt_response = client.chat.completions.create(
    #     model=chatgpt_model_id,
    #     messages=[
    #         {"role": "system", "content": "You are a sentiment classifier. Respond with only 'Positive' or 'Negative'."},
    #         {"role": "user", "content": f"Classify the sentiment of this review: {review}"}
    #     ]
    # ).choices[0].message.content.strip()

    # Get DeepSeek Response
    deepseek_input = tokenizer(
        f"Instruction: Classify the sentiment of this review:\nInput: {review}\nOutput:",
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    deepseek_input = {k: v.to(model.device) for k, v in deepseek_input.items()}
    deepseek_output = model.generate(**deepseek_input, max_new_tokens=10)
    deepseek_response = tokenizer.decode(deepseek_output[0], skip_special_tokens=True).strip()

    # Collect results for tabulation
    # comparison_results.append([review, chatgpt_response, deepseek_response])
    comparison_results.append([review, deepseek_response])

# Print results in a structured table
# print(tabulate(comparison_results, headers=["Review", "ChatGPT Sentiment", "DeepSeek Sentiment"], tablefmt="grid"))
print(tabulate(comparison_results, headers=["Review", "DeepSeek Sentiment"], tablefmt="grid"))


Qualitative Analysis:
+--------------------------------------------------------------------------+---------------------------------------------------------------------------------+
| Review                                                                   | DeepSeek Sentiment                                                              |
| This movie was absolutely fantastic!                                     | Instruction: Classify the sentiment of this review:                             |
|                                                                          | Input: This movie was absolutely fantastic!                                     |
|                                                                          | Output: Positive                                                                |
|                                                                          | That 's a nice way to put                                                       |
+----------------------