<a href="https://colab.research.google.com/github/sdahal51/ModernBERT/blob/main/Fine_Tuning_DeepSeek_LLM_Adapting_Open_Source_AI_for_Your%C2%A0Needs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Check GPU Availability
Make sure Google Colab is using a GPU.

In [None]:
import torch
# Last expression of the cell, so the notebook displays its value:
# True when PyTorch can use a CUDA GPU in this runtime, False otherwise.
torch.cuda.is_available()


If this returns True, you're good to go! If not, go to Runtime > Change runtime type > GPU.

# 2. Install Required Libraries
Run this command to install or upgrade torch, transformers, datasets, accelerate, peft, and bitsandbytes.

In [None]:
# Install or upgrade (-U) the libraries this notebook relies on.
!pip install -U torch transformers datasets accelerate peft bitsandbytes

# 3. Load DeepSeek LLM from Hugging Face

Load the model with LoRA (Low-Rank Adaptation) for efficient fine-tuning.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "deepseek-ai/deepseek-llm-7b-base"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16  # Use float16 for faster computation
)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenization cell pads every example to a fixed length, which fails if the
# tokenizer defines no padding token. Fall back to EOS in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare the quantized model for training before attaching adapters
# (the step PEFT documents for k-bit/QLoRA fine-tuning).
model = prepare_model_for_kbit_training(model)

# Apply LoRA for memory-efficient fine-tuning
lora_config = LoraConfig(
    r=8,  # Low-rank adaptation size
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # Wrap the model with PEFT's causal-LM specific class
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✅ DeepSeek LLM Loaded with LoRA and 4-bit Precision!")

# 4. Load and Preprocess the IMDB Dataset

Download the IMDB dataset and prepare it for causal language modeling.



In [None]:
from datasets import load_dataset

# Fetch the IMDB dataset from the Hugging Face Hub
dataset = load_dataset("imdb")

# Show the overall structure of the dataset
print("Dataset Structure:")
print(dataset)

# Preview the first two training examples
print("Sample Data:")
for sample_index in (0, 1):
    print(dataset["train"][sample_index])

### Tokenize Dataset

In [None]:
def tokenize_function(examples):
    """Tokenize review text and build causal-LM labels with padding masked out.

    Args:
        examples: Mapping with a "text" entry holding either a single string
            or a list of strings (the latter when called through
            ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        ``input_ids`` at real-token positions and -100 at padded positions.
        -100 is the index the cross-entropy loss ignores, so padding does not
        contribute to the training loss.
    """
    inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    def _mask_padding(token_ids, attention_mask):
        # Keep the token id where the mask is 1; ignore the position otherwise.
        return [
            token_id if keep else -100
            for token_id, keep in zip(token_ids, attention_mask)
        ]

    input_ids = inputs["input_ids"]
    if "attention_mask" not in inputs:
        # No mask available: fall back to using every position as a label.
        inputs["labels"] = input_ids.copy()
        return inputs

    attention_mask = inputs["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        inputs["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of token ids.
        inputs["labels"] = _mask_padding(input_ids, attention_mask)
    return inputs

# Tokenize every split in batches
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

# Inspect the first tokenized training example
first_tokenized_example = tokenized_datasets["train"][0]
print("Tokenized Sample with Labels:")
print(first_tokenized_example)

# 5. Set Training Parameters

In [None]:
import inspect
import os

# Disable the Weights & Biases integration via environment variable.
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Use whichever keyword the installed version accepts so the cell
# works both before and after the rename.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=3e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per device
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
    report_to="none",  # explicitly turn off all logging integrations
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of each epoch
)

print("✅ WandB Disabled!")

# Get Sample Data

Use a small random subset of the train and test splits to speed up training and evaluation.

In [None]:
# Shuffle each split with a fixed seed, then keep a small slice:
# the first 500 shuffled training examples and the first 100 shuffled test examples.
shuffled_train_split = tokenized_datasets["train"].shuffle(seed=42)
small_train_dataset = shuffled_train_split.select(range(500))

shuffled_test_split = tokenized_datasets["test"].shuffle(seed=42)
small_test_dataset = shuffled_test_split.select(range(100))

# 5. Initialize Trainer and Train

Set up the Trainer and start fine-tuning.

In [None]:
from transformers import Trainer

def _drop_non_model_columns(split):
    """Return `split` without the raw text and the IMDB sentiment column."""
    # "label" is the dataset's integer sentiment class. It is unrelated to the
    # token-level "labels" built during tokenization, so remove it along with
    # the raw text to avoid any ambiguity when batches are collated.
    unused = [col for col in ("text", "label") if col in split.column_names]
    return split.remove_columns(unused)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_drop_non_model_columns(small_train_dataset),
    eval_dataset=_drop_non_model_columns(small_test_dataset),
)

print("🚀 Trainer Initialized!")


In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using,
# before the training cell runs.
torch.cuda.empty_cache()
print("✅ Cleared CUDA Cache")


# 6. Fine-Tune DeepSeek LLM

In [None]:
print("🚀 Starting Fine-Tuning...")
# Run training with the Trainer built in the earlier cell. As the last
# expression of the cell, its return value is displayed by the notebook.
trainer.train()

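# 7. Run Predictions with the Fine-Tuned Model
Now that we have fine-tuned DeepSeek LLM, let's generate text for new inputs. Because the model was fine-tuned with a causal language-modeling objective, it continues each review rather than emitting a sentiment label.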



In [None]:
def generate_prediction(review_text, max_new_tokens=64):
    """Generate text from the fine-tuned model, using `review_text` as the prompt.

    Args:
        review_text: Prompt string to feed to the model.
        max_new_tokens: Maximum number of tokens to generate beyond the
            prompt. Unlike a total-length cap, this leaves room to generate
            even when the prompt itself is long.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    # Training leaves the model in train mode; switch to eval so LoRA dropout
    # does not perturb generation.
    model.eval()
    # Place the inputs on whatever device the model was loaded to, instead of
    # assuming "cuda".
    inputs = tokenizer(review_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Sample prompts: one positive, one negative and one mixed review
reviews = [
    "The movie was absolutely fantastic! I loved the cinematography and the acting was superb.",
    "This was the worst movie I've ever seen. The plot made no sense and the dialogue was terrible.",
    "It was an okay movie. Some parts were really good, but overall it was just average.",
]

# Print each review, then the model output for it, then a divider line
divider = "-" * 80
for review in reviews:
    print(f"Review: {review}")
    generated_text = generate_prediction(review)
    print(f"Predicted Sentiment: {generated_text}")
    print(divider)


# 8. References

* [DeepSeek LLM](https://github.com/deepseek-ai/DeepSeek-LLM)
* [DeepSeek](https://www.deepseek.com/)
* [DeepSeek LLM Paper on arXiv](https://arxiv.org/abs/2401.02954)




Copyright 2025 Abhishek Gargha Maheshwarappa

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.