In [None]:
%pip install accelerate
# accelerate:
# - Automatically places parts of the model on the available devices (GPUs or CPU), so layers and tensors do not have to be moved manually.
# - Supports model parallelism: very large models can be split across multiple GPUs or even machines.
# - Manages memory usage and device communication behind the scenes.
# - Simplifies running models in mixed precision (e.g. float16) or in quantized modes.
# - Provides tools to scale training and inference from one device to many.

%pip install bitsandbytes
# bitsandbytes: provides quantization techniques that reduce model size and memory use.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Model identifier passed to from_pretrained() when no local copy exists;
# also used (with "/" replaced by "_") to name the local model directory.
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

In [None]:
# The laptop does not have enough storage or RAM headroom for the model,
# so the weights are kept on an external drive when it is plugged in.
mount_point = "Seagate"
# Absolute path of the volume. os.path.ismount() must be given this full path:
# a bare "Seagate" is resolved relative to the current working directory and
# would report "not mounted" even when the drive is attached.
volume_path = f"/Volumes/{mount_point}"

if os.path.ismount(volume_path):
    print(f"{volume_path} is mounted.")
    model_dir = f"{volume_path}/VSWorkspace/{model_name.replace('/', '_')}"
    # Caching directory for Hugging Face models.
    # NOTE(review): transformers is imported in an earlier cell; HF_HOME is
    # presumably read when the Hugging Face libraries are first imported, so
    # this assignment may only affect processes started afterwards — confirm,
    # and move it above the imports if the in-kernel cache should live here.
    os.environ["HF_HOME"] = model_dir
else:
    print(f"{volume_path} is not mounted.")
    # Fall back to a directory relative to the notebook's working directory.
    model_dir = "model"

In [None]:
# Load the tokenizer and model, reusing the copy saved in model_dir when one
# exists; otherwise download from the Hub and save it for the next run.

# Precision and device placement shared by both branches, so the model behaves
# the same whether it comes from the local copy or from a fresh download.
load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

# model.save_pretrained() writes config.json into model_dir. Checking for that
# file (rather than the bare directory) avoids trying to load from a folder
# that exists but holds no saved model.
if os.path.isfile(os.path.join(model_dir, "config.json")):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir, **load_kwargs)
else:
    # No device_map here: device placement applies to the model weights, not
    # to the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,  # NOTE(review): executes code shipped in the Hub repo — confirm it is still required
        use_safetensors=True,
        **load_kwargs,
    )

    # Persist both artifacts so later runs take the local branch above.
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)