In [None]:
# Accelerate : Automatically places parts of the model on available devices (like GPUs or CPU) — We don’t have to manually move layers or tensors.
# Supports model parallelism, splitting huge models across multiple GPUs or even machines.
# Manages memory usage and device communication behind the scenes.
# Simplifies running models in mixed precision (like float16) or quantized modes.
# Provides tools to easily scale training and inference from one device to many.

# bitsandbytes: Provides quantization techniques that reduce model size and memory use.

%pip install transformers datasets peft accelerate bitsandbytes torch


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os
import pandas as pd
from peft import LoraConfig, get_peft_model, TaskType
import wandb

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Hugging Face Hub identifier of the base model that gets fine-tuned below.
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

In [None]:
# The laptop does not have enough storage or RAM for the model, so an external
# drive is used for the model files whenever it is plugged in.
mount_point = "Seagate"
# os.path.ismount needs the full path of the mount; the bare volume name would be
# resolved relative to the current working directory and never match.
volume_path = f"/Volumes/{mount_point}"

if os.path.ismount(volume_path):
    print(f"{volume_path} is mounted.")
    model_dir = f"{volume_path}/VSWorkspace/{model_name.replace('/', '_')}"
    # Caching directory for Hugging Face models.
    # NOTE(review): huggingface_hub appears to read HF_HOME when it is first
    # imported; since transformers is imported in an earlier cell, this
    # assignment may only take effect after a kernel restart — confirm.
    os.environ["HF_HOME"] = model_dir
else:
    print(f"{volume_path} is not mounted.")
    model_dir = "model"

In [None]:
# A directory is only a loadable local copy if it actually holds a model config;
# the directory existing (e.g. as an empty folder or a cache root) is not enough.
has_local_copy = os.path.isfile(os.path.join(model_dir, "config.json"))

# Both sources are loaded with the same precision and device placement so that
# the rest of the notebook behaves the same whichever branch runs.
load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

if has_local_copy:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir, **load_kwargs)
else:
    # Tokenizers hold no tensors, so no device placement argument is passed.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
        **load_kwargs,
    )

In [None]:
from datasets import Dataset

# Columns that the prompt template reads from every example.
REQUIRED_COLUMNS = ["English understanding", "Shell script"]

raw_df = pd.read_csv("dataset.csv")

# Fail early with a clear message instead of a KeyError in the middle of tokenization.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
if missing_columns:
    raise KeyError(f"dataset.csv is missing required column(s): {missing_columns}")

# Rows with an empty instruction or script would be rendered as the literal text
# "nan" in the training prompt, so they are dropped.
df = raw_df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)

# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)


In [None]:
# List every sub-module name of the model; used to pick the LoRA target modules.
module_names = []
for name, _ in model.named_modules():
    module_names.append(name)
print(module_names)

In [None]:
# Names of the modules that receive LoRA adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,          # Next-token prediction
    r=8,                                   # Rank of the low-rank matrices
    lora_alpha=32,                         # Scaling factor
    lora_dropout=0.1,                      # Dropout applied during training for regularization
    bias="none",                           # Bias training mode ("none", "all", or "lora_only")
    target_modules=attention_projections,
)

# Wrap the base model with the adapters and report how many parameters will train.
# Note: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Tokenize the entire dataset

In [None]:
def format(example):
    """Render one dataset row as an instruction/output training prompt.

    Reads the 'English understanding' and 'Shell script' keys of `example`
    and returns them under the '### Instruction:' and '### Output:' headings.
    Note that this name shadows the built-in `format` inside this notebook.
    """
    instruction = example['English understanding']
    script = example['Shell script']
    return f"### Instruction:\n{instruction}\n\n### Output:\n{script}"

def tokenize(example):
    """Tokenize one dataset row after rendering it with `format`.

    The prompt is truncated and padded to exactly 512 tokens. The tokenizer's
    pad token is set to its end-of-sequence token on every call.
    """
    # Defines the token that is used to pad around the tokens.
    tokenizer.pad_token = tokenizer.eos_token
    prompt_text = format(example)
    encoded = tokenizer(
        prompt_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded

# Apply the tokenizer to every row of the dataset.
tokenized_dataset = dataset.map(tokenize)

In [None]:
# Report the number of visible CUDA devices and the current GPU memory usage in GB.
gpu_count = torch.cuda.device_count()
allocated_gb = torch.cuda.memory_allocated() / 1e9
reserved_gb = torch.cuda.memory_reserved() / 1e9

print(gpu_count)
print(f"Allocated: {allocated_gb:.2f} GB")
print(f"Reserved : {reserved_gb:.2f} GB")

In [None]:
# Never commit credentials to a notebook: the key is read from the WANDB_API_KEY
# environment variable instead. When the variable is unset, APIKEY is None and
# wandb.login falls back to its own lookup (stored credentials or a prompt).
# SECURITY: the key that used to be hard-coded in this cell must be treated as
# leaked and revoked in the Weights & Biases account settings.
APIKEY = os.environ.get("WANDB_API_KEY")
wandb.login(key=APIKEY)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# One epoch is one full pass over the dataset: with a batch size of 100 and a
# dataset of 1000 examples, one epoch is 10 steps.
training_args = TrainingArguments(
    output_dir="./lora-finetuned-model",
    per_device_train_batch_size=3,   # Training examples processed simultaneously per device
    gradient_accumulation_steps=1,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",           # Save a checkpoint every epoch
    eval_strategy="no",              # No evaluation during training (so no eval_steps either)
    learning_rate=2e-5,
    load_best_model_at_end=False,    # Must be False when eval_strategy="no"
    save_total_limit=2,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; requesting it on a CPU-only
    # machine makes TrainingArguments raise, so enable it only when available.
    fp16=torch.cuda.is_available(),
    # The notebook logs in to Weights & Biases, so report there explicitly
    # rather than relying on a version-dependent default.
    report_to="wandb",
)

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    # (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "finetuned_shell_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)


In [None]:
def _extract_output(generated_text):
    """Return the text following the first '### Output:' marker, cut at the next '###'."""
    _, marker, completion = generated_text.partition("### Output:")
    if not marker:
        # The marker is absent (e.g. a prompt without the template): return the
        # whole text instead of failing with an IndexError.
        return generated_text.strip()
    return completion.strip().split("###")[0].strip()


def runExample(prompt, max_new_tokens=50):
    """Generate a completion for `prompt` and return only the model's answer.

    Args:
        prompt: Text in the '### Instruction: ... ### Output:' template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text after the first '### Output:' marker, truncated at the
        next '###'. If the marker is missing, the whole decoded text is returned.
    """
    # Training leaves the model in train mode; switch to eval mode so dropout
    # (including the LoRA dropout) is disabled while generating.
    model.eval()
    # Put the inputs on the device the model actually lives on, which is not
    # necessarily the global `device` when the model was placed automatically.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return _extract_output(output_text)


In [None]:
# Example: ask the fine-tuned model for a script that moves files between folders.
prompt = (
    "### Instruction:\n"
    "Move all files from A to B\n\n"
    "### Output:\n"
)
output = runExample(prompt)
print(output)

In [None]:
# Example: grouping files by date.
# The instruction starts right after the newline (no leading space) so the prompt
# matches the template the model was trained on.
prompt = "### Instruction:\nGroup files based on month and year.\n\n### Output:\n"
output = runExample(prompt)
print(output)

In [None]:
# Example: moving photos into a single folder.
# The instruction starts right after the newline (no leading space) so the prompt
# matches the template the model was trained on.
prompt = "### Instruction:\nMove all photos to ~/Images.\n\n### Output:\n"
output = runExample(prompt)
print(output)

In [None]:
# Show the directory that the relative output paths in this notebook resolve against.
working_dir = os.getcwd()
working_dir

In [None]:
import matplotlib.pyplot as plt

# Keep only the log entries that carry a training loss value.
logs = trainer.state.log_history
loss_entries = [entry for entry in logs if "loss" in entry]
steps = [entry["step"] for entry in loss_entries]
losses = [entry["loss"] for entry in loss_entries]

# Draw the loss curve on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(steps, losses, label="Training Loss")
ax.set_xlabel("Training Step")
ax.set_ylabel("Loss")
ax.set_title("Loss vs Training Step")
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Example: recursive move of media files.
# The instruction starts right after the newline (no leading space) so the prompt
# matches the template the model was trained on.
prompt = "### Instruction:\nMove all *.JPG, *.JPEG, *.MOV in current directory recursively to ~/Images.\n\n### Output:\n"
result = runExample(prompt)
print(result)

In [None]:
import shutil

# Build finetuned_shell_model.zip with the standard library instead of the `zip`
# shell command, so the cell does not depend on an external binary.
# base_dir keeps "finetuned_shell_model/" as the top-level folder inside the archive.
shutil.make_archive(
    "finetuned_shell_model",
    "zip",
    root_dir=".",
    base_dir="finetuned_shell_model",
)

# The browser download helper only exists on Google Colab; elsewhere the archive
# simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print("Not running on Google Colab; finetuned_shell_model.zip was left in the working directory.")
else:
    files.download('finetuned_shell_model.zip')