# Fine-Tuning Phi-3-mini-128k-instruct to Learn the Swift Programming Language

This notebook fine-tunes Microsoft's Phi-3-mini-128k-instruct model with LoRA adapters on a dataset of real Swift source files, so that the model can better understand and generate Swift code.

In [None]:
# Install required packages
!pip install transformers datasets evaluate torch scikit-learn tqdm dropbox requests accelerate peft bitsandbytes

In [None]:
import os

import numpy as np
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load the model and tokenizer
model_name = "microsoft/phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding so batches can be padded later.
tokenizer.pad_token = tokenizer.eos_token

# Load the model with 8-bit quantization for memory efficiency.
# Quantization options are passed through a BitsAndBytesConfig object; passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)

In [None]:
# Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

# Configure LoRA.
# Phi-3 fuses the query/key/value projections into a single `qkv_proj` layer,
# so separate `q_proj`/`k_proj`/`v_proj` names match nothing in this
# architecture. Target the fused projection plus the attention output
# projection so the adapters cover the whole attention block.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["qkv_proj", "o_proj"],
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Sanity check: report how many parameters the adapters actually train.
model.print_trainable_parameters()

In [None]:
# Load and prepare the Swift code dataset
def load_swift_files(directory="code_by_language/Swift", max_files=1000):
    """Read Swift source files from ``directory`` into a list of records.

    Only entries whose name ends with ``.swift`` are considered. Candidates
    are sorted by name so the selection is deterministic, and ``max_files``
    limits the number of Swift files read (non-Swift entries in the directory
    do not count against the limit).

    Args:
        directory: Path of the directory to scan (not recursive).
        max_files: Maximum number of ``.swift`` files to read; ``None`` reads
            all of them.

    Returns:
        A list of ``{"text": <file content>}`` dicts, one per readable,
        non-blank Swift file.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    # Filter first, then limit: slicing the raw directory listing would let
    # unrelated files consume the `max_files` budget.
    swift_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".swift")
    )

    files = []
    for filename in swift_names[:max_files]:
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report unreadable files and keep going.
            print(f"Error reading {file_path}: {e}")
            continue
        if content.strip():  # Skip empty / whitespace-only files
            files.append({"text": content})
    return files

# Load Swift files
swift_files = load_swift_files()
print(f"Loaded {len(swift_files)} Swift files")

# Fail early with a clear message instead of an obscure error from the split.
if not swift_files:
    raise ValueError("No Swift files were loaded; check the dataset directory.")

# Create a dataset and split it into train / evaluation subsets.
# A fixed seed makes the split reproducible across kernel restarts.
dataset = Dataset.from_list(swift_files).train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(f"Training set: {len(train_dataset)} examples")
print(f"Evaluation set: {len(eval_dataset)} examples")

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to 2048 tokens."""
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=2048)
    return encoded

# Both splits are processed identically: batched mapping, dropping the raw text column.
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_train = train_dataset.map(tokenize_function, **map_options)
tokenized_eval = eval_dataset.map(tokenize_function, **map_options)

In [None]:
# Configure the data collator for causal language modeling.
# The collator pads each batch dynamically via the tokenizer and builds the
# `labels` tensor from `input_ids`. It does not accept `padding` or
# `truncation` arguments; truncation is already applied at tokenization time.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./phi-3-swift-finetuned",
    # `evaluation_strategy` was renamed to `eval_strategy`; the old name is
    # rejected by current transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Must match the evaluation strategy for `load_best_model_at_end`.
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    # The PEFT wrapper hides the base model's forward signature, so tell the
    # Trainer explicitly which batch key holds the labels. This is a
    # TrainingArguments option, not a Trainer keyword argument.
    label_names=["labels"],
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

In [None]:
# Train the model: run the fine-tuning loop on `trainer`, which was built from
# `training_args`, `tokenized_train`, and `tokenized_eval`.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
final_dir = "./phi-3-swift-finetuned-final"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)

In [None]:
# Test the model with a Swift code prompt
test_prompt = """
Write a Swift function that sorts an array of integers using the quicksort algorithm.
"""

# Switch to inference mode so LoRA dropout is disabled during generation.
model.eval()

# Send the inputs to wherever the model was placed by `device_map="auto"`
# instead of assuming a device named "cuda".
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)