In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load the base checkpoint and its tokenizer; both are resolved from the same
# identifier so they are guaranteed to match.
model_name = "Qwen/Qwen2.5-Coder-1.5B"  # Swap in another Qwen2.5-Coder size here if needed (e.g., 7B)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


KeyboardInterrupt: 

In [None]:

# Tokenization function
def tokenize_function(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is truncated or padded to exactly 512 tokens, and the
    encoded batch is returned as PyTorch tensors.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encoding_options)

# Read the raw corpora, one example per line.
# The encoding is explicit so the read does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which can mangle or reject
# non-ASCII characters in source code.
# Whitespace-only lines are skipped: they tokenize to (almost) nothing and
# give the model no next-token target to learn from. Kept lines are left
# untouched, including their trailing newline.
with open("data/train_data.txt", "r", encoding="utf-8") as f:
    train_data = [line for line in f if line.strip()]
with open("data/val_data.txt", "r", encoding="utf-8") as f:
    val_data = [line for line in f if line.strip()]

# Tokenize each split in a single batched call.
train_tokenized = tokenize_function(train_data)
val_tokenized = tokenize_function(val_data)


In [None]:

# Convert to a format suitable for Trainer
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over an already tokenized batch.

    Subclasses ``torch.utils.data.Dataset`` so that data loaders and other
    tooling recognise it as a regular PyTorch dataset.

    Args:
        tokenized_data: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each indexable by example position and of equal length.

    Raises:
        ValueError: If the two entries do not hold the same number of examples.
    """

    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        # Fail at construction time rather than with an IndexError mid-epoch.
        if len(self.input_ids) != len(self.attention_mask):
            raise ValueError(
                "input_ids and attention_mask must contain the same number of "
                f"examples, got {len(self.input_ids)} and {len(self.attention_mask)}"
            )

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` pair stored at ``idx``."""
        return {"input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx]}

train_dataset = SimpleDataset(train_tokenized)
val_dataset = SimpleDataset(val_tokenized)

# Data collator for causal language modeling (mlm=False).
# NOTE(review): per the transformers docs the collator copies input_ids into
# labels (ignoring padding); the next-token shift is applied inside the
# model's loss rather than here — confirm for the installed version.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:

# Configure LoRA
lora_config = LoraConfig(
    r=8,              # Rank of the adaptation matrices
    lora_alpha=32,    # Scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention projections to adapt
    lora_dropout=0.1,
    # Declare the task explicitly so PEFT returns its causal-LM wrapper
    # instead of the generic one; the string form avoids an extra import.
    task_type="CAUSAL_LM",
)
# NOTE: this rebinds `model` to the wrapped model, so the cell is not
# idempotent. Reload the base model before running it a second time.
model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_qwen_ballerina",
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # switch if the installed version warns about or rejects this name.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16 mixed precision is rejected on hosts without a supported
    # accelerator. Enable it only when CUDA is present so the notebook still
    # runs (in full precision) on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # Checkpointing uses the same cadence as evaluation, which
    # load_best_model_at_end needs in order to pick the best checkpoint.
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize Trainer
# Wires together the LoRA-wrapped model, the training arguments, both dataset
# splits and the collator that assembles each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)


In [None]:

# Train the model
# Runs the fine-tuning loop with the arguments, datasets and collator the
# trainer was constructed with.
trainer.train()


In [None]:

# Save the fine-tuned model and the tokenizer into the same directory so it can
# be reloaded on its own later.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably writes only the LoRA adapter weights/config rather than the full
# base model — confirm.
model.save_pretrained("./finetuned_qwen_ballerina")
tokenizer.save_pretrained("./finetuned_qwen_ballerina")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the saved model and tokenizer from the fine-tuning output directory.
# This rebinds the notebook-level `model` and `tokenizer` names.
# NOTE(review): if the directory holds only a LoRA adapter, loading it through
# AutoModelForCausalLM presumably relies on transformers' PEFT integration
# (peft installed, base checkpoint reachable) — confirm.
model_path = "./finetuned_qwen_ballerina"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Function to generate Ballerina code completions
def generate_code_completion(prompt, max_length=200, temperature=0.7, top_p=0.9):
    """Sample a code completion for ``prompt`` from the notebook-level model.

    Args:
        prompt: Text to condition generation on.
        max_length: Maximum number of *new* tokens to generate. It is
            forwarded as ``max_new_tokens``, so the prompt is not counted.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The first generated sequence decoded to text with special tokens
        removed. For a causal LM this sequence begins with the prompt tokens,
        so the prompt text is part of the returned string.
    """
    # Tokenize and move the tensors to wherever the model lives, so the call
    # also works when the model has been placed on an accelerator.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly: with pad_token_id set to EOS it cannot
            # be inferred reliably from the input ids.
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the single returned sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage: the prompt is a `//` line comment describing the task, and the
# printed text is whatever generate_code_completion returns for it.
prompt = "// Write a Ballerina function to calculate factorial\n"
completion = generate_code_completion(prompt)
print(completion)