# Web scraper
This is the function I used to scrape Figma pages for information, which I then processed using an LLM.

In [80]:
import requests
from bs4 import BeautifulSoup

# Fetch the Figma resource page and parse it into a BeautifulSoup tree.
url = "https://www.figma.com/resource-library/graphic-design-principles/"
# requests applies no timeout by default; without one this cell can hang
# indefinitely if the server never answers.
response = requests.get(url, timeout=30)
print(f"Status code: {response.status_code}")
# Fail loudly on 4xx/5xx instead of silently scraping an error page.
response.raise_for_status()
# NOTE(review): "html" names a parser feature rather than a specific parser, so
# the parser actually used may depend on what is installed — consider naming
# one explicitly (e.g. "html.parser") for repeatable output.
soup = BeautifulSoup(response.text, "html")

import re
def remove_html_tags(text):
  """Return `text` with every `<...>` span removed.

  The match is non-greedy and `.` does not cross newlines, so a tag that is
  split over several lines is left untouched.
  """
  return re.sub('<.*?>', '', text)
# Pull the text out of the parsed page, strip any leftover `<...>` fragments,
# and persist the result to output.txt.
page_text = soup.get_text()
cleaned_text = remove_html_tags(page_text)
with open("output.txt", mode="w", encoding="utf-8") as out_file:
  out_file.write(cleaned_text)

Status code: 200


# Data formatting function
To turn the dataset into a form the Llama chat model understands, each human/assistant pair had to be rewritten as a single prompt wrapped in `[INST]` tags.

In [None]:
import json

def convert_qa_to_llama_format(input_file, output_file):
    """Rewrite a JSON list of human/assistant pairs as Llama-style prompts.

    Args:
        input_file: Path to a JSON file holding a list of objects, each with
            'human' and 'assistant' keys.
        output_file: Path where the converted list is written as JSON.

    Each pair becomes ``{"text": "<s>[INST] human [/INST] assistant</s>"}``.
    A record missing either key raises ``KeyError``.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        qa_pairs = json.load(src)

    records = [
        {"text": f"<s>[INST] {pair['human']} [/INST] {pair['assistant']}</s>"}
        for pair in qa_pairs
    ]

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(records, dst, indent=2, ensure_ascii=False)

# Convert the human/assistant pairs in content.json and write the result to llama.json.
convert_qa_to_llama_format('content.json', 'llama.json')

# Web dev dataset
This is the dataset we used to train the model on code and web dev principles

In [None]:
from datasets import load_dataset

# Load the dataset
import os

# Download CodeAlpaca-20k and keep a handle on its train split.
dataset = load_dataset("sahil2801/CodeAlpaca-20k")
train_data = dataset['train']

# Export the split to disk; the reformatting cell reads this file back
# line by line.
train_data.to_json('CodeAlpaca-20k.json')

# Quick sanity check: size of the split and its first record.
first_example = train_data[0]
print(f"Number of examples: {len(train_data)}")
print(first_example)

# Reformatting
Here I applied a variant of the earlier function to the downloaded dataset to get fully formatted data

In [None]:
import json

def convert_codealpaca_to_llama(input_file, output_file):
    """Convert a CodeAlpaca JSON Lines export into Llama-style prompts.

    Args:
        input_file: Path to a file with one JSON object per line. Each object
            must have 'instruction' and 'output'; 'input' is optional.
        output_file: Path where the list of ``{"text": ...}`` records is
            written as a single JSON document.

    Blank lines are skipped. When 'input' carries real content it is appended
    to the instruction after a blank line; an absent, null, empty or
    "<noinput>"-style placeholder value is ignored.
    """
    # Placeholder values that mean "this example has no extra input".
    no_input_markers = {'< noinput >', '<noinput>'}

    converted_data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)

            instruction = item['instruction'].strip()
            # `or ''` also covers an explicit JSON null, which `.get`'s default
            # does not and which would otherwise crash on `.strip()`.
            input_text = (item.get('input') or '').strip()
            output_text = item['output'].strip()

            if input_text and input_text not in no_input_markers:
                user_prompt = f"{instruction}\n\n{input_text}"
            else:
                user_prompt = instruction

            formatted_text = f"<s>[INST] {user_prompt} [/INST] {output_text}</s>"
            converted_data.append({"text": formatted_text})

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_data, f, indent=2, ensure_ascii=False)
    print(f"Converted {len(converted_data)} examples")

# Reformat the exported CodeAlpaca-20k.json file and write the result to llama_codealpaca.json.
convert_codealpaca_to_llama('CodeAlpaca-20k.json', 'llama_codealpaca.json')

Converted 20022 examples


# Supplementary dataset
Data from this was not directly used

In [None]:
import json
from datasets import load_dataset

def convert_oasst2_to_llama_fast(output_file="llama_oasst2.json", max_examples=10000):
    """Extract prompter/assistant pairs from OASST2 as Llama-style prompts.

    A pair is kept when the reply has role 'assistant', a parent id, a
    'review_result' equal to True and lang 'en', its parent has role
    'prompter', and both texts fall inside the length window checked below.

    Args:
        output_file: Path of the JSON file to write.
        max_examples: Stop scanning once this many pairs have been collected.

    Returns:
        The number of conversations written.
    """
    print("Loading dataset...")
    dataset = load_dataset("OpenAssistant/oasst2")
    train_split = dataset['train']

    print("Building message lookup...")
    # Index every message by id so a reply's parent can be fetched directly.
    msg_lookup = {}
    for message in train_split:
        msg_lookup[message['message_id']] = message

    converted_data = []
    print("Converting conversations...")
    for reply in train_split:
        if len(converted_data) >= max_examples:
            break

        # Cheap filters on the reply itself come first.
        if reply['role'] != 'assistant':
            continue
        if not reply['parent_id']:
            continue
        if not (reply.get('review_result', False) == True):
            continue

        # Then the parent must exist and be a user prompt, and the reply English.
        parent = msg_lookup.get(reply['parent_id'])
        if not parent or parent['role'] != 'prompter':
            continue
        if reply['lang'] != 'en':
            continue

        user_text = parent['text'].strip()
        assistant_text = reply['text'].strip()
        within_limits = 10 < len(user_text) < 2000 and 10 < len(assistant_text) < 4000
        if not within_limits:
            continue

        converted_data.append({"text": f"<s>[INST] {user_text} [/INST] {assistant_text}</s>"})

        kept = len(converted_data)
        if kept % 1000 == 0:
            print(f"Processed {kept} conversations...")

    print(f"Saving {len(converted_data)} conversations...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_data, f, indent=2, ensure_ascii=False)

    print(f"Done! Saved {len(converted_data)} conversations to {output_file}")
    return len(converted_data)
# Run with the defaults (llama_oasst2.json, at most 10000 pairs); `count` is the number saved.
count = convert_oasst2_to_llama_fast()

Loading dataset...
Building message lookup...
Converting conversations...
Processed 1000 conversations...
Processed 2000 conversations...
Processed 3000 conversations...
Processed 4000 conversations...
Processed 5000 conversations...
Processed 6000 conversations...
Processed 7000 conversations...
Processed 8000 conversations...
Processed 9000 conversations...
Processed 10000 conversations...
Saving 10000 conversations...
Done! Saved 10000 conversations to llama_oasst2.json


# Langchain dataset
The dataset used to train the model on langchain principles, downloaded and formatted

In [None]:
from datasets import load_dataset

# Download the LangChain documentation dataset that is converted further down in this cell.
dataset = load_dataset("hudsongeorge/langchain-docs")

# NOTE(review): howto_dataset is loaded here but not referenced anywhere else
# in the visible notebook — confirm whether it is still needed.
howto_dataset = load_dataset("LangChainDatasets/langchain-howto-queries")

def convert_langchain_docs_to_llama(dataset, output_file="lang.json"):
    """Convert question/answer rows of a dataset into Llama-style prompts.

    Args:
        dataset: Mapping with a 'train' entry that yields dict-like rows.
        output_file: Path of the JSON file to write. Defaults to "lang.json",
            the location this function has always written to.

    Returns:
        The list of ``{"text": ...}`` records that was written. Rows lacking
        either a 'question' or an 'answer' key are skipped.
    """
    converted = [
        {"text": f"<s>[INST] {item['question']} [/INST] {item['answer']}</s>"}
        for item in dataset['train']
        if 'question' in item and 'answer' in item
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted, f, indent=2, ensure_ascii=False)
    # Hand the records back so a caller that stores the result gets data, not None.
    return converted
# Run the conversion on the LangChain docs dataset; the records are written to lang.json.
converted = convert_langchain_docs_to_llama(dataset)

# AI principles of usage dataset

In [None]:
import json
from datasets import load_dataset

# Download the Pavithrars/AI_dataset dataset; its train split is reformatted below.
dataset = load_dataset("Pavithrars/AI_dataset")
def to_llama_format(example):
    """Build a Llama-style prompt, with a fixed system message, from one row.

    The question is taken from 'Question', then 'instruction'; the reply from
    'Answer', then 'output'. The first truthy value wins, and a canned
    placeholder is used when neither key yields one.

    Returns:
        A dict with a single 'text' key holding the formatted prompt.
    """
    def first_truthy(keys, fallback):
        # Mirrors an `a or b or fallback` chain over the given keys.
        for key in keys:
            value = example.get(key)
            if value:
                return value
        return fallback

    question = first_truthy(("Question", "instruction"), "Explain AI.")
    reply = first_truthy(("Answer", "output"), "Artificial Intelligence is...")
    text = (
        "<s>[INST] <<SYS>>\n"
        "You are a helpful assistant that explains how AI works.\n"
        "<</SYS>>\n\n"
        f"{question} [/INST] {reply} </s>"
    )
    return {"text": text}
# Run to_llama_format over every training row, then dump all resulting rows
# to disk as one JSON document.
train_dataset = dataset["train"].map(to_llama_format)
data_list = list(train_dataset)
with open("llama_ai_dataset.json", mode="w") as f:
    f.write(json.dumps(data_list, indent=2))

print("Saved llama_ai_dataset.json")

# Dataset Reformatting code 

In [None]:
import json

# Reload the exported rows and keep nothing but each row's "text" field.
with open("llama_ai_dataset.json", "r") as f:
    data = json.load(f)

cleaned = []
for item in data:
    # Rows without a "text" key are dropped rather than raising.
    if "text" not in item:
        continue
    cleaned.append({"text": item["text"]})

with open("llama_ai_dataset_clean.json", "w") as f:
    json.dump(cleaned, f, indent=2)

print(f"Cleaned dataset size: {len(cleaned)} examples")


Cleaned dataset size: 173 examples


# Additional AI Info dataset

In [None]:
import json
from datasets import load_dataset
# Download the lucadiliello/How_AI_Works dataset.
dataset = load_dataset("lucadiliello/How_AI_Works")

# Single source of truth for the output path, so the file that is written and
# the file named in the final message cannot drift apart.
output_path = "llama_ai_dataset2.json"

records = []

for item in dataset["train"]:
    # Try each candidate field in turn; fall back to a generic placeholder
    # when none of them holds a truthy value.
    question = item.get("title") or item.get("question") or "Explain this AI concept."
    answer = item.get("document") or item.get("content") or item.get("summary") or "No answer available."
    text = f"<s>[INST] <<SYS>>\nYou are a helpful assistant that explains how AI works.\n<</SYS>>\n\n{question} [/INST] {answer} </s>"

    records.append({"text": text})
with open(output_path, "w") as f:
    json.dump(records, f, indent=2)

print(f"✅ Saved {len(records)} examples in {output_path}")


## Fine Tuning
# After data consolidation into a final file, fine-tuning was run from these notebooks on Google Colab.
The code below uploads the dataset we used for tuning.

In [None]:
from google.colab import files
# Ask Colab for a file upload and keep the result in `uploaded`.
# NOTE(review): presumably this is how final.json reaches the runtime — confirm.
uploaded = files.upload()

# Necessary Installs

In [None]:
!pip install -q -U accelerate peft bitsandbytes transformers trl datasets huggingface_hub

# Huggingface login to access gated model

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; the
# meta-llama checkpoint loaded later is described above as a gated model.
notebook_login()

# Actual codebase used to fine tune model
QLoRA was used to fine-tune the loaded Llama 2 7B model (`meta-llama/Llama-2-7b-hf`). The LoRA config and trainer config are below, marked with comments.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# Loading Model in 4-bit (QLoRA)
# Switch consulted by the GPU capability check further down.
use_4bit = True
# Name of the torch dtype used for compute; resolved to the dtype object on the next line.
bnb_4bit_compute_dtype="float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# NOTE(review): load_in_4bit is hard-coded to True here rather than taken from
# use_4bit, so flipping use_4bit alone does not disable quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)

# Hub id of the checkpoint to fine-tune.
model_name = "meta-llama/Llama-2-7b-hf"

# Informational only: when computing in float16 on a GPU whose CUDA compute
# capability major version is 8 or higher, print a hint to use bf16.
# No setting is changed automatically.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the checkpoint with the quantization config built earlier in this cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Config overrides applied before training starts.
model.config.use_cache=False
model.config.pretraining_tp=1

# Tokenizer for the same checkpoint; the EOS token doubles as the padding
# token and padding is applied on the right.
# NOTE(review): this tokenizer is saved after training but is not passed to
# SFTTrainer in this cell — confirm the trainer ends up with the same settings.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token 
tokenizer.padding_side = "right"

# Loading Dataset
# Load the consolidated training file and hold out 10% of it for validation.
dataset = load_dataset("json", data_files="final.json", split="train")
# A fixed seed makes the shuffled split repeatable across kernel restarts, and
# the two parts are read by name instead of relying on dict value order.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = split["train"], split["test"]


# LoRA Config
# NOTE(review): lora_alpha (16) is smaller than r (32) — confirm this ratio is intentional.
peft_config = LoraConfig(
    r=32,                
    lora_alpha=16,        
    lora_dropout=0.05,    
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare the quantized model for training, then wrap it with the LoRA config.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)


# Training Arguments
# NOTE(review): no evaluation schedule is configured in these arguments —
# confirm the validation split passed to the trainer is actually evaluated.
training_args = TrainingArguments(
    output_dir="./qlora-llama7b-finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,     # 4 x 4 = 16 samples per optimizer step per device
    optim="paged_adamw_32bit",
    save_steps=500,  
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,                          # both mixed-precision flags are switched off
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,                        # presumably "no step cap", leaving num_train_epochs in charge — confirm
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    save_total_limit=2           
)


# Trainer Initialisation

# Build the supervised fine-tuning trainer from the train/validation splits.
# NOTE(review): `model` was already wrapped with get_peft_model earlier in this
# cell and peft_config is passed again here — confirm the installed TRL version
# does not apply or reject the adapter config a second time.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,      
    eval_dataset=val_ds,         
    peft_config=peft_config,
    args=training_args
)


# Actual Training

# Run the fine-tuning loop; the surrounding prints only mark start and end.
print("Starting training...")
trainer.train()
print("Training finished.")


# Saving Adapter post training
# The adapter is saved, not the full model. The adapter is the additional weights learned during fine-tuning.
# The tokenizer is written to the same directory so both can be loaded from one path.

model.save_pretrained("./qlora-llama7b-adapter")
tokenizer.save_pretrained("./qlora-llama7b-adapter")

print("Model and tokenizer saved successfully!")

# Final Rag content production
Here, most of the data used in fine-tuning was parsed into separate files for the RAG pipeline's vector DB

In [None]:
import json

def parse_json_to_continuous_text(json_file_path, output_file_path=None):
    """Collect the stripped 'text' field of every record in a JSON file.

    Args:
        json_file_path: Path to a JSON file containing a list of objects.
        output_file_path: Optional path; when given, each text is written
            there followed by a blank line.

    Returns:
        The list of texts (a record without 'text' contributes an empty
        string), or an empty list if anything fails. Failures are printed,
        never raised.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as src:
            records = json.load(src)

        texts = [record.get('text', '').strip() for record in records]

        if output_file_path:
            with open(output_file_path, 'w', encoding='utf-8') as dst:
                dst.writelines(text + '\n\n' for text in texts)
            print(f"Parsed text saved to: {output_file_path}")

        return texts

    except FileNotFoundError:
        print(f"Error: File '{json_file_path}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{json_file_path}'.")
    except Exception as e:
        print(f"Error parsing file: {str(e)}")
    # Every failure path falls through to the same empty result.
    return []
    
# Flatten the "text" field of every record in final.json into web_cont.txt.
parse_json_to_continuous_text("final.json", "web_cont.txt")

# Data conversions
Data was further converted into QA pairs for data extraction for part of the data used

In [None]:
import json
import re

def convert_to_qa_format(input_file, output_file):
    """Turn a JSON file of Llama-formatted records back into human/assistant pairs.

    Args:
        input_file: Path to a JSON list of objects with a 'text' field shaped
            like ``<s>[INST] question [/INST] answer</s>``.
        output_file: Path where the list of pairs is written as JSON.

    Returns:
        The list of ``{"human": ..., "assistant": ...}`` dicts. Records whose
        text does not match the expected shape are reported with a printed
        warning and left out.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)

    # DOTALL lets both captured parts span several lines.
    prompt_re = re.compile(r'<s>\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*</s>', re.DOTALL)

    converted_data = []
    for record in records:
        text = record.get('text', '')
        found = prompt_re.search(text)
        if found is None:
            print(f"Warning: Could not parse item: {text[:50]}...")
            continue
        instruction, response = (part.strip() for part in found.groups())
        converted_data.append({"human": instruction, "assistant": response})

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(converted_data, dst, indent=2, ensure_ascii=False)

    print(f"Conversion complete! Converted {len(converted_data)} items.")
    return converted_data

def convert_data_to_qa_format(data):
    """Turn already-loaded Llama-formatted records into human/assistant pairs.

    Args:
        data: Iterable of dict-like records with a 'text' field shaped like
            ``<s>[INST] question [/INST] answer</s>``.

    Returns:
        A list of ``{"human": ..., "assistant": ...}`` dicts. Records that do
        not match are skipped silently.
    """
    prompt_re = re.compile(r'<s>\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*</s>', re.DOTALL)
    hits = (prompt_re.search(record.get('text', '')) for record in data)
    return [
        {"human": hit.group(1).strip(), "assistant": hit.group(2).strip()}
        for hit in hits
        if hit
    ]

def convert_to_plain_text(data_or_file, output_file=None):
    """Render Llama-formatted records as plain "human:" / "assistant:" text.

    Args:
        data_or_file: Either a path (``str`` or ``os.PathLike``) to a JSON file
            holding a list of ``{"text": ...}`` records, or such a collection
            already loaded.
        output_file: Optional path; when given, the rendered text is also
            written there.

    Returns:
        One string in which each parsed record contributes a "human: ..." line
        and an "assistant: ..." line, with exactly one blank line between
        consecutive records. Records that do not match the
        ``<s>[INST] ... [/INST] ... </s>`` shape are skipped.
    """
    import os  # local import: only needed for the PathLike check below

    if isinstance(data_or_file, (str, os.PathLike)):
        with open(data_or_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    else:
        data = data_or_file

    # DOTALL lets both captured parts span several lines.
    pattern = re.compile(r'<s>\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*</s>', re.DOTALL)

    conversations = []
    for item in data:
        match = pattern.search(item.get('text', ''))
        if match:
            instruction = match.group(1).strip()
            response = match.group(2).strip()
            conversations.append(f"human: {instruction}\nassistant: {response}")

    # Joining whole conversations keeps exactly one blank line between them and
    # never leaves a trailing separator, even when the last record is unparseable.
    result_text = "\n\n".join(conversations)
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(result_text)
        print(f"Plain text format saved to {output_file}")

    return result_text

# Runs when this code executes as the main module (which includes running the
# cell in a notebook kernel): renders final.json as plain text in web_mid.txt.
if __name__ == "__main__":
  convert_to_plain_text("final.json", "web_mid.txt")

# Dummy run of fine tuned model

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base checkpoint, then attach the fine-tuned adapter on top of it.
# NOTE(review): unlike the training cell, no quantization config is passed here —
# confirm the runtime has enough memory for the unquantized 7B weights.
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# NOTE(review): "your-username/your-model-name" is a placeholder — replace it with
# the real adapter repo, or with the ./qlora-llama7b-adapter directory written by
# the training cell.
model = PeftModel.from_pretrained(base_model, "your-username/your-model-name")
tokenizer = AutoTokenizer.from_pretrained("your-username/your-model-name")