#Data Collection and Preprocessing

In [1]:
!mkdir -p data/
!cd data/

In [2]:
!pip install requests beautifulsoup4



##StackOverflow scraper

In [3]:
!pip install html2text stackapi

Collecting html2text
  Downloading html2text-2025.4.15-py3-none-any.whl.metadata (4.1 kB)
Collecting stackapi
  Downloading StackAPI-0.3.1-py3-none-any.whl.metadata (2.3 kB)
Downloading html2text-2025.4.15-py3-none-any.whl (34 kB)
Downloading StackAPI-0.3.1-py3-none-any.whl (7.2 kB)
Installing collected packages: html2text, stackapi
Successfully installed html2text-2025.4.15 stackapi-0.3.1


In [4]:
import requests
import json
import os
import time
from bs4 import BeautifulSoup

TAGS = ["git", "bash", "grep", "awk", "sed", "curl", "wget", "tar", "gzip",
    "find", "chmod", "chown", "ssh", "scp", "makefile", "docker", "apt",
    "yum", "venv", "pip", "tmux", "zsh", "crontab"]
OUTPUT_DIR = "data"
OUTPUT_FILE = "qa_pairs.json"
MIN_QA_PAIRS = 150
API_KEY = "rl_9yaCcyuncF5mmh3a9WGNiZAAa"

def clean_html(raw_html):
    return BeautifulSoup(raw_html, "html.parser").get_text().strip()

def fetch_top_answer(q_id):
    url = f"https://api.stackexchange.com/2.3/questions/{q_id}/answers"
    params = {
        'order': 'desc',
        'sort': 'votes',
        'site': 'stackoverflow',
        'pagesize': 1,
        'filter': 'withbody'
    }
    if API_KEY:
        params['key'] = API_KEY

    resp = requests.get(url, params=params)
    if resp.status_code != 200:
        print(f"  Answer fetch failed: {resp.status_code}")
        return None

    answers = resp.json().get("items", [])
    if not answers:
        return None

    return clean_html(answers[0].get("body", ""))


def fetch_qa(tag, max_pages=4):
    qa_pairs = []
    for page in range(1, max_pages + 1):
        print(f"Fetching page {page} for tag '{tag}'...")
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            'order': 'desc',
            'sort': 'votes',
            'tagged': tag,
            'site': 'stackoverflow',
            'pagesize': 20,
            'page': page,
            'filter': 'withbody'
        }
        if API_KEY:
            params['key'] = API_KEY

        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            print(f"  Error: {resp.status_code} - {resp.text}")
            continue

        for q in resp.json().get("items", []):
            if not q.get("is_answered"):
                continue
            q_id = q.get("question_id")
            question = clean_html(q.get("title", ""))
            answer = fetch_top_answer(q_id)
            if question and answer:
                qa_pairs.append({
                    "question": question,
                    "answer": answer,
                    "tag": tag
                })
        time.sleep(0.5)
    return qa_pairs


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    all_qa = []
    for tag in TAGS:
        tag_qa = fetch_qa(tag)
        all_qa.extend(tag_qa)
        print(f"Collected {len(tag_qa)} Q&A pairs for '{tag}'")


    print(f"Total Q&A pairs collected: {len(all_qa)}")
    with open(os.path.join(OUTPUT_DIR, OUTPUT_FILE), "w", encoding="utf-8") as f:
        json.dump(all_qa, f, indent=2, ensure_ascii=False)
    print("Saved dataset to", os.path.join(OUTPUT_DIR, OUTPUT_FILE))

if __name__ == "__main__":
     main()

Fetching page 1 for tag 'git'...
Fetching page 2 for tag 'git'...
Fetching page 3 for tag 'git'...
Fetching page 4 for tag 'git'...
Collected 80 Q&A pairs for 'git'
Fetching page 1 for tag 'bash'...
Fetching page 2 for tag 'bash'...
Fetching page 3 for tag 'bash'...
Fetching page 4 for tag 'bash'...
Collected 80 Q&A pairs for 'bash'
Fetching page 1 for tag 'grep'...
Fetching page 2 for tag 'grep'...
Fetching page 3 for tag 'grep'...
Fetching page 4 for tag 'grep'...
Collected 80 Q&A pairs for 'grep'
Fetching page 1 for tag 'awk'...
Fetching page 2 for tag 'awk'...
Fetching page 3 for tag 'awk'...
Fetching page 4 for tag 'awk'...
Collected 80 Q&A pairs for 'awk'
Fetching page 1 for tag 'sed'...
Fetching page 2 for tag 'sed'...
Fetching page 3 for tag 'sed'...
Fetching page 4 for tag 'sed'...
Collected 80 Q&A pairs for 'sed'
Fetching page 1 for tag 'curl'...
Fetching page 2 for tag 'curl'...
Fetching page 3 for tag 'curl'...
Fetching page 4 for tag 'curl'...
Collected 80 Q&A pairs for 'c


If you meant to use Beautiful Soup to parse the contents of a file on disk, then something has gone wrong. You should open the file first, using code like this:

    filehandle = open(your filename)

You can then feed the open filehandle into Beautiful Soup instead of using the filename.



    
  return BeautifulSoup(raw_html, "html.parser").get_text().strip()


Collected 80 Q&A pairs for 'pip'
Fetching page 1 for tag 'tmux'...
Fetching page 2 for tag 'tmux'...
Fetching page 3 for tag 'tmux'...
Fetching page 4 for tag 'tmux'...
Collected 80 Q&A pairs for 'tmux'
Fetching page 1 for tag 'zsh'...
Fetching page 2 for tag 'zsh'...
Fetching page 3 for tag 'zsh'...
Fetching page 4 for tag 'zsh'...
Collected 80 Q&A pairs for 'zsh'
Fetching page 1 for tag 'crontab'...
Fetching page 2 for tag 'crontab'...
Fetching page 3 for tag 'crontab'...
Fetching page 4 for tag 'crontab'...
Collected 80 Q&A pairs for 'crontab'
Total Q&A pairs collected: 1758
Saved dataset to data/qa_pairs.json


In [5]:
!pip install -q torch transformers peft datasets bitsandbytes accelerate scipy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
!pip install trl

Collecting trl
  Downloading trl-0.18.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.18.2-py3-none-any.whl (366 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.4/366.4 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets, trl
  Attempting uninstall: fsspec
    Found existing installation: fsspe

#Training

In [12]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
import warnings
import os


warnings.filterwarnings("ignore")


MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DATASET_PATH = "data/qa_pairs.json"
OUTPUT_DIR = "tinyllama_lora_adapter"
MAX_SEQ_LENGTH = 1024

def format_instruction(sample):
    """Format training samples to encourage command-only responses"""
    return (
        f"<|system|>\nYou are a CLI expert assistant. Generate ONLY step-by-step shell commands without explanations.</s>\n"
        f"<|user|>\n{sample['question']}</s>\n"
        f"<|assistant|>\n{sample['answer']}\n"
    )

def load_and_preprocess_data(file_path):
    """Load and format dataset with quality filtering"""
    with open(file_path, 'r') as f:
        data = json.load(f)


    filtered_data = [
        item for item in data
        if len(item['answer'].split()) > 3 and not item['answer'].startswith(('Sorry', 'I don\'t'))
    ]

    print(f"Filtered {len(data) - len(filtered_data)} low-quality samples")
    print(f"Using {len(filtered_data)} samples for training")


    formatted_texts = [format_instruction(item) for item in filtered_data]

    return {"text": formatted_texts}


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


os.makedirs(OUTPUT_DIR, exist_ok=True)


model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto" if device == "cuda" else None,
    torch_dtype=torch.float32
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare dataset
dataset_dict = load_and_preprocess_data(DATASET_PATH)
dataset = Dataset.from_dict(dataset_dict)


def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length"
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)


model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="adamw_torch",
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
    max_grad_norm=0.3
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)


print("Starting training...")
trainer.train()


model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Adapter saved to {OUTPUT_DIR}")


test_prompt = (
    "<|system|>\nYou are a CLI expert assistant. Generate step-by-step shell commands.</s>\n"
    "<|user|>\nHow to find all .log files modified in last 7 days?</s>\n"
    "<|assistant|>\n"
)

inputs = tokenizer(test_prompt, return_tensors="pt", return_attention_mask=False).to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    temperature=0.3,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.1
)

print("\n\nGenerated output:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

Using device: cuda
Filtered 6 low-quality samples
Using 1752 samples for training


Map:   0%|          | 0/1752 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044
Starting training...


Step,Training Loss
10,2.087
20,1.6915
30,1.6634
40,1.7009
50,1.6336
60,1.6585
70,1.5858
80,1.5092
90,1.6138
100,1.5045


Adapter saved to tinyllama_lora_adapter


Generated output:
 <|system|>
You are a CLI expert assistant. Generate step-by-step shell commands. 
<|user|>
How to find all .log files modified in last 7 days? 
<|assistant|>
find . -type f -mtime +7 -print0 | xargs -0 tail -n 1

This will give you the first log file that was modified within the last 7 days. If you want to get the last modified date, use:
find . -type f -mtime +7 -exec ls -l {} \; | awk '{print $6}'

If you want to get the last modified time of each file, then use:
find . -type f -mtime +7 -exec ls -lt {} \; | awk '{print $9}'

The -exec
