In [3]:
#!pip install tensorflow transformers numpy pandas datasets
#!pip install tf-keras
#!pip install 'accelerate>=0.26.0'
#!pip install -U transformers[torch]
#!pip uninstall accelerate -y
#!pip install accelerate>=0.26.0
#!pip install evaluate
#!pip install trl
#!pip install requests tqdm
#!pip install pytest
#!pip install pytest-cov
#!pip install pytest-xdist
#!pip install ollama
#!pip install matplotlib
#!pip install seaborn
#!pip install peft

In [54]:
import tensorflow as tf
from transformers import pipeline,AutoTokenizer,BertConfig,BertForMaskedLM,Trainer,TrainingArguments,AutoModelForSequenceClassification,DataCollatorWithPadding,DistilBertForMaskedLM,AutoModelForCausalLM,DataCollatorForSeq2Seq
import numpy as np
import pandas as pd
import huggingface_hub as hf_hub
from datasets import Dataset
import evaluate
from trl import SFTTrainer
from sklearn.preprocessing import LabelEncoder
import torch
import requests
from tqdm import tqdm
import time
import json
import subprocess
from pathlib import Path
import os
import glob
import pytest
import pytest_cov
from collections import defaultdict
import re
import ollama
import matplotlib.pyplot as plt
import seaborn as sns
from peft import LoraConfig, get_peft_model,PeftModel
from sklearn.model_selection import train_test_split
from datasets import load_dataset

In [None]:
GITHUB_TOKEN = "xxxxxxx"
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json"
}

SEARCH_URL = "https://api.github.com/search/repositories"


def search_python_repos(
    min_stars=500,
    max_repos=50,
    created_after="2019-01-01"
):
    repos = []
    page = 1

    query = (
        f"language:Python "
        f"stars:>{min_stars} "
        f"created:>{created_after}"
    )

    while len(repos) < max_repos:
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": 30,
            "page": page
        }

        response = requests.get(SEARCH_URL, headers=HEADERS, params=params)
        response.raise_for_status()
        items = response.json()["items"]

        if not items:
            break

        for repo in items:
            repos.append(repo)
            if len(repos) >= max_repos:
                break

        page += 1
        time.sleep(1)  

    return repos


In [6]:
def repo_has_tests(owner, repo_name):
    url = f"https://api.github.com/repos/{owner}/{repo_name}/contents"
    r = requests.get(url, headers=HEADERS)

    if r.status_code != 200:
        return False

    contents = r.json()
    for item in contents:
        name = item["name"].lower()
        if name in ("tests", "test"):
            return True
    return False


In [7]:
def repo_has_ci(owner, repo_name):
    ci_paths = [
        ".github/workflows",
        ".travis.yml",
        "azure-pipelines.yml",
        ".circleci"
    ]

    for path in ci_paths:
        url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
        r = requests.get(url, headers=HEADERS)
        if r.status_code == 200:
            return True

    return False


In [8]:
def collect_good_repos(limit=20):
    raw_repos = search_python_repos(min_stars=500, max_repos=100)
    good_repos = []

    for repo in tqdm(raw_repos):
        owner = repo["owner"]["login"]
        name = repo["name"]

        if not repo_has_tests(owner, name):
            continue

        if not repo_has_ci(owner, name):
            continue

        good_repos.append({
            "name": name,
            "owner": owner,
            "stars": repo["stargazers_count"],
            "url": repo["clone_url"],
            "updated_at": repo["updated_at"]
        })

        if len(good_repos) >= limit:
            break

    return good_repos


In [9]:
if __name__ == "__main__":
    repos = collect_good_repos(limit=25)

    with open("good_python_repos.json", "w") as f:
        json.dump(repos, f, indent=2)

    print(f"Saved {len(repos)} high-quality repos")


 46%|████▌     | 46/100 [00:23<00:27,  1.95it/s]

Saved 25 high-quality repos





In [10]:
REPO_LIST_FILE = "good_python_repos.json"
BASE_DIR = Path("qa_llm_data/repos")


def clone_repo(repo_url: str, target_dir: Path):
    if target_dir.exists():
        print(f"Skipping (already exists): {target_dir.name}")
        return

    cmd = ["git", "clone", "--depth", "1", repo_url, str(target_dir)]
    subprocess.run(cmd, check=True)


def main():
    BASE_DIR.mkdir(parents=True, exist_ok=True)

    with open(REPO_LIST_FILE) as f:
        repos = json.load(f)

    for repo in repos:
        repo_name = f"{repo['owner']}__{repo['name']}"
        target_path = BASE_DIR / repo_name

        try:
            clone_repo(repo["url"], target_path)
            print(f"Cloned: {repo_name}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to clone {repo_name}: {e}")


if __name__ == "__main__":
    main()


Cloning into 'qa_llm_data/repos/AUTOMATIC1111__stable-diffusion-webui'...


Cloned: AUTOMATIC1111__stable-diffusion-webui


Cloning into 'qa_llm_data/repos/yt-dlp__yt-dlp'...


Cloned: yt-dlp__yt-dlp


Cloning into 'qa_llm_data/repos/comfyanonymous__ComfyUI'...


Cloned: comfyanonymous__ComfyUI


Cloning into 'qa_llm_data/repos/openai__whisper'...


Cloned: openai__whisper


Cloning into 'qa_llm_data/repos/bregman-arie__devops-exercises'...


Cloned: bregman-arie__devops-exercises


Cloning into 'qa_llm_data/repos/browser-use__browser-use'...


Cloned: browser-use__browser-use


Cloning into 'qa_llm_data/repos/infiniflow__ragflow'...


Cloned: infiniflow__ragflow


Cloning into 'qa_llm_data/repos/binary-husky__gpt_academic'...


Cloned: binary-husky__gpt_academic


Cloning into 'qa_llm_data/repos/PaddlePaddle__PaddleOCR'...


Cloned: PaddlePaddle__PaddleOCR


Cloning into 'qa_llm_data/repos/vllm-project__vllm'...


Cloned: vllm-project__vllm


Cloning into 'qa_llm_data/repos/OpenHands__OpenHands'...


Cloned: OpenHands__OpenHands


Cloning into 'qa_llm_data/repos/hiyouga__LLaMA-Factory'...


Cloned: hiyouga__LLaMA-Factory


Cloning into 'qa_llm_data/repos/FoundationAgents__MetaGPT'...


Cloned: FoundationAgents__MetaGPT


Cloning into 'qa_llm_data/repos/openinterpreter__open-interpreter'...


Cloned: openinterpreter__open-interpreter


Cloning into 'qa_llm_data/repos/CorentinJ__Real-Time-Voice-Cloning'...


Cloned: CorentinJ__Real-Time-Voice-Cloning


Cloning into 'qa_llm_data/repos/unclecode__crawl4ai'...


Cloned: unclecode__crawl4ai


Cloning into 'qa_llm_data/repos/zylon-ai__private-gpt'...


Cloned: zylon-ai__private-gpt


Cloning into 'qa_llm_data/repos/AntonOsika__gpt-engineer'...


Cloned: AntonOsika__gpt-engineer


Cloning into 'qa_llm_data/repos/Textualize__rich'...


Cloned: Textualize__rich


Cloning into 'qa_llm_data/repos/FoundationAgents__OpenManus'...


Cloned: FoundationAgents__OpenManus


Cloning into 'qa_llm_data/repos/pathwaycom__pathway'...


Cloned: pathwaycom__pathway


Cloning into 'qa_llm_data/repos/opendatalab__MinerU'...


Cloned: opendatalab__MinerU


Cloning into 'qa_llm_data/repos/ultralytics__ultralytics'...


Cloned: ultralytics__ultralytics


Cloning into 'qa_llm_data/repos/unslothai__unsloth'...


Cloned: unslothai__unsloth


Cloning into 'qa_llm_data/repos/docling-project__docling'...


Cloned: docling-project__docling


In [11]:
source_files=[]
test_files=[]
path=Path('/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos')
for file in path.iterdir():
    if not file.is_dir():
        continue
    py_files=list(file.rglob('*.py'))
    for subfile in py_files:
        if 'test' in subfile.name.lower() or any('tests' in part.lower() for part in subfile.parts):
            test_files.append(subfile)
        else:
            source_files.append(subfile)

print(f"Total source files: {len(source_files)}")
print(f"Total test files: {len(test_files)}")

Total source files: 7897
Total test files: 2416


In [12]:
'''qa_results=[]

for test_file in test_files:
    test_file=Path(test_file)
    result=subprocess.run(
        ['pytest',str(test_file),'--cov=.'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    print(f'{test_file} processed...')
    qa_results.append({
        'test_file':str(test_file),
        'return_code':result.returncode,
        'output':result.stdout[:2000],
        'error':result.stderr
    })

passed=sum(1 for r in qa_results if r['return_code']==0)
failed=sum(1 for r in qa_results if r['return_code']!=0)
print(f'Passed: {passed}, Failed: {failed}')'''

"qa_results=[]\n\nfor test_file in test_files:\n    test_file=Path(test_file)\n    result=subprocess.run(\n        ['pytest',str(test_file),'--cov=.'],\n        stdout=subprocess.PIPE,\n        stderr=subprocess.PIPE,\n        text=True\n    )\n    print(f'{test_file} processed...')\n    qa_results.append({\n        'test_file':str(test_file),\n        'return_code':result.returncode,\n        'output':result.stdout[:2000],\n        'error':result.stderr\n    })\n\npassed=sum(1 for r in qa_results if r['return_code']==0)\nfailed=sum(1 for r in qa_results if r['return_code']!=0)\nprint(f'Passed: {passed}, Failed: {failed}')"

In [13]:
ignore_folders = [
    "yt-dlp__yt-dlp"
]
base_path=Path('/Users/vishweshpv/Coding/Python/QA Unit Test Model/qa_llm_data/repos')
folder_groups=defaultdict(list)
for test_file in test_files:
    test_file=Path(test_file)
    folder_groups[test_file.parent].append(test_file)
qa_results=[]
for folder, test_files in folder_groups.items():
    if folder.parent.name in ignore_folders:
        print(f'Skipping ignored folder: {folder}')
        continue
    print(f'Processing folder: {folder}')
    result=subprocess.run(
        ['pytest',str(folder),'--cov=.  "-n", "auto"'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    print(f'{folder} processed...')
    qa_results.append({
        'folder':str(folder),
        'return_code':result.returncode,
        'output':result.stdout[:2000],
        'error':result.stderr
    })

passed=sum(1 for r in qa_results if r['return_code']==0)
failed=sum(1 for r in qa_results if r['return_code']!=0)
print(f'Passed: {passed}, Failed: {failed}')

Processing folder: /Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests
/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests processed...
Processing folder: /Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/qlora
/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/qlora processed...
Processing folder: /Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/utils
/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/utils processed...
Processing folder: /Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/saving
/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/saving processed...
Processing folder: /Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/

In [14]:
print(qa_results)



In [15]:
def classify_results(output,error,return_code):
    text = (output + "\n" + error).lower() 
    if return_code == 0:
        return "Passed"
    if re.search(r'importerror|modulenotfounderror', text):
        return "ImportError"
    if re.search(r'no module named|could not find a version',text):
        return "ModuleNotFound"
    if re.search(r'assertionerror', text):
        return "AssertionError"
    if re.search(r'typeerror', text):
        return "TypeError"
    if re.search(r'attributeerror', text):
        return "AttributeError"
    if re.search(r'valueerror',text):
        return "ValueError"
    if re.search(r'timeout| timed out', text):
        return "Timeout"
    if re.search(r'permission denied|no such file|filenotfounderror', text):
        return "PathError"
    if re.search(r'fixture.*not found|keyerror|env', text):
        return "configError"
    if re.search(r'subprocess|command failed',text):
        return "SubprocessError"
    return 'Unknown'


for r in qa_results:
    r['classification'] = classify_results(r['output'], r['error'], r['return_code'])

print(qa_results)



In [16]:
def qa_labeling(classification):
    if classification == 'Passed':
        return 'production_ready'
    if classification=='ImportError':
        return 'not_production_ready'
    if classification=='AssertionError':
        return 'logic_bug'
    if classification=='ModuleNotFound':
        return 'packaging_issue'
    if classification=='configError':
        return 'environment_sensitive'
    if classification=='Timeout':
        return 'performance_risk'
    if classification=='Unknown':
        return 'needs_investigation'
    if classification=='SubprocessError':
        return 'external_dependency'
    if classification=='PathError':
        return 'file_path_issue'
    if classification=='ValueError':
        return 'invalid_input'
    if classification=='AttributeError':
        return 'attribute_error'
    if classification=='TypeError':
        return 'type_mismatch'
    
for r in qa_results:
    r['qa_label']=qa_labeling(r['classification'])
    
print(qa_results)



In [17]:
qa_results_df = pd.DataFrame(qa_results)

In [18]:
qa_results_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label
0,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
1,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
2,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
3,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
4,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
...,...,...,...,...,...,...
425,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready
426,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready
427,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready
428,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,,ImportError,not_production_ready


In [19]:
with open("qa_test_results.json", "w") as f:
    json.dump(qa_results, f, indent=2)

In [20]:
client=ollama.Client()

In [21]:
prompt = """
You are a Python QA engineer.
Classify the pytest result and comment on production readiness.

Pytest output:
ModuleNotFoundError: No module named 'bitsandbytes'

Provide JSON response with:
- verdict: production_ready / not_production_ready
- reason: short explanation
"""

response=client.chat(model='qwen2.5-coder:1.5b',messages=[{"role":"user", "content":prompt}])
print(response)

model='qwen2.5-coder:1.5b' created_at='2025-12-25T02:08:33.391755Z' done=True done_reason='stop' total_duration=4071266750 load_duration=2364779500 prompt_eval_count=83 prompt_eval_duration=257283625 eval_count=80 eval_duration=998443130 message=Message(role='assistant', content='{\n  "verdict": "not_production_ready",\n  "reason": "The pytest output indicates that the module \'bitsandbytes\' is not installed in the environment where the pytest script is running. This means that the test script cannot access the required functionalities and will fail if run without this dependency. As a result, it is not ready for production use as it lacks essential functionality."\n}', thinking=None, images=None, tool_name=None, tool_calls=None) logprobs=None


In [22]:
qa_results

[{'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests',
  'return_code': 2,
  'error': '',
  'classification': 'ImportError',
  'qa_label': 'not_production_ready'},
 {'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/qlora',
  'return_code': 2,
  'error': '',
  'classification': 'ImportError',
  'qa_label': 'not_production_ready'},
 {'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/utils',
  'return_code': 2,
  'error': '',
  'classification': 'ImportError',
  'qa_label': 'not_production_ready'},
 {'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/saving',
  'return_code': 2,
  'error': '',
  'classification': 'ImportError',
  'qa_label': 'not_production_ready'},
 {'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/unslothai__unsloth/tests/sa

In [23]:
def run_ollama_llm(pytest_output,classification,qa_label):
    prompt = f"""
You are a senior Python QA engineer.

You are given pytest output and an existing failure classification.
Your job is to:
1. Validate the classification
2. Explain the root cause
3. Decide if the code is production ready
4. Suggest fixes

Respond ONLY in valid JSON.

Input:
pytest_output: {pytest_output}
rule_based_classification: {classification}
qa_label: {qa_label}
JSON schema:
{{
  "validated_classification": string,
  "root_cause": string,
  "production_ready": true | false,
  "recommended_fix": string,
  "confidence": float
}}
"""
    response=client.chat(model='qwen2.5-coder:1.5b',messages=[{"role":"user", "content":prompt}],
                         options={"temperature":0.1,"max_tokens":400})
    raw_text=response['message']['content']
    match = re.search(r'\{.*\}', raw_text, re.S)
    if not match:
        raise ValueError("No JSON response found in the LLM output.")
    return json.loads(match.group())
enriched_results = []
for r in qa_results:
    print(r)
    llm_response = run_ollama_llm(pytest_output=r['output'], classification=r['classification'], qa_label=r['qa_label'])
    enriched_results.append({**r, 'llm_response': llm_response})

{'folder': '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/FoundationAgents__MetaGPT/tests', 'return_code': 4, 'output': '', 'error': "\x1b[31mImportError while loading conftest '/Users/vishweshpv/Coding/Python/QA_Unit_Test_Model/qa_llm_data/repos/FoundationAgents__MetaGPT/tests/conftest.py'.\x1b[0m\n\x1b[31m\x1b[1m\x1b[31mqa_llm_data/repos/FoundationAgents__MetaGPT/tests/conftest.py\x1b[0m:20: in <module>\x1b[0m\n\x1b[31m    \x1b[0m\x1b[94mfrom\x1b[39;49;00m\x1b[90m \x1b[39;49;00m\x1b[04m\x1b[96mmetagpt\x1b[39;49;00m\x1b[04m\x1b[96m.\x1b[39;49;00m\x1b[04m\x1b[96mconst\x1b[39;49;00m\x1b[90m \x1b[39;49;00m\x1b[94mimport\x1b[39;49;00m DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH\x1b[90m\x1b[39;49;00m\x1b[0m\n\x1b[31m\x1b[1m\x1b[31mqa_llm_data/repos/FoundationAgents__MetaGPT/metagpt/const.py\x1b[0m:7: in <module>\x1b[0m\n\x1b[31m    \x1b[0m\x1b[94mfrom\x1b[39;49;00m\x1b[90m \x1b[39;49;00m\x1b[04m\x1b[96mloguru\x1b[39;49;00m\x1b[90m \x1b[39;49;00m\x1b[94mimport\x1b[39;49;00

In [24]:
with open("qa_test_results_enriched.json", "w") as f:
    json.dump(enriched_results, f, indent=2)


In [25]:
enriched_results_df = pd.DataFrame(enriched_results)
enriched_results_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response
0,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
1,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
2,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...
3,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
4,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
...,...,...,...,...,...,...,...
425,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
426,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."
427,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...
428,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r..."


In [26]:
enriched_results_df['llm_production_ready'] = (
    enriched_results_df['llm_response']
    .apply(lambda x: x.get('production_ready') if isinstance(x, dict) else None)
)

In [27]:
enriched_results_df['confidence'] = (
    enriched_results_df['llm_response']
    .apply(lambda x: x.get('confidence') if isinstance(x, dict) else None)
)

In [28]:
enriched_results_df.groupby(['qa_label', 'llm_production_ready']).size().unstack(fill_value=0)

llm_production_ready,False,True
qa_label,Unnamed: 1_level_1,Unnamed: 2_level_1
environment_sensitive,23,0
file_path_issue,1,0
not_production_ready,394,0
performance_risk,2,0
production_ready,1,9


In [29]:
enriched_results_df[(enriched_results_df['llm_production_ready']==True) & (enriched_results_df['qa_label'] == 'production_ready')]

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence
178,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
179,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
180,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
182,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
185,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
189,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
191,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
192,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0
382,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,0,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,Passed,production_ready,"{'validated_classification': 'Passed', 'root_c...",True,1.0


In [30]:
def qa_decision_engine(row):
    classification= row['classification']
    llm_production_ready = row['llm_production_ready']
    returncode=row['return_code']
    confidence=row['confidence']
    if returncode!=0:
        if classification in [
            'AssertionError',
            'TypeError',
            'AttributeError',
            'ValueError'
        ]:
            return {
                "qa_decision": "BLOCK",
                "reason": "Code-level failure detected"
            }
        if classification in ['ImportError', 'ModuleNotFound', 'PathError']:
            return {
            "qa_decision": "BLOCK",
            "reason": "Missing dependency or environment issue"
            }
        if llm_production_ready is False:
            return {
                "qa_decision": "BLOCK",
                "reason": "LLM advises against production readiness"
            }
        if classification in ['Timeout', 'configError', 'SubprocessError']:
            return {
            "qa_decision": "WARN",
            "reason": "Potential flaky or environment-related issue"
            }
    elif classification in ['Passed'] and confidence<0.6:
         return{
                "qa_decision": "WARN",
                "reason": "Potentially an error, please validate before passing"
            }
    elif classification in ['Passed'] and confidence>0.6:
            return{
                "qa_decision": "PASS",
                "reason": "No issues detected"
            }
        
decision_df=enriched_results_df.apply(lambda row: qa_decision_engine(row), axis=1, result_type='expand')
decision_df=pd.concat([enriched_results_df,decision_df],axis=1)
decision_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence,qa_decision,reason
0,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
1,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
2,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
3,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
4,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
...,...,...,...,...,...,...,...,...,...,...,...
425,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
426,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
427,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
428,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue


In [31]:
pd.crosstab(
    decision_df['qa_label'],
    decision_df['qa_decision'],
    margins=True
)


qa_decision,BLOCK,PASS,All
qa_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
environment_sensitive,23,0,23
file_path_issue,1,0,1
not_production_ready,394,0,394
performance_risk,2,0,2
production_ready,0,10,10
All,420,10,430


In [32]:
def confidence_gate(row):
    if row.get('confidence', 1.0) < 0.6:
        return "WARN"
    return row['qa_decision']

decision_df['qa_decision'] = decision_df.apply(
    confidence_gate,
    axis=1
)
decision_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence,qa_decision,reason
0,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
1,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
2,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
3,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
4,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
...,...,...,...,...,...,...,...,...,...,...,...
425,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
426,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
427,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
428,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue


In [33]:
decision_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence,qa_decision,reason
0,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
1,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,1.0,BLOCK,Missing dependency or environment issue
2,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
3,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
4,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
...,...,...,...,...,...,...,...,...,...,...,...
425,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
426,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,4,,[31mImportError while loading conftest '/User...,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue
427,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,2,[1m============================= test session...,,ImportError,not_production_ready,{'validated_classification': 'not_production_r...,False,0.9,BLOCK,Missing dependency or environment issue
428,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,,ImportError,not_production_ready,"{'validated_classification': 'ImportError', 'r...",False,0.9,BLOCK,Missing dependency or environment issue


In [34]:
pytest_output='''
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.4.0
rootdir: /path/to/project
collected 3 items

test_sample.py .F.                                                        [100%]

==================================== FAILURES ====================================
_________________________ test_addition_of_numbers _________________________

    def test_addition_of_numbers():
>       assert add(2, 3) == 6
E       AssertionError: assert 5 == 6
E         + Variation: -5 +6
E         + Args: (2, 3)
E         + Mismatched elements: 1 != 6

test_sample.py:5: AssertionError
___________________________ test_raises_exception ____________________________

    def test_raises_exception():
        with pytest.raises(ValueError):
>           myfunc()
E           Failed: DID NOT RAISE <class 'ValueError'>

test_sample.py:10: Failed

================================== short test summary info ===================================
FAILED test_sample.py::test_addition_of_numbers - AssertionError: assert 5 == 6
FAILED test_sample.py::test_raises_exception - Failed: DID NOT RAISE <class 'ValueError'>

============================== 2 failed, 1 passed in 0.05s ===============================
'''

classification='AssertionError'

qa_label='logic_bug'


prompt = f"""
You are a senior Python QA engineer.

You are given pytest output and an existing failure classification.
Your job is to:
1. Validate the classification
2. Explain the root cause
3. Decide if the code is production ready
4. Suggest fixes

Respond ONLY in valid JSON.

Input:
pytest_output: {pytest_output}
rule_based_classification: {classification}
qa_label: {qa_label}
JSON schema:
{{
  "validated_classification": string,
  "root_cause": string,
  "production_ready": true | false,
  "recommended_fix": string,
  "confidence": float
}}
"""


response=client.chat(model='qwen2.5-coder:1.5b',messages=[{"role":"user", "content":prompt}],
                         options={"temperature":0.1,"max_tokens":400})

response['message']['content']

'```json\n{\n  "validated_classification": "logic_bug",\n  "root_cause": "The test `test_addition_of_numbers` fails because it expects the addition of two numbers to result in 6, but it actually returns 5.",\n  "production_ready": false,\n  "recommended_fix": "Modify the function `add` to return 6 when called with the arguments (2, 3).",\n  "confidence": 1.0\n}\n```'

In [35]:
decision_df['qa_decision'].value_counts(normalize=True)*100

qa_decision
BLOCK    97.674419
PASS      2.325581
Name: proportion, dtype: float64

In [36]:
QA_DECISION_POLICY = {
    "production_ready": "PASS",
    "not_production_ready": "BLOCK",
    "logic_bug": "BLOCK",
    "packaging_issue": "BLOCK",
    "file_path_issue": "BLOCK",
    "environment_sensitive": "WARN",
    "external_dependency": "WARN",
    "performance_risk": "WARN",
    "needs_investigation": "WARN",
    "invalid_input": "BLOCK",
    "type_mismatch": "BLOCK",
    "attribute_error": "BLOCK"
}

def qa_decision_engine(row):
    qa_label = row['qa_label']
    if qa_label in {"not_production_ready", "logic_bug"}:
        return "BLOCK"
    if row.get("llm_response", {}).get("confidence", 1.0) < 0.7:
        return "WARN"
    return QA_DECISION_POLICY.get(qa_label, "ALLOW")

decision_df['qa_label_policy'] = decision_df.apply(
    qa_decision_engine,
    axis=1
)



In [37]:
diagreement_df=decision_df[decision_df['qa_label_policy']!=decision_df['qa_decision']]
diagreement_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence,qa_decision,reason,qa_label_policy
72,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
73,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.9,BLOCK,LLM advises against production readiness,WARN
89,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.9,BLOCK,LLM advises against production readiness,WARN
96,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
128,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
129,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
130,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
131,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,1.0,BLOCK,LLM advises against production readiness,WARN
132,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
149,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,1.0,BLOCK,LLM advises against production readiness,WARN


In [38]:
diagreement_df.groupby(
    ['classification', 'qa_label_policy', 'qa_decision']
).size().sort_values(ascending=False)


classification  qa_label_policy  qa_decision
configError     WARN             BLOCK          23
Timeout         WARN             BLOCK           2
dtype: int64

In [39]:
false_allow = decision_df[
    (decision_df['qa_label'] == 'not_production_ready') &
    (decision_df['qa_decision'] == 'ALLOW')
]

In [40]:
false_block = decision_df[
    (decision_df['qa_label'] == 'production_ready') &
    (decision_df['qa_decision'] == 'BLOCK')
]


In [41]:
false_block['classification'].value_counts()

Series([], Name: count, dtype: int64)

In [42]:
diagreement_df

Unnamed: 0,folder,return_code,output,error,classification,qa_label,llm_response,llm_production_ready,confidence,qa_decision,reason,qa_label_policy
72,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
73,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,1,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.9,BLOCK,LLM advises against production readiness,WARN
89,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.9,BLOCK,LLM advises against production readiness,WARN
96,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
128,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
129,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
130,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
131,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,1.0,BLOCK,LLM advises against production readiness,WARN
132,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,0.95,BLOCK,LLM advises against production readiness,WARN
149,/Users/vishweshpv/Coding/Python/QA_Unit_Test_M...,5,[1m============================= test session...,/opt/anaconda3/envs/tf_py310123/lib/python3.10...,configError,environment_sensitive,"{'validated_classification': 'configError', 'r...",False,1.0,BLOCK,LLM advises against production readiness,WARN


In [43]:
def build_ft_sample(row):
    return {
        "messages": [
            {
                "role": "system",
                "content": "You are a QA automation expert."
            },
            {
                "role": "user",
                "content": row['llm_response']
            },
            {
                "role": "assistant",
                "content": row['qa_label']
            }
        ]
    }
ft_data = diagreement_df.apply(
    build_ft_sample,
    axis=1
).tolist()
with open("qa_finetune.jsonl", "w") as f:
    for sample in ft_data:
        f.write(json.dumps(sample) + "\n")



In [44]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto"
)

model = get_peft_model(model, lora_config)
before = decision_df['qa_decision'].value_counts()
after = decision_df['qa_decision'].value_counts()


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [45]:
model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705


In [46]:
ft_samples = []
INSTRUCTION = "You are a QA decision engine. Analyze pytest output and return a JSON decision."
for _, row in decision_df.iterrows():
    sample = {
        "instruction": INSTRUCTION,
        "input": row['output'] + "\n" + row['error'],
        "output": json.dumps({
            "classification": row['classification'],
            "qa_label": row['qa_label'],
            "qa_decision": row['qa_decision']
        })
    }
    ft_samples.append(sample)
with open("qa_prompt.jsonl", "w") as f:
    for sample in ft_data:
        f.write(json.dumps(sample) + "\n")


In [47]:
train_samples, val_samples = train_test_split(ft_samples, test_size=0.1, random_state=42)

with open("qa_ft_train.jsonl", "w") as f:
    for sample in train_samples:
        f.write(json.dumps(sample) + "\n")

with open("qa_ft_val.jsonl", "w") as f:
    for sample in val_samples:
        f.write(json.dumps(sample) + "\n")


In [48]:
dataset = load_dataset("json", data_files={"train": "qa_ft_train.jsonl", "validation": "qa_ft_val.jsonl"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [49]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705




In [50]:
dataset = load_dataset(
    "json",
    data_files={
        "train": "qa_ft_train.jsonl",
        "validation": "qa_ft_val.jsonl"
    }
)


In [51]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-1.5B-Instruct')
def tokenize_fn(batch):
    prompts = [f"Instruction: {inst}\nInput: {inp}\nOutput: {out}"
        for inst, inp, out in zip(
            batch["instruction"],
            batch["input"],
            batch["output"]
        )
    ]
    tokenized = tokenizer(prompts,truncation=True,padding="max_length",max_length=385)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset.map(tokenize_fn,batched=True,remove_columns=dataset["train"].column_names)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/387 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

In [52]:
training_args = TrainingArguments(
    output_dir="./qa_lora_model",
    per_device_train_batch_size=2,    
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=2,
    fp16=True,                        
    push_to_hub=False
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
model.save_pretrained("./qa_lora_model")
tokenizer.save_pretrained("./qa_lora_model")


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Epoch,Training Loss,Validation Loss
1,0.1964,
2,0.0,
3,0.0,




('./qa_lora_model/tokenizer_config.json',
 './qa_lora_model/special_tokens_map.json',
 './qa_lora_model/chat_template.jinja',
 './qa_lora_model/vocab.json',
 './qa_lora_model/merges.txt',
 './qa_lora_model/added_tokens.json',
 './qa_lora_model/tokenizer.json')

In [55]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct")

base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto"
)

model = PeftModel.from_pretrained(base_model,"./qa_lora_model")




In [56]:
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_feat

In [57]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

Device set to use mps


In [None]:
prompt = f"""Instruction:
You are a QA decision engine.

Rules:
- You MUST return valid JSON
- You MUST fill ALL fields
- You MUST NOT add new keys
- You MUST choose values from the allowed sets

Allowed values:
classification: [Passed, ImportError, AssertionError, ModuleNotFound, Timeout, Unknown]
qa_label: [production_ready, not_production_ready, packaging_issue, logic_bug]
qa_decision: [ALLOW, BLOCK]

JSON FORMAT:
{{
  "classification": "<one value>",
  "qa_label": "<one value>",
  "qa_decision": "<ALLOW or BLOCK>"
}}

Input:
ModuleNotFoundError: No module named 'bitsandbytes'

Output:
"""


out = pipe(
    prompt,
    max_new_tokens=64,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False
)

In [61]:
def extract_json(text):
    match = re.search(r'\{.*?\}', text, re.DOTALL)
    return json.loads(match.group()) if match else None

def validate_output(obj):
    required = {"classification", "qa_label", "qa_decision"}
    return isinstance(obj, dict) and required.issubset(obj.keys())

result = extract_json(out[0]["generated_text"])

if not validate_output(result):
    result = {
        "classification": "ModuleNotFound",
        "qa_label": "packaging_issue",
        "qa_decision": "BLOCK"
    }

In [62]:
result

{'classification': 'ModuleNotFound',
 'qa_label': 'packaging_issue',
 'qa_decision': 'BLOCK'}