In [None]:
# Install all required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers
!pip install peft
!pip install datasets
!pip install accelerate
!pip install tqdm
!pip install bitsandbytes
!pip install sentencepiece
!pip install protobuf
!pip install evaluate
!pip install rouge_score
!pip install bert_score

# Verify installations
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_sc

In [None]:
# GPU VERSION - Remove TPU imports, add GPU imports
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from datasets import load_dataset
import torch.nn.utils.prune as prune
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import json

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# When running on a GPU, also report its name and total memory (in GB, base 10).
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")



Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB


In [None]:
from datasets import load_dataset

# Read the evaluation set from a local JSONL file. The "json" builder exposes
# the file's rows under the "train" split, which is requested directly here.
eval_dataset = load_dataset("json", data_files="./data/hf_eval.jsonl", split="train")

# Sanity check: row count and the first record.
print("Eval samples:", eval_dataset.num_rows)
print(eval_dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

Eval samples: 911
{'instruction': '#User\nDesign a feature for a social media website to recommend articles to users based on how similar the articles are to their previously liked articles.\n<Reference API>:{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "princeton-nlp/unsup-simcse-roberta-base", "api_call": "AutoModel.from_pretrained(\'princeton-nlp/unsup-simcse-roberta-base\')", "api_arguments": null, "python_environment_requirements": ["transformers"], "example_code": null, "performance": {"dataset": null, "accuracy": null}, "description": "An unsupervised sentence embedding model trained using the SimCSE approach with a Roberta base architecture."}\n###Assistant', 'output': "<<<domain>>>: Natural Language Processing Sentence Similarity\n<<<api_call>>>: AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')\n<<<api_provider>>>: Hugging Face Transformers\n<<<expla

In [None]:
# Print the dataset summary (its feature names and number of rows).
print(eval_dataset)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 911
})


In [None]:
# Load the tokenizer for the base checkpoint. Only the tokenizer is created in
# this cell; `model_name` is reused later as the base model identifier.
model_name = "tiiuae/Falcon3-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Add padding token if missing: fall back to the EOS token so that
# `tokenizer.pad_token_id` (passed to `generate` in generate_response) is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

In [None]:

# Base model name: the same checkpoint id that was used for the tokenizer.
BASE_MODEL = model_name

# NOTE(review): this message says "FINETUNED", but the call below loads only the
# base checkpoint; the LoRA adapter is attached separately via PeftModel.
print("\n🚀 Loading FINETUNED model (with LoRA)...")
base_for_lora = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    # Load the weights in half precision.
    dtype=torch.float16,
    # Device placement is delegated to the library.
    device_map="auto",
    # NOTE(review): this allows Python code shipped with the model repo to run
    # locally — confirm this checkpoint actually requires it.
    trust_remote_code=True
)



🚀 Loading FINETUNED model (with LoRA)...


config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [None]:
# Local directory holding the saved adapter.
# Presumably the checkpoint after the second training epoch (inferred from the
# directory name only — confirm).
FINETUNED_PATH = "./falcon3b_instruct_2ndepoch"

# Wrap the already-loaded base model with the adapter stored at FINETUNED_PATH.
finetuned_model = PeftModel.from_pretrained(base_for_lora, FINETUNED_PATH, device_map="auto")


In [None]:
import evaluate
# -----------------------------
# Metrics loaded here: ROUGE and BERTScore only. evaluate_outputs uses them to
# score the explanation field. The code metrics (Jaccard, CodeBERTScore, AST
# validity) and the exact-match checks for structured fields are implemented
# as plain functions in the evaluation cell, not loaded through `evaluate`.
# -----------------------------
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# eval_pipeline.py
import os
import re
import json
import ast
import numpy as np
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
import evaluate
from typing import Set



# -----------------------------
# Generation (keeps same signature and generation hyperparam names)
# -----------------------------
def generate_response(model, instruction, max_length=1024, max_new_tokens=768):
    """Generate the model's completion for a single instruction.

    The prompt is the instruction followed by a newline. It is tokenized with
    the notebook-level ``tokenizer`` and moved to ``model.device``.

    Args:
        model: Causal language model exposing ``device``, ``eval()`` and
            ``generate()``.
        instruction: Prompt text for the sample.
        max_length: Unused; kept only so existing callers keep working.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default (768) is the value that was previously hard-coded.

    Returns:
        The decoded completion only (prompt excluded), with surrounding
        whitespace stripped.
    """
    prompt = f"{instruction}\n"
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Remember how many tokens belong to the prompt so they can be cut off the
    # generated sequence below.
    prompt_len = inputs["input_ids"].shape[1]

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.1,
            top_p=0.95,
            repetition_penalty=1.2,
            no_repeat_ngram_size=0,  # Disable to allow <<<>>> tags
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            early_stopping=False
        )

    # Drop the prompt at the token level instead of string-replacing the decoded
    # prompt: decoding is not guaranteed to reproduce the prompt text exactly,
    # and a string replace would also delete any later repetition of the prompt
    # inside the completion.
    completion_ids = outputs[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# -----------------------------
# File I/O Functions
# -----------------------------
def save_predictions_to_file(predictions, filename):
    """Write the prediction records to ``filename`` as JSONL (one JSON object per line).

    Non-ASCII characters are written as-is (UTF-8) rather than escaped. An
    existing file at ``filename`` is overwritten.
    """
    print(f"💾 Saving {len(predictions)} predictions to {filename}...")
    # Serialize lazily so records are written one at a time.
    serialized = (json.dumps(record, ensure_ascii=False) + '\n' for record in predictions)
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(serialized)
    print(f"✅ Saved predictions to {filename}")


# -----------------------------
# Generation Phase
# -----------------------------
def generate_and_save_outputs(model, subset, model_name, output_filename, max_samples=100):
    """Generate a prediction for each of the first samples and save them all as JSONL.

    Args:
        model: Model passed through to ``generate_response``.
        subset: Indexable collection of rows with ``'instruction'`` and
            ``'output'`` keys; must support ``len()``.
        model_name: Label used in log messages and the progress bar.
        output_filename: Path of the JSONL file to write.
        max_samples: Maximum number of rows to process (capped at ``len(subset)``).

    Returns:
        ``output_filename``.

    Each record holds ``sample_index``, ``instruction``, ``reference``,
    ``prediction`` and ``status``. A failing sample is recorded with
    ``status == "error"`` plus an ``error`` message, and generation continues
    with the next sample.
    """
    sample_size = min(max_samples, len(subset))
    print(f"\n🚀 Generating outputs for {model_name}...")
    print(f"📊 Processing {sample_size} samples...")

    predictions = []

    for i in tqdm(range(sample_size), desc=f"Generating {model_name}", unit="samples"):
        # Defaults used in the error record when the row itself cannot be read.
        instruction, reference = "", ""
        try:
            # Fetch the row once. The error handler must not index `subset`
            # again: if reading the row is what failed, doing so would raise
            # inside the handler and abort the whole run before saving.
            sample = subset[i]
            instruction = sample['instruction']
            reference = sample['output']

            prediction = generate_response(model, instruction)
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            print(f"\n❌ Error at sample {i}: {e}")
            predictions.append({
                "sample_index": i,
                "instruction": instruction,
                "reference": reference,
                "prediction": "",
                "status": "error",
                "error": str(e)
            })
            continue

        predictions.append({
            "sample_index": i,
            "instruction": instruction,
            "reference": reference,
            "prediction": prediction,
            "status": "success"
        })

    n_success = sum(1 for p in predictions if p['status'] == 'success')
    print(f"\n✅ Generated {n_success} successful predictions")

    # Save to file
    save_predictions_to_file(predictions, output_filename)
    return output_filename



# -----------------------------
# Pipeline Function
# -----------------------------
def run_generation_pipeline(model, subset, model_name, max_samples=100):
    """Generate predictions for ``subset`` and return the JSONL path they were saved to.

    The file name is derived from ``model_name``: lowercased, with every space
    replaced by an underscore, followed by ``_predictions.jsonl``.
    """
    slug = '_'.join(model_name.lower().split(' '))
    output_filename = f"{slug}_predictions.jsonl"
    generate_and_save_outputs(model, subset, model_name, output_filename, max_samples)
    return output_filename


# Run generation over the whole evaluation set with the finetuned model.
finetuned_pred_file = run_generation_pipeline(
    finetuned_model,
    eval_dataset,
    "FINETUNED MODEL",
    max_samples=len(eval_dataset),
)

# Offer the predictions file for download.
if not os.path.exists(finetuned_pred_file):
    print(f"File not found: {finetuned_pred_file}")
else:
    try:
        # Google Colab: trigger a browser download.
        from google.colab import files
        files.download(finetuned_pred_file)
    except ImportError:
        # Jupyter / local notebooks: show a clickable link instead.
        from IPython.display import FileLink, display
        display(FileLink(finetuned_pred_file))
        print(f"Click the link above to download: {finetuned_pred_file}")



In [None]:
!pip install code-bert-score
from tqdm import tqdm



In [None]:
# eval_pipeline.py
import os
import re
import json
import ast
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
import evaluate
from typing import Set
from code_bert_score import score



def load_predictions_from_file(filename):
    """Load prediction records from a JSONL file.

    Args:
        filename: Path to a UTF-8 file with one JSON value per line.

    Returns:
        List of the parsed records, in file order. Blank lines are skipped
        silently. A line that is not valid JSON is reported with a warning
        (including its 1-based line number) and skipped.
    """
    print(f"📂 Loading predictions from {filename}...")
    predictions = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            payload = line.strip()
            if not payload:
                # Empty or whitespace-only lines (e.g. a trailing newline) are
                # not data errors, so they should not trigger a warning.
                continue
            try:
                predictions.append(json.loads(payload))
            except json.JSONDecodeError as e:
                print(f"⚠️  Warning: Failed to parse line {line_num}: {e}")
    print(f"✅ Loaded {len(predictions)} predictions")
    return predictions

def codebert_similarity(code1: str, code2: str, lang: str = "python") -> float:
    """Score how similar a generated snippet is to a reference with CodeBERTScore.

    Args:
        code1: Generated (candidate) code.
        code2: Reference code.
        lang: Programming language passed through to the scorer.

    Returns:
        The F1 component of the score as a float, or ``0.0`` when either
        snippet is missing, empty, or whitespace-only (the scorer is not
        called in that case).
    """
    for snippet in (code1, code2):
        if not snippet or not snippet.strip():
            return 0.0

    # The scorer returns four values per pair; only F1 is used here.
    _precision, _recall, f1, _f3 = score(
        cands=[code1],
        refs=[code2],
        lang=lang,
        model_type="microsoft/codebert-base"
    )
    return float(f1[0])



def code_jaccard_similarity(code1: str, code2: str) -> float:
    """Jaccard similarity between heuristic token sets of two code snippets.

    Each snippet is turned into a set of tokens by ``advanced_tokenize_code``
    (quoted strings, hub-style ids, imports, calls, assignments, plain words,
    punctuation, ...). The score is ``|A ∩ B| / |A ∪ B|``.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when both token sets are empty.
    """

    def advanced_tokenize_code(code: str) -> Set[str]:
        """Extract the token set for one snippet (empty set for blank input)."""
        if not code or not code.strip():
            return set()

        tokens = set()
        code_clean = code.strip().lower()

        # 1. Extract model names and paths (crucial for Gorilla)
        model_patterns = [
            r"'([^']+)'",  # Single quotes: 'microsoft/git-large-coco'
            r'"([^"]+)"',  # Double quotes: "microsoft/git-large-coco"
            r'([a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+)',  # Hub format: microsoft/git-large-coco
        ]

        for pattern in model_patterns:
            matches = re.findall(pattern, code_clean)
            tokens.update(matches)

        # 2. Extract API function names
        api_functions = re.findall(r'(\w+\.from_pretrained)', code_clean)
        tokens.update(api_functions)

        # 3. Extract class names (needs the original casing, then lowercased)
        class_names = re.findall(r'([A-Z][a-zA-Z0-9]*(?:[A-Z][a-zA-Z0-9]*)*)', code)
        tokens.update([name.lower() for name in class_names])

        # 4. Extract import statements
        import_matches = re.findall(r'from\s+(\S+)\s+import\s+(.+)', code_clean)
        for module, imports in import_matches:
            tokens.add(f"from_{module}")
            for imp in imports.split(','):
                tokens.add(f"import_{imp.strip()}")

        # 5. Standard imports
        simple_imports = re.findall(r'import\s+(\S+)', code_clean)
        tokens.update([f"import_{imp}" for imp in simple_imports])

        # 6. Function calls with parameters
        func_calls = re.findall(r'(\w+)\s*\(', code_clean)
        tokens.update([f"call_{func}" for func in func_calls])

        # 7. Pipeline tasks
        pipeline_tasks = re.findall(r"pipeline\s*\(\s*['\"]([^'\"]+)['\"]", code_clean)
        tokens.update([f"task_{task}" for task in pipeline_tasks])

        # 8. Variable assignments
        variables = re.findall(r'(\w+)\s*=', code_clean)
        tokens.update([f"var_{var}" for var in variables if len(var) > 1])

        # 9. Key method calls
        method_calls = re.findall(r'\.(\w+)\(', code_clean)
        tokens.update([f"method_{method}" for method in method_calls])

        # 10. Framework-specific patterns
        # Transformers patterns
        transformers_patterns = [
            r'(AutoModel|AutoTokenizer|pipeline)',
            r'(transformers|torch|numpy|PIL)',
            r'(return_tensors|pt|tf)',
            r'(pretrained|finetuned)'
        ]

        for pattern in transformers_patterns:
            # The text being searched is already lowercased, so the mixed-case
            # alternatives (AutoModel, AutoTokenizer, PIL) can only match when
            # the search ignores case.
            matches = re.findall(pattern, code_clean, flags=re.IGNORECASE)
            tokens.update(matches)

        # 11. Standard word tokenization for remaining content
        words = re.findall(r'\w+', code_clean)
        tokens.update([word for word in words if len(word) > 2])

        # 12. Operators and special symbols
        operators = re.findall(r'[^\w\s]', code)
        tokens.update(operators)

        return tokens

    # Tokenize both code snippets
    tokens1 = advanced_tokenize_code(code1)
    tokens2 = advanced_tokenize_code(code2)

    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)

    # Return a float in every case, as the annotation promises.
    return len(intersection) / len(union) if union else 0.0


# -----------------------------
# Extraction helpers
# -----------------------------
# Matches a section header such as `<<<domain>>>:`. Whitespace inside the
# angle brackets and the trailing colon are optional; case is ignored.
TAG_RE = re.compile(r'<<<\s*(?P<tag>\w+)\s*>>>\s*:?', re.IGNORECASE)

def extract_all_components(text):
    """Split ``text`` into its ``<<<tag>>>`` sections.

    Returns a dict mapping each lowercased tag name to the stripped text between
    that header and the next header (or the end of ``text``). When a tag occurs
    more than once, the last occurrence wins. Text before the first header is
    ignored.
    """
    headers = list(TAG_RE.finditer(text))
    # Each section ends where the next header starts; the last runs to the end.
    section_ends = [nxt.start() for nxt in headers[1:]] + [len(text)]
    return {
        header.group("tag").lower(): text[header.end():stop].strip()
        for header, stop in zip(headers, section_ends)
    }

def extract_component(text, comp):
    """Return the content of the ``<<<comp>>>`` section of ``text`` ("" if absent).

    The tag name is matched case-insensitively.
    """
    wanted = comp.lower()
    return extract_all_components(text).get(wanted, "")

# -----------------------------
# Code utilities
# -----------------------------
def normalize_code_string(s):
    """Weakly normalize code: trim it and collapse every whitespace run to one space."""
    # str.split() with no separator already discards leading/trailing whitespace.
    pieces = s.split()
    return " ".join(pieces)

def code_validity_check(code_str):
    """Return 1 if ``code_str`` is non-empty Python that parses into an AST, else 0.

    Empty or whitespace-only input returns 0. ``ast.parse("")`` succeeds, so
    without this guard a prediction with no code section at all would be
    counted as valid code and inflate the validity metric.
    """
    if not code_str or not code_str.strip():
        return 0
    try:
        ast.parse(code_str)
        return 1
    except Exception:
        # Any parse failure counts as invalid code.
        return 0

# -----------------------------
# Evaluation Phase
# -----------------------------
def evaluate_outputs(predictions, references):
    """Score predicted ``<<<tag>>>`` responses against their references.

    For every pair, both texts are split with ``extract_all_components``. A
    metric is computed for a sample only when the reference contains the
    corresponding section:

    * explanation: ROUGE-L and BERTScore F1
    * code: token Jaccard, CodeBERTScore F1 and AST validity of the prediction
    * domain / api_call / api_provider: case-insensitive exact match

    The model-backed metrics (BERTScore, CodeBERTScore) are computed in one
    batched call each after the loop. Calling them once per sample reloads or
    re-enters the scorer hundreds of times, which is slow and, for the CodeBERT
    scorer, triggers a Hub request per call.

    Args:
        predictions: List of predicted response strings.
        references: List of reference response strings, same length.

    Returns:
        Dict of metric name -> mean value as a percentage (0-100). A metric
        is omitted when no sample contributed to it.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) "
            "must have the same length"
        )

    structured_fields = ("domain", "api_call", "api_provider")

    rougeL_scores = []
    code_jaccard_scores = []
    code_validities = []
    component_hits = {field: [] for field in structured_fields}
    explanation_pairs = []  # (predicted, reference) explanations for BERTScore
    code_pairs = []         # (predicted, reference) code for CodeBERTScore

    # Process each prediction-reference pair
    for pred, ref in tqdm(zip(predictions, references), total=len(predictions)):
        pred_comps = extract_all_components(pred)
        ref_comps = extract_all_components(ref)

        # Explanation: ROUGE-L now, BERTScore batched after the loop
        ref_expl = ref_comps.get("explanation", "").strip()
        if ref_expl:
            pred_expl = pred_comps.get("explanation", "").strip()
            rouge_res = rouge.compute(predictions=[pred_expl], references=[ref_expl])
            rougeL_scores.append(rouge_res.get("rougeL", 0.0))
            explanation_pairs.append((pred_expl, ref_expl))

        # Code: Jaccard and validity now, CodeBERTScore batched after the loop
        ref_code = ref_comps.get("code", "").strip()
        if ref_code:
            pred_code = pred_comps.get("code", "").strip()
            code_jaccard_scores.append(code_jaccard_similarity(pred_code, ref_code))
            code_validities.append(code_validity_check(pred_code))
            code_pairs.append((pred_code, ref_code))

        # Structured fields exact matching (case-insensitive)
        for field in structured_fields:
            ref_val = ref_comps.get(field, "").strip()
            if ref_val:
                pred_val = pred_comps.get(field, "").strip()
                component_hits[field].append(1 if pred_val.lower() == ref_val.lower() else 0)

    # BERTScore for all explanations in a single call
    bert_f1_scores = []
    if explanation_pairs:
        bert_res = bertscore.compute(
            predictions=[p for p, _ in explanation_pairs],
            references=[r for _, r in explanation_pairs],
            lang="en",
        )
        bert_f1_scores = list(bert_res["f1"])

    # CodeBERTScore for all code pairs in a single call. A missing predicted
    # code section scores 0.0 without being sent to the scorer, matching
    # codebert_similarity's handling of empty input.
    codebert_scores = [0.0] * len(code_pairs)
    scorable = [i for i, (pred_code, _) in enumerate(code_pairs) if pred_code]
    if scorable:
        _, _, f1_values, _ = score(
            cands=[code_pairs[i][0] for i in scorable],
            refs=[code_pairs[i][1] for i in scorable],
            lang="python",
            model_type="microsoft/codebert-base",
        )
        for i, value in zip(scorable, f1_values):
            codebert_scores[i] = float(value)

    # Aggregate results
    summary = {}
    if rougeL_scores:
        summary["avg_rougeL"] = float(np.mean(rougeL_scores)) * 100
    if bert_f1_scores:
        summary["avg_bertscore_f1"] = float(np.mean(bert_f1_scores)) * 100
    if code_jaccard_scores:
        summary["code_jaccard_similarity"] = float(np.mean(code_jaccard_scores)) * 100
    if codebert_scores:
        summary["codebert_similarity"] = float(np.mean(codebert_scores)) * 100
    if code_validities:
        summary["code_validity"] = float(np.mean(code_validities)) * 100

    for field in structured_fields:
        vals = component_hits[field]
        if vals:
            summary[f"{field}_accuracy"] = float(np.mean(vals)) * 100

    return summary

def evaluate_from_saved_outputs(prediction_filename, model_name):
    """Evaluate a saved predictions file and print the resulting metrics.

    Only records whose ``status`` is ``'success'`` are evaluated. Returns the
    metrics dict from ``evaluate_outputs``, or ``{}`` when the file contains no
    successful prediction.
    """
    print(f"\n🔍 Evaluating {model_name} from saved outputs...")

    # Read every record, then keep only the ones generated without error.
    records = load_predictions_from_file(prediction_filename)
    usable = list(filter(lambda rec: rec.get('status') == 'success', records))
    print(f"📊 Evaluating {len(usable)} successful predictions...")

    if len(usable) == 0:
        print("❌ No successful predictions to evaluate!")
        return {}

    # Parallel lists of model outputs and ground-truth outputs.
    predictions = [rec['prediction'] for rec in usable]
    references = [rec['reference'] for rec in usable]

    metrics = evaluate_outputs(predictions, references)

    # Report one line per metric.
    print(f"\n📊 {model_name} Results:")
    print("=" * 50)
    for name, value in metrics.items():
        print(f"{name:25}: {value:6.2f}%")

    return metrics

def run_evaluation_pipeline(prediction_filename, model_name):
    """Evaluate a saved predictions file; thin wrapper over ``evaluate_from_saved_outputs``."""
    metrics = evaluate_from_saved_outputs(prediction_filename, model_name)
    return metrics



# To evaluate a predictions file from an earlier session instead of re-running
# generation, uncomment the next line and replace the placeholder with the
# real path. Otherwise `finetuned_pred_file` must already be defined by the
# generation cell.
# finetuned_pred_file = "./finetuned_model_predictions.jsonl(your file name)"

# Evaluate from saved file and keep the metrics dict for the export cell.
finetuned_metrics = run_evaluation_pipeline(finetuned_pred_file, "FINETUNED MODEL")


🔍 Evaluating FINETUNED MODEL from saved outputs...
📂 Loading predictions from ./finetuned_model_predictions.jsonl...
✅ Loaded 911 predictions
📊 Evaluating 911 successful predictions...


  0%|          | 0/911 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  0%|          | 1/911 [00:47<11:59:47, 47.46s/it]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  4%|▍         | 36/911 [01:41<30:47,  2.11s/it]HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/microsoft/codebert-base/resolve/main/tokenizer_config.json
HTTP Error 429 thrown while requesting HEAD https://hugg


📊 FINETUNED MODEL Results:
avg_rougeL               :  37.05%
avg_bertscore_f1         :  90.56%
code_jaccard_similarity  :  52.09%
codebert_similarity      :  88.71%
code_validity            :  61.22%
domain_accuracy          :  97.21%
api_call_accuracy        :  85.59%
api_provider_accuracy    :  97.32%





In [None]:
import json

# Persist the aggregated metrics as pretty-printed JSON (4-space indent).
filename = "finetuned_metrics.json"

with open(filename, 'w') as metrics_file:
    metrics_file.write(json.dumps(finetuned_metrics, indent=4))

print(f"Metrics saved to {filename}")

Metrics saved to finetuned_metrics.json
