In [1]:
import torch
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from tree_parser import repo_scan_parser, build_mamba_prompt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration
MODEL_ID = "Zyphra/Zamba2-2.7B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
print(f"‚öôÔ∏è  Loading Zamba2 (Hybrid Mamba+Attention) on {DEVICE}...")

# Set logging to see download progress
logging.basicConfig(level=logging.INFO)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map=DEVICE,
        dtype=torch.bfloat16,       # Changed from 'torch_dtype' to 'dtype'
        trust_remote_code=True
    )
    print("‚úÖ Zamba2 Loaded Successfully!")
except ImportError:
    print("‚ùå Error: Missing 'mamba-ssm' or 'causal-conv1d'.")
except ValueError as e:
    print(f"‚ùå Configuration Error: {e}")

‚öôÔ∏è  Loading Zamba2 (Hybrid Mamba+Attention) on cuda...


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.21s/it]


‚úÖ Zamba2 Loaded Successfully!


In [4]:
print("\nüöÄ Scanning Repository...")
scanned_results = repo_scan_parser()
full_prompt = build_mamba_prompt(scanned_results)


üöÄ Scanning Repository...
üîπ Authenticate with GitHub...
üîπ Fetching file list from branch: V2.1...
Found 15 files in the repository.

‚è≠Ô∏è Skipping unsupported file: .env
‚è≠Ô∏è Skipping unsupported file: .gitignore
‚úÖ Loaded grammar: CPP
üîπ Processing file (3/15): HPC/heston_gpu.cu
‚úÖ Loaded grammar: PYTHON
üîπ Processing file (4/15): HPC/heston_server.py
üîπ Processing file (5/15): HPC/stock_calibration.py
üîπ Processing file (6/15): ML/generate_training_data.py
üîπ Processing file (7/15): ML/improved_parameter_variation.py
üîπ Processing file (8/15): ML/ml_data_loader.py
üîπ Processing file (9/15): ML/ml_model.py
‚è≠Ô∏è Skipping unsupported file: ML/mlp.ipynb
‚è≠Ô∏è Skipping unsupported file: README.md
‚è≠Ô∏è Skipping known massive vendor file: include/json.hpp
‚è≠Ô∏è Skipping unsupported file: requirements.txt
‚è≠Ô∏è Skipping unsupported file: training_data/metadata.json
‚è≠Ô∏è Skipping unsupported file: training_data/normalization_stats.json

‚úÖ Scan Complete.

In [5]:
# 4. Tokenize & Generate
print(f"\nüß† Processing {len(full_prompt)} characters...")
input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(DEVICE)
    
print("üïµÔ∏è‚Äç‚ôÇÔ∏è WCA Auditor is thinking...")
start_time = time.time()


üß† Processing 93834 characters...
üïµÔ∏è‚Äç‚ôÇÔ∏è WCA Auditor is thinking...


In [6]:
with torch.inference_mode():
    outputs = model.generate(
        input_ids, 
        max_new_tokens=1024, 
        do_sample=True,
        temperature=0.4,
        use_cache=True 
    )

This is a friendly reminder - the current text generation call has exceeded the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
print("\n" + "="*50)
print("üõ°Ô∏è  AUDIT REPORT")
print("="*50)
# Decode only the NEW tokens
print(tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True))
print(f"\n‚è±Ô∏è  Time: {time.time() - start_time:.2f}s")