In [None]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import time
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import ast
import json
import os
import subprocess
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification, make_blobs
from datasets import load_dataset, Dataset
import torch.nn.utils.prune as prune
from torch.quantization import quantize_dynamic
import sklearn
import types
import tempfile

CPU_MODE = False
TEST_SAMPLE_SIZE = 15
OPTIMIZATION_TECHNIQUES = [
    "Base Model",
    "Pruning",
    "Quantization",
    "Weight Sharing"
]

tokenizer = AutoTokenizer.from_pretrained("./final-model")

def load_test_datasets():
    test_prompts = []
    try:
        scidocs_path = "scidocs_data"
        os.makedirs(scidocs_path, exist_ok=True)
        if not os.path.exists(os.path.join(scidocs_path, "paper_metadata_view_cite_read.json")):
            subprocess.run([
                "aws", "s3", "sync", "--no-sign-request",
                "s3://ai2-s2-research-public/specter/scidocs/",
                scidocs_path, "--region", "us-west-2", "--quiet"
            ], check=True)
        with open(os.path.join(scidocs_path, "paper_metadata_view_cite_read.json"), "r") as f:
            scidocs_data = json.load(f)
        for i, (paper_id, content) in enumerate(scidocs_data.items()):
            if i >= 5:
                break
            title = content.get('title', '') or ''
            abstract = content.get('abstract', '') or ''
            if len(title) > 10 and len(abstract) > 200:
                test_prompts.append({
                    "text": (
                        f"Generate a complete, self-contained Python code for text classification. "
                        f"Title: {title}\n"
                        f"Abstract: {abstract[:400]}\n"
                        "Create a synthetic dataset based on the abstract and implement a classification model."
                    ),
                    "source": "scidocs",
                    "type": "classification"
                })
    except Exception as e:
        print(f"SciDocs loading failed: {str(e)}")
    try:
        astronomy = load_dataset("David-Xu/astronomy-stack-dpo-text", split="train")
        for i, example in enumerate(astronomy):
            if i >= 5:
                break
            test_prompts.append({
                "text": (
                    "Generate a complete, self-contained Python code to solve this astronomy problem:\n"
                    f"{example['prompt']}\n"
                    "Create any necessary synthetic data and implement a solution."
                ),
                "source": "astronomy",
                "type": "problem_solving"
            })
    except Exception as e:
        print(f"Astronomy dataset loading failed: {str(e)}")
    try:
        science = load_dataset("millawell/wikipedia_field_of_science", split="train")
        for i, example in enumerate(science):
            if i >= 5:
                break
            test_prompts.append({
                "text": (
                    "Generate a complete, self-contained Python code for scientific text classification:\n"
                    f"Text: {example['text']}\n"
                    "Create a synthetic dataset and implement a classification model."
                ),
                "source": "wikipedia_science",
                "type": "classification"
            })
    except Exception as e:
        print(f"Science dataset loading failed: {str(e)}")
    for i in range(5):
        X, y = make_classification(
            n_samples=100, 
            n_features=4, 
            n_informative=2, 
            n_classes=2,
            random_state=i
        )
        data = pd.DataFrame(X, columns=[f"feature_{j}" for j in range(4)])
        data["target"] = y
        test_prompts.append({
            "text": "Create a RandomForest classifier and show accuracy",
            "data": data,
            "source": "synthetic",
            "type": "classification"
        })
        X, y = make_blobs(n_samples=100, centers=3, cluster_std=1.5, random_state=i)
        data = pd.DataFrame(X, columns=["x", "y"])
        test_prompts.append({
            "text": "Perform K-means clustering on this data",
            "data": data,
            "source": "synthetic",
            "type": "clustering"
        })
        X, _ = make_blobs(n_samples=100, centers=3, cluster_std=1.5, random_state=i)
        outliers = np.random.uniform(low=-10, high=10, size=(5, 2))
        X = np.vstack([X, outliers])
        data = pd.DataFrame(X, columns=["x", "y"])
        test_prompts.append({
            "text": "Detect anomalies using Isolation Forest",
            "data": data,
            "source": "synthetic",
            "type": "outlier_detection"
        })
    print(f"Created {len(test_prompts)} test prompts")
    return test_prompts

def apply_optimization(technique_name):
    try:
        model = AutoModelForCausalLM.from_pretrained("./final-model")
        if technique_name == "Pruning":
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear) and "lora" not in name.lower():
                    try:
                        prune.l1_unstructured(module, name='weight', amount=0.1)
                        prune.remove(module, 'weight')
                    except:
                        continue
            return model
        if technique_name == "Quantization":
            model = model.cpu()
            return quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
        if technique_name == "Weight Sharing":
            if hasattr(model, 'lm_head') and hasattr(model, 'model'):
                if hasattr(model.model, 'embed_tokens'):
                    try:
                        model.lm_head.weight = model.model.embed_tokens.weight
                    except:
                        pass
            return model
        return model
    except Exception as e:
        print(f"Error applying {technique_name}: {str(e)}")
        return None

def generate_robust_code(generator, prompt_text, task_type):
    if task_type == "classification":
        task_instructions = "Focus on classification using RandomForestClassifier. Create synthetic data if needed."
    elif task_type == "clustering":
        task_instructions = "Use KMeans clustering and visualize results with matplotlib."
    elif task_type == "outlier_detection":
        task_instructions = "Use IsolationForest for outlier detection. Highlight anomalies in visualization."
    elif task_type == "problem_solving":
        task_instructions = "Solve the problem using appropriate scientific computing techniques."
    else:
        task_instructions = "Solve the problem efficiently with appropriate algorithms."
    structured_prompt = f"""
Generate complete, self-contained Python code to solve this task:
{prompt_text}

Specific Requirements:
1. Create any necessary synthetic data if not provided
2. Use only numpy, pandas, sklearn and matplotlib
3. {task_instructions}
4. Create complete, runnable code
5. Print results clearly
6. For visualizations, use plt.savefig('output.png') instead of plt.show()
7. Ensure the code is syntactically correct

Code:
```python
"""
    try:
        output = generator(
            structured_prompt,
            temperature=0.1,
            max_new_tokens=700,
            truncation=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
        return output[0]['generated_text']
    except Exception as e:
        print(f"Generation error: {str(e)}")
        return ""

def validate_code(generated_code):
    if not generated_code:
        return ""
    if "```python" in generated_code:
        generated_code = generated_code.split("```python")[1].split("```")[0]
    elif "```" in generated_code:
        generated_code = generated_code.split("```")[1].split("```")[0]
    repairs = [
        (r"from sklearn\.\w+ import \*", ""),
        (r"fit\(\)", "fit(X_train, y_train)"),
        (r"predict\(\)", "predict(X_test)"),
        (r"plt\.show\(\)", "plt.savefig('output.png')"),
        (r"import matplotlib\.pyplot as plt", "import matplotlib.pyplot as plt\nplt.switch_backend('Agg')"),
        (r"\.to_csv\('data\.csv'\)", "")
    ]
    for pattern, replacement in repairs:
        generated_code = re.sub(pattern, replacement, generated_code)
    required_imports = [
        "import numpy as np",
        "import pandas as pd",
        "import matplotlib.pyplot as plt"
    ]
    for imp in required_imports:
        if imp not in generated_code:
            generated_code = imp + "\n" + generated_code
    if "from sklearn" not in generated_code and "import sklearn" not in generated_code:
        generated_code = "from sklearn.ensemble import RandomForestClassifier, IsolationForest\n" + \
                         "from sklearn.cluster import KMeans\n" + \
                         "from sklearn.model_selection import train_test_split\n" + \
                         "from sklearn.metrics import accuracy_score, classification_report\n" + generated_code
    if "pd.DataFrame" not in generated_code and "X =" not in generated_code:
        synthetic_data = "\n# Create synthetic data\nX = np.random.rand(100, 4)\ny = np.random.randint(0, 2, 100)\n"
        generated_code = generated_code.replace("import numpy as np", "import numpy as np" + synthetic_data, 1)
    return generated_code.strip()

def safe_execute(code: str, data=None):
    if not code:
        return {"status": "error", "message": "Empty code"}
    safe_env = {
        "__builtins__": {
            'print': print, 'range': range, 'len': len, 'str': str, 'int': int,
            'float': float, 'bool': bool, 'list': list, 'dict': dict, 'tuple': tuple,
            'set': set, 'min': min, 'max': max, 'sum': sum, 'abs': abs, 'round': round,
            'enumerate': enumerate, 'zip': zip, '__import__': __import__
        },
        "np": np,
        "pd": pd,
        "plt": plt,
        "sklearn": sklearn,
        "RandomForestClassifier": RandomForestClassifier,
        "IsolationForest": IsolationForest,
        "KMeans": KMeans,
        "train_test_split": train_test_split,
        "accuracy_score": accuracy_score,
        "classification_report": classification_report,
    }
    if data is not None:
        safe_env["data"] = data
    try:
        ast.parse(code)
        exec(code, safe_env)
        return {"status": "success"}
    except Exception as e:
        return {"status": "error", "message": f"{type(e).__name__}: {str(e)}"}

def measure_inference_performance(generator, prompt_text, num_runs=3):
    metrics = {
        "avg_latency": 0,
        "throughput": 0,
        "memory_usage": 0,
        "success_rate": 0
    }
    successes = 0
    latencies = []
    try:
        _ = generator(prompt_text, max_new_tokens=50, truncation=True)
        start_time = time.time()
        for _ in range(num_runs):
            try:
                run_start = time.time()
                output = generator(
                    prompt_text,
                    max_new_tokens=300,
                    truncation=True,
                    pad_token_id=tokenizer.eos_token_id
                )
                latencies.append(time.time() - run_start)
                successes += 1
            except Exception:
                continue
        metrics["avg_latency"] = np.mean(latencies) * 1000 if latencies else 0
        metrics["throughput"] = successes / max(0.001, time.time() - start_time)
        metrics["success_rate"] = successes / num_runs
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            metrics["memory_usage"] = torch.cuda.max_memory_allocated() / (1024 ** 2)
        else:
            metrics["memory_usage"] = 0
    except Exception as e:
        print(f"Performance measurement failed: {str(e)}")
    return metrics

def evaluate_code_quality(generated_code):
    if not generated_code:
        return {
            "has_imports": False,
            "has_model": False,
            "has_print": False,
            "is_runnable": False,
            "score": 0
        }
    has_imports = any(keyword in generated_code for keyword in
                      ["import numpy", "import pandas", "import sklearn"])
    has_model = any(keyword in generated_code for keyword in
                    ["RandomForest", "IsolationForest", "KMeans"])
    has_print = "print(" in generated_code
    has_visualization = "plt.savefig" in generated_code or "plt.plot" in generated_code
    has_data = "X =" in generated_code or "pd.DataFrame" in generated_code
    is_runnable = has_imports and has_model and has_print and has_data
    score = sum([has_imports, has_model, has_print, is_runnable, has_visualization]) / 5
    return {
        "has_imports": has_imports,
        "has_model": has_model,
        "has_print": has_print,
        "has_visualization": has_visualization,
        "has_data": has_data,
        "is_runnable": is_runnable,
        "score": score
    }

def run_smoke_tests(generator, test_prompts):
    print("\n" + "="*50)
    print("Running Enhanced Smoke Tests")
    print("="*50)
    smoke_prompts = []
    for prompt in test_prompts:
        if prompt["source"] in ["scidocs", "astronomy", "wikipedia_science"]:
            smoke_prompts.append(prompt)
            if len(smoke_prompts) >= 3:
                break
    for prompt in test_prompts:
        if prompt["source"] == "synthetic":
            smoke_prompts.append(prompt)
            break
    for i, item in enumerate(smoke_prompts):
        print(f"\nSmoke Test {i+1}: {item['text'][:100]}...")
        generated = generate_robust_code(generator, item["text"], item.get("type", ""))
        code = validate_code(generated)
        print("\nGenerated Code:")
        print(code[:1000] + "..." if len(code) > 1000 else code)
        data = item.get("data", None)
        exec_result = safe_execute(code, data)
        print("\nExecution Result:")
        print(exec_result)
        if exec_result["status"] == "error":
            print("\nFULL CODE WITH ERROR:")
            print(code)
        if os.path.exists("output.png"):
            print("Visualization created: output.png")
            os.remove("output.png")
        print("-"*50)

def run_test_pipeline():
    test_prompts = load_test_datasets()
    results = {}
    for technique in OPTIMIZATION_TECHNIQUES:
        print(f"\n{'='*40}")
        print(f"Testing: {technique}")
        print(f"{'='*40}")
        model = apply_optimization(technique)
        if model is None:
            print(f"Skipping {technique} due to initialization error")
            continue
        model.eval()
        device = 0 if torch.cuda.is_available() and not CPU_MODE else -1
        if technique == "Quantization":
            device = -1
        print(f"Using device: {'GPU' if device >= 0 else 'CPU'}")
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=device
        )
        if technique == "Base Model":
            run_smoke_tests(generator, test_prompts)
        tech_results = {
            "inference": {
                "avg_latency": 0,
                "throughput": 0,
                "memory_usage": 0,
                "success_rate": 0
            },
            "quality": {
                "syntax_errors": 0,
                "execution_errors": 0,
                "valid_count": 0,
                "quality_score": 0,
                "scores": []
            }
        }
        if test_prompts:
            perf_metrics = measure_inference_performance(generator, test_prompts[0]["text"])
            tech_results["inference"] = perf_metrics
        for item in tqdm(test_prompts, desc="Testing"):
            try:
                generated = generate_robust_code(generator, item["text"], item.get("type", ""))
                code = validate_code(generated)
                if not code:
                    tech_results["quality"]["syntax_errors"] += 1
                    tech_results["quality"]["scores"].append(0)
                    continue
                quality_metrics = evaluate_code_quality(code)
                quality_score = quality_metrics["score"]
                tech_results["quality"]["scores"].append(quality_score)
                if not quality_metrics["is_runnable"]:
                    tech_results["quality"]["syntax_errors"] += 1
                    continue
                tech_results["quality"]["valid_count"] += 1
                data = item.get("data", None)
                if data is not None:
                    exec_result = safe_execute(code, data)
                    if exec_result["status"] == "error":
                        tech_results["quality"]["execution_errors"] += 1
            except Exception as e:
                tech_results["quality"]["syntax_errors"] += 1
                tech_results["quality"]["scores"].append(0)
                print(f"Error during testing: {str(e)}")
        if tech_results["quality"]["scores"]:
            tech_results["quality"]["quality_score"] = np.mean(tech_results["quality"]["scores"])
        else:
            tech_results["quality"]["quality_score"] = 0
        results[technique] = tech_results
    return results

def present_results(results):
    table_data = []
    for tech, metrics in results.items():
        inf = metrics["inference"]
        qual = metrics["quality"]
        valid_count = qual["valid_count"]
        total_tests = len(qual["scores"]) if "scores" in qual else TEST_SAMPLE_SIZE
        table_data.append({
            "Technique": tech,
            "Latency (ms)": f"{inf['avg_latency']:.2f}",
            "Throughput (samples/s)": f"{inf['throughput']:.2f}",
            "Memory (MB)": f"{inf['memory_usage']:.1f}",
            "Inference Success (%)": f"{inf['success_rate'] * 100:.1f}",
            "Valid Code (%)": f"{valid_count / total_tests * 100:.1f}" if total_tests > 0 else "N/A",
            "Execution Success (%)": f"{(1 - qual['execution_errors'] / max(1, valid_count)) * 100:.1f}" if valid_count > 0 else "N/A",
            "Quality Score": f"{qual['quality_score'] * 100:.1f}"
        })
    results_df = pd.DataFrame(table_data)
    print("\n" + "="*80)
    print("Optimization Results")
    print("="*80)
    print(results_df.to_string(index=False))
    results_df.to_csv("optimization_results.csv", index=False)
    print("\nResults saved to optimization_results.csv")
    if not results_df.empty:
        fig, ax = plt.subplots(2, 2, figsize=(15, 12))
        results_df["Latency Value"] = results_df["Latency (ms)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Valid Code Value"] = results_df["Valid Code (%)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Exec Success Value"] = results_df["Execution Success (%)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Quality Value"] = results_df["Quality Score"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df.plot.bar(x="Technique", y="Latency Value", ax=ax[0, 0], legend=False, color='skyblue')
        ax[0, 0].set_title('Inference Latency')
        ax[0, 0].set_ylabel('Milliseconds')
        results_df.plot.bar(x="Technique", y="Valid Code Value", ax=ax[0, 1], legend=False, color='lightgreen')
        ax[0, 1].set_title('Valid Code Rate')
        ax[0, 1].set_ylabel('Percentage')
        results_df.plot.bar(x="Technique", y="Exec Success Value", ax=ax[1, 0], legend=False, color='salmon')
        ax[1, 0].set_title('Execution Success Rate')
        ax[1, 0].set_ylabel('Percentage')
        results_df.plot.bar(x="Technique", y="Quality Value", ax=ax[1, 1], legend=False, color='purple')
        ax[1, 1].set_title('Code Quality Score')
        ax[1, 1].set_ylabel('Score (0-100)')
        plt.tight_layout()
        plt.savefig('optimization_results.png', dpi=150)
        print("Visualization saved to optimization_results.png")
        plt.close()
    return results_df

if __name__ == "__main__":
    print(f"Starting optimization comparison with {TEST_SAMPLE_SIZE} test prompts")
    torch.manual_seed(42)
    np.random.seed(42)
    results = run_test_pipeline()
    results_df = present_results(results)
    print("\nTest pipeline completed successfully!")

Starting optimization comparison with 15 test prompts
Created 30 test prompts

Testing: Base Model


Device set to use cuda:0


Using device: GPU


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Running Enhanced Smoke Tests

Smoke Test 1: Generate a complete, self-contained Python code for text classification. Title: Pelvic Girdle Pain d...

Generated Code:
import numpy as np
# Create synthetic data
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, 100)

import pandas as pd
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data
df = pd.read_csv('data/pelvic-girdle-pain-during-or-after-pregnancy-a-review-of-recent-evidence-and-a-clinical-care-path-proposal.csv')

# Split data
X, y = df.iloc[:, 0:2], df.iloc[:, 2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print results
print('Accuracy:', accuracy_score(y_test, y_

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
Testing: 100%|███████████████████████████████████████████████████████████████████████| 30/30 [09:24<00:00, 18.81s/it]



Testing: Pruning


Device set to use cuda:0


Using device: GPU


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
Testing:  73%|████████████████████████████████████████████████████                   | 22/30 [07:10<02:32, 19.09s/it]

Accuracy: 0.1


Testing:  77%|██████████████████████████████████████████████████████▍                | 23/30 [07:30<02:15, 19.33s/it]

Mean cluster 1: [0.50670111 0.50224629 0.48357157 0.51931862 0.45603444 0.51805392
 0.50348485 0.48355575 0.48547901 0.48230391 0.46579627 0.48085043
 0.49917958 0.51608233 0.55675878 0.50889784 0.5027395  0.46246684
 0.44249029 0.49109996 0.49078    0.51803542 0.4868742  0.44413671
 0.50630479 0.49034262 0.5045502  0.50168443 0.50287687 0.52920788
 0.48666926 0.49255272 0.44847509 0.50480451 0.5061533  0.50834318
 0.4267994  0.48945894 0.50101083 0.51535873 0.54997231 0.44782594
 0.51630592 0.48719913 0.57823812 0.56294441 0.5700564  0.49642837
 0.47771048 0.4561513  0.46771844 0.51569986 0.42146168 0.44552444
 0.47287754 0.4914742  0.49052169 0.50562655 0.46091732 0.49718929
 0.48769634 0.52167781 0.4951209  0.51149464 0.5253621  0.52285975
 0.48688125 0.43813436 0.43751669 0.50264748 0.46520519 0.50027994
 0.49867802 0.50738294 0.47547913 0.48042824 0.48041331 0.48378078
 0.48624253 0.47321851 0.54737698 0.55218129 0.50166451 0.55426822
 0.4995175  0.55512668 0.4988276  0.44646204 0

  return fit_method(estimator, *args, **kwargs)
Testing: 100%|███████████████████████████████████████████████████████████████████████| 30/30 [09:42<00:00, 19.41s/it]



Testing: Quantization


Device set to use cpu


Using device: CPU
Performance measurement failed: 'function' object has no attribute 'dtype'


Testing:   7%|████▊                                                                   | 2/30 [00:00<00:05,  5.24it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  13%|█████████▌                                                              | 4/30 [00:00<00:05,  5.14it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  20%|██████████████▍                                                         | 6/30 [00:01<00:04,  5.10it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  27%|███████████████████▏                                                    | 8/30 [00:01<00:04,  5.10it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  30%|█████████████████████▌                                                  | 9/30 [00:01<00:04,  5.07it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  37%|██████████████████████████                                             | 11/30 [00:02<00:04,  4.68it/s]

Generation error: 'function' object has no attribute 'dtype'


Testing:  43%|██████████████████████████████▊                                        | 13/30 [00:02<00:03,  4.85it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  47%|█████████████████████████████████▏                                     | 14/30 [00:02<00:03,  4.53it/s]

Generation error: 'function' object has no attribute 'dtype'


Testing:  50%|███████████████████████████████████▌                                   | 15/30 [00:03<00:03,  4.65it/s]

Generation error: 'function' object has no attribute 'dtype'


Testing:  57%|████████████████████████████████████████▏                              | 17/30 [00:03<00:02,  4.80it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  63%|████████████████████████████████████████████▉                          | 19/30 [00:03<00:02,  4.91it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  70%|█████████████████████████████████████████████████▋                     | 21/30 [00:04<00:01,  5.01it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  77%|██████████████████████████████████████████████████████▍                | 23/30 [00:04<00:01,  5.00it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  83%|███████████████████████████████████████████████████████████▏           | 25/30 [00:05<00:00,  5.05it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  87%|█████████████████████████████████████████████████████████████▌         | 26/30 [00:05<00:00,  5.10it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing:  97%|████████████████████████████████████████████████████████████████████▋  | 29/30 [00:05<00:00,  5.11it/s]

Generation error: 'function' object has no attribute 'dtype'
Generation error: 'function' object has no attribute 'dtype'


Testing: 100%|███████████████████████████████████████████████████████████████████████| 30/30 [00:06<00:00,  4.97it/s]


Generation error: 'function' object has no attribute 'dtype'

Testing: Weight Sharing


Device set to use cuda:0


Using device: GPU


Testing: 100%|███████████████████████████████████████████████████████████████████████| 30/30 [09:31<00:00, 19.04s/it]



Optimization Results
     Technique Latency (ms) Throughput (samples/s) Memory (MB) Inference Success (%) Valid Code (%) Execution Success (%) Quality Score
    Base Model      8030.85                   0.12      8422.1                 100.0           53.3                  43.8          72.7
       Pruning      8068.14                   0.12      8422.1                 100.0           50.0                  46.7          68.0
  Quantization         0.00                   0.00         0.0                   0.0            0.0                   N/A           0.0
Weight Sharing      8204.35                   0.12      8172.1                 100.0            0.0                   N/A          40.0

Results saved to optimization_results.csv
Visualization saved to optimization_results.png

Test pipeline completed successfully!


In [None]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import time
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import ast
import json
import os
import subprocess
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification, make_blobs
from datasets import load_dataset, Dataset
import torch.nn.utils.prune as prune
import sklearn
import datetime

CPU_MODE = False
TEST_SAMPLE_SIZE = 100
OPTIMIZATION_TECHNIQUES = ["Base Model", "Pruning"]

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = f"optimization_results_{timestamp}"
os.makedirs(results_dir, exist_ok=True)
print(f"All results will be saved in: {results_dir}")

tokenizer = AutoTokenizer.from_pretrained("./final-model")

def load_test_datasets():
    test_prompts = []
    try:
        scidocs_path = "scidocs_data"
        os.makedirs(scidocs_path, exist_ok=True)
        if not os.path.exists(os.path.join(scidocs_path, "paper_metadata_view_cite_read.json")):
            subprocess.run([
                "aws", "s3", "sync", "--no-sign-request",
                "s3://ai2-s2-research-public/specter/scidocs/",
                scidocs_path, "--region", "us-west-2", "--quiet"
            ], check=True)
        with open(os.path.join(scidocs_path, "paper_metadata_view_cite_read.json"), "r") as f:
            scidocs_data = json.load(f)
        for i, (paper_id, content) in enumerate(scidocs_data.items()):
            if i >= 30:
                break
            title = content.get('title', '') or ''
            abstract = content.get('abstract', '') or ''
            if len(title) > 10 and len(abstract) > 200:
                test_prompts.append({
                    "text": (
                        f"Generate a complete, self-contained Python code for text classification. "
                        f"Title: {title}\n"
                        f"Abstract: {abstract[:400]}\n"
                        "Create a synthetic dataset based on the abstract and implement a classification model."
                    ),
                    "source": "scidocs",
                    "type": "classification"
                })
    except Exception as e:
        print(f"SciDocs loading failed: {str(e)}")
    try:
        astronomy = load_dataset("David-Xu/astronomy-stack-dpo-text", split="train")
        for i, example in enumerate(astronomy):
            if i >= 30:
                break
            test_prompts.append({
                "text": (
                    "Generate a complete, self-contained Python code to solve this astronomy problem:\n"
                    f"{example['prompt']}\n"
                    "Create any necessary synthetic data and implement a solution."
                ),
                "source": "astronomy",
                "type": "problem_solving"
            })
    except Exception as e:
        print(f"Astronomy dataset loading failed: {str(e)}")
    try:
        science = load_dataset("millawell/wikipedia_field_of_science", split="train")
        for i, example in enumerate(science):
            if i >= 30:
                break
            test_prompts.append({
                "text": (
                    "Generate a complete, self-contained Python code for scientific text classification:\n"
                    f"Text: {example['text']}\n"
                    "Create a synthetic dataset and implement a classification model."
                ),
                "source": "wikipedia_science",
                "type": "classification"
            })
    except Exception as e:
        print(f"Science dataset loading failed: {str(e)}")
    for i in range(20):
        X, y = make_classification(
            n_samples=100, 
            n_features=4, 
            n_informative=2, 
            n_classes=2,
            random_state=i
        )
        data = pd.DataFrame(X, columns=[f"feature_{j}" for j in range(4)])
        data["target"] = y
        test_prompts.append({
            "text": "Create a RandomForest classifier and show accuracy",
            "data": data,
            "source": "synthetic",
            "type": "classification"
        })
        X, y = make_blobs(n_samples=100, centers=3, cluster_std=1.5, random_state=i)
        data = pd.DataFrame(X, columns=["x", "y"])
        test_prompts.append({
            "text": "Perform K-means clustering on this data",
            "data": data,
            "source": "synthetic",
            "type": "clustering"
        })
        X, _ = make_blobs(n_samples=100, centers=3, cluster_std=1.5, random_state=i)
        outliers = np.random.uniform(low=-10, high=10, size=(5, 2))
        X = np.vstack([X, outliers])
        data = pd.DataFrame(X, columns=["x", "y"])
        test_prompts.append({
            "text": "Detect anomalies using Isolation Forest",
            "data": data,
            "source": "synthetic",
            "type": "outlier_detection"
        })
    print(f"Created {len(test_prompts)} test prompts")
    return test_prompts

def apply_optimization(technique_name):
    try:
        model = AutoModelForCausalLM.from_pretrained("./final-model")
        if technique_name == "Pruning":
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear) and "lora" not in name.lower():
                    try:
                        prune.l1_unstructured(module, name='weight', amount=0.1)
                        prune.remove(module, 'weight')
                    except:
                        continue
            return model
        return model
    except Exception as e:
        print(f"Error applying {technique_name}: {str(e)}")
        return None

def generate_robust_code(generator, prompt_text, task_type):
    if task_type == "classification":
        task_instructions = "Focus on classification using RandomForestClassifier. Create synthetic data if needed."
    elif task_type == "clustering":
        task_instructions = "Use KMeans clustering and visualize results with matplotlib."
    elif task_type == "outlier_detection":
        task_instructions = "Use IsolationForest for outlier detection. Highlight anomalies in visualization."
    elif task_type == "problem_solving":
        task_instructions = "Solve the problem using appropriate scientific computing techniques."
    else:
        task_instructions = "Solve the problem efficiently with appropriate algorithms."
    structured_prompt = f"""
Generate complete, self-contained Python code to solve this task:
{prompt_text}

Specific Requirements:
1. Create any necessary synthetic data if not provided
2. Use only numpy, pandas, sklearn and matplotlib
3. {task_instructions}
4. Create complete, runnable code
5. Print results clearly
6. For visualizations, use plt.savefig('output.png') instead of plt.show()
7. Ensure the code is syntactically correct

Code:
```python
"""
    try:
        output = generator(
            structured_prompt,
            temperature=0.1,
            max_new_tokens=700,
            truncation=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
        return output[0]['generated_text']
    except Exception as e:
        print(f"Generation error: {str(e)}")
        return ""

def validate_code(generated_code):
    if not generated_code:
        return ""
    if "```python" in generated_code:
        generated_code = generated_code.split("```python")[1].split("```")[0]
    elif "```" in generated_code:
        generated_code = generated_code.split("```")[1].split("```")[0]
    repairs = [
        (r"from sklearn\.\w+ import \*", ""),
        (r"fit\(\)", "fit(X_train, y_train)"),
        (r"predict\(\)", "predict(X_test)"),
        (r"plt\.show\(\)", "plt.savefig('output.png')"),
        (r"import matplotlib\.pyplot as plt", "import matplotlib.pyplot as plt\nplt.switch_backend('Agg')"),
        (r"\.to_csv\('data\.csv'\)", "")
    ]
    for pattern, replacement in repairs:
        generated_code = re.sub(pattern, replacement, generated_code)
    required_imports = [
        "import numpy as np",
        "import pandas as pd",
        "import matplotlib.pyplot as plt"
    ]
    for imp in required_imports:
        if imp not in generated_code:
            generated_code = imp + "\n" + generated_code
    if "from sklearn" not in generated_code and "import sklearn" not in generated_code:
        generated_code = "from sklearn.ensemble import RandomForestClassifier, IsolationForest\n" + \
                         "from sklearn.cluster import KMeans\n" + \
                         "from sklearn.model_selection import train_test_split\n" + \
                         "from sklearn.metrics import accuracy_score, classification_report\n" + generated_code
    if "pd.DataFrame" not in generated_code and "X =" not in generated_code:
        synthetic_data = "\nX = np.random.rand(100, 4)\ny = np.random.randint(0, 2, 100)\n"
        generated_code = generated_code.replace("import numpy as np", "import numpy as np" + synthetic_data, 1)
    return generated_code.strip()


def safe_execute(code: str, data=None):
    if not code:
        return {"status": "error", "message": "Empty code"}
    safe_env = {
        "__builtins__": {
            'print': print, 'range': range, 'len': len, 'str': str, 'int': int,
            'float': float, 'bool': bool, 'list': list, 'dict': dict, 'tuple': tuple,
            'set': set, 'min': min, 'max': max, 'sum': sum, 'abs': abs, 'round': round,
            'enumerate': enumerate, 'zip': zip, '__import__': __import__
        },
        "np": np,
        "pd": pd,
        "plt": plt,
        "sklearn": sklearn,
        "RandomForestClassifier": RandomForestClassifier,
        "IsolationForest": IsolationForest,
        "KMeans": KMeans,
        "train_test_split": train_test_split,
        "accuracy_score": accuracy_score,
        "classification_report": classification_report,
    }
    if data is not None:
        safe_env["data"] = data
    try:
        ast.parse(code)
        exec(code, safe_env)
        return {"status": "success"}
    except Exception as e:
        return {"status": "error", "message": f"{type(e).__name__}: {str(e)}"}

def measure_inference_performance(generator, prompt_text, num_runs=3):
    metrics = {
        "avg_latency": 0,
        "throughput": 0,
        "memory_usage": 0,
        "success_rate": 0
    }
    successes = 0
    latencies = []
    try:
        _ = generator(prompt_text, max_new_tokens=50, truncation=True)
        start_time = time.time()
        for _ in range(num_runs):
            try:
                run_start = time.time()
                output = generator(
                    prompt_text,
                    max_new_tokens=300,
                    truncation=True,
                    pad_token_id=tokenizer.eos_token_id
                )
                latencies.append(time.time() - run_start)
                successes += 1
            except Exception:
                continue
        metrics["avg_latency"] = np.mean(latencies) * 1000 if latencies else 0
        metrics["throughput"] = successes / max(0.001, time.time() - start_time)
        metrics["success_rate"] = successes / num_runs
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            metrics["memory_usage"] = torch.cuda.max_memory_allocated() / (1024 ** 2)
        else:
            metrics["memory_usage"] = 0
    except Exception as e:
        print(f"Performance measurement failed: {str(e)}")
    return metrics

def evaluate_code_quality(generated_code):
    if not generated_code:
        return {
            "has_imports": False,
            "has_model": False,
            "has_print": False,
            "is_runnable": False,
            "score": 0
        }
    has_imports = any(keyword in generated_code for keyword in 
                      ["import numpy", "import pandas", "import sklearn"])
    has_model = any(keyword in generated_code for keyword in 
                    ["RandomForest", "IsolationForest", "KMeans"])
    has_print = "print(" in generated_code
    has_visualization = "plt.savefig" in generated_code or "plt.plot" in generated_code
    has_data = "X =" in generated_code or "pd.DataFrame" in generated_code
    is_runnable = has_imports and has_model and has_print and has_data
    score = sum([has_imports, has_model, has_print, is_runnable, has_visualization]) / 5
    return {
        "has_imports": has_imports,
        "has_model": has_model,
        "has_print": has_print,
        "has_visualization": has_visualization,
        "has_data": has_data,
        "is_runnable": is_runnable,
        "score": score
    }

def run_test_pipeline():
    test_prompts = load_test_datasets()
    results = {}
    for technique in OPTIMIZATION_TECHNIQUES:
        print(f"\n{'='*40}")
        print(f"Testing: {technique}")
        print(f"{'='*40}")
        model = apply_optimization(technique)
        if model is None:
            print(f"Skipping {technique} due to initialization error")
            continue
        model.eval()
        device = 0 if torch.cuda.is_available() and not CPU_MODE else -1
        print(f"Using device: {'GPU' if device >= 0 else 'CPU'}")
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=device
        )
        tech_results = {
            "inference": {
                "avg_latency": 0,
                "throughput": 0,
                "memory_usage": 0,
                "success_rate": 0
            },
            "quality": {
                "syntax_errors": 0,
                "execution_errors": 0,
                "valid_count": 0,
                "quality_score": 0,
                "scores": []
            }
        }
        if test_prompts:
            perf_metrics = measure_inference_performance(generator, test_prompts[0]["text"])
            tech_results["inference"] = perf_metrics
        for item in tqdm(test_prompts, desc="Testing"):
            try:
                generated = generate_robust_code(generator, item["text"], item.get("type", ""))
                code = validate_code(generated)
                if not code:
                    tech_results["quality"]["syntax_errors"] += 1
                    tech_results["quality"]["scores"].append(0)
                    continue
                quality_metrics = evaluate_code_quality(code)
                quality_score = quality_metrics["score"]
                tech_results["quality"]["scores"].append(quality_score)
                if not quality_metrics["is_runnable"]:
                    tech_results["quality"]["syntax_errors"] += 1
                    continue
                tech_results["quality"]["valid_count"] += 1
                data = item.get("data", None)
                if data is not None:
                    exec_result = safe_execute(code, data)
                    if exec_result["status"] == "error":
                        tech_results["quality"]["execution_errors"] += 1
            except Exception as e:
                tech_results["quality"]["syntax_errors"] += 1
                tech_results["quality"]["scores"].append(0)
                print(f"Error during testing: {str(e)}")
        if tech_results["quality"]["scores"]:
            tech_results["quality"]["quality_score"] = np.mean(tech_results["quality"]["scores"])
        else:
            tech_results["quality"]["quality_score"] = 0
        results[technique] = tech_results
    return results

def present_results(results):
    table_data = []
    for tech, metrics in results.items():
        inf = metrics["inference"]
        qual = metrics["quality"]
        valid_count = qual["valid_count"]
        total_tests = len(qual["scores"]) if "scores" in qual else TEST_SAMPLE_SIZE
        table_data.append({
            "Technique": tech,
            "Latency (ms)": f"{inf['avg_latency']:.2f}",
            "Throughput (samples/s)": f"{inf['throughput']:.2f}",
            "Memory (MB)": f"{inf['memory_usage']:.1f}",
            "Inference Success (%)": f"{inf['success_rate'] * 100:.1f}",
            "Valid Code (%)": f"{valid_count / total_tests * 100:.1f}" if total_tests > 0 else "N/A",
            "Execution Success (%)": f"{(1 - qual['execution_errors'] / max(1, valid_count)) * 100:.1f}" if valid_count > 0 else "N/A",
            "Quality Score": f"{qual['quality_score'] * 100:.1f}"
        })
    results_df = pd.DataFrame(table_data)
    print("\n" + "="*80)
    print("Optimization Results")
    print("="*80)
    print(results_df.to_string(index=False))
    results_filename = f"optimization_results_{timestamp}.csv"
    results_path = os.path.join(results_dir, results_filename)
    results_df.to_csv(results_path, index=False)
    print(f"\nResults saved to {results_path}")
    if not results_df.empty:
        fig, ax = plt.subplots(2, 2, figsize=(15, 12))
        results_df["Latency Value"] = results_df["Latency (ms)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Valid Code Value"] = results_df["Valid Code (%)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Exec Success Value"] = results_df["Execution Success (%)"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df["Quality Value"] = results_df["Quality Score"].str.extract(r'(\d+\.?\d*)').astype(float)
        results_df.plot.bar(x="Technique", y="Latency Value", ax=ax[0, 0], legend=False, color='skyblue')
        ax[0, 0].set_title('Inference Latency')
        ax[0, 0].set_ylabel('Milliseconds')
        results_df.plot.bar(x="Technique", y="Valid Code Value", ax=ax[0, 1], legend=False, color='lightgreen')
        ax[0, 1].set_title('Valid Code Rate')
        ax[0, 1].set_ylabel('Percentage')
        results_df.plot.bar(x="Technique", y="Exec Success Value", ax=ax[1, 0], legend=False, color='salmon')
        ax[1, 0].set_title('Execution Success Rate')
        ax[1, 0].set_ylabel('Percentage')
        results_df.plot.bar(x="Technique", y="Quality Value", ax=ax[1, 1], legend=False, color='purple')
        ax[1, 1].set_title('Code Quality Score')
        ax[1, 1].set_ylabel('Score (0-100)')
        plt.tight_layout()
        viz_filename = f"optimization_results_{timestamp}.png"
        viz_path = os.path.join(results_dir, viz_filename)
        plt.savefig(viz_path, dpi=150)
        print(f"Visualization saved to {viz_path}")
        plt.close()
    return results_df

if __name__ == "__main__":
    print(f"Starting comprehensive optimization comparison with {TEST_SAMPLE_SIZE} test prompts")
    torch.manual_seed(42)
    np.random.seed(42)
    results = run_test_pipeline()
    results_df = present_results(results)
    metrics_filename = f"detailed_metrics_{timestamp}.json"
    metrics_path = os.path.join(results_dir, metrics_filename)
    with open(metrics_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Detailed metrics saved to {metrics_path}")
    print("\nTest pipeline completed successfully!")

All results will be saved in: optimization_results_20250708_183526
Starting comprehensive optimization comparison with 100 test prompts
Created 149 test prompts

Testing: Base Model


Device set to use cuda:0


Using device: GPU


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Testing:   4%|██▊                                                                    | 6/149 [01:50<44:04, 18.50s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Testing:  69%|███████████████████████████████████████████████▋                     | 103/149 [32:42<14:21, 18.72s/it]

Mean cluster centers:
[[0.47175284 0.51435111 0.4663811  0.52378434 0.50113873 0.48556265
  0.54970838 0.47680876 0.51478626 0.46740017 0.52251781 0.49635192
  0.52286201 0.51306513 0.52677907 0.55809872 0.50180197 0.52436608
  0.55606249 0.51250068 0.4980419  0.47655803 0.5036204  0.5220877
  0.45410017 0.50189981 0.4387408  0.447872   0.51322152 0.52861912
  0.48961133 0.51623102 0.4902062  0.50374731 0.48919694 0.49882175
  0.49168513 0.44515221 0.54864569 0.51890876 0.50525958 0.46759069
  0.53186059 0.55457609 0.4595122  0.50132009 0.51435303 0.41324168
  0.49440404 0.5055518  0.50436049 0.51856344 0.45662282 0.53559587
  0.56385899 0.55069084 0.4843733  0.50887487 0.46860672 0.5024037
  0.47646654 0.51472214 0.56397182 0.48614216 0.49514596 0.45810999
  0.53292008 0.50017321 0.56608129 0.48712737 0.42363461 0.54237659
  0.51497159 0.532301   0.5270013  0.51251087 0.53205925 0.45781093
  0.47321491 0.48379425 0.49692288 0.55970748 0.43741463 0.40957281
  0.51040471 0.57020577 0.54

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
Testing:  80%|███████████████████████████████████████████████████████              | 119/149 [37:40<09:17, 18.58s/it]

Mean squared error: 1.4081342246764368


  return fit_method(estimator, *args, **kwargs)
Testing:  87%|████████████████████████████████████████████████████████████▏        | 130/149 [41:08<05:55, 18.74s/it]

K-means clustering results:
Cluster 0: [0.58581833 0.37135601 0.303774   0.58579283 0.5239632  0.54581301
 0.46492851 0.45370829 0.47399612 0.4677694 ]
Cluster 1: [0.43740248 0.5831489  0.7579149  0.34255453 0.40161917 0.41156228
 0.57429652 0.53362704 0.45437606 0.57663641]


  return fit_method(estimator, *args, **kwargs)
Testing:  93%|████████████████████████████████████████████████████████████████▎    | 139/149 [43:58<03:12, 19.24s/it]

K-means clustering results:


  return fit_method(estimator, *args, **kwargs)
Testing: 100%|█████████████████████████████████████████████████████████████████████| 149/149 [47:10<00:00, 19.00s/it]



Testing: Pruning


Device set to use cuda:0


Using device: GPU


  return fit_method(estimator, *args, **kwargs)
Testing:  61%|██████████████████████████████████████████▊                           | 91/149 [28:58<18:34, 19.21s/it]

Mean cluster 1: [0.52732336 0.45985468 0.45712897 0.48160642 0.46292294 0.46294736
 0.53930745 0.50201346 0.50265143 0.50964117 0.49299216 0.52483649
 0.54394763 0.51523129 0.52185377 0.45640065 0.49294294 0.54693917
 0.50566728 0.47113967 0.56794936 0.48639048 0.54246779 0.53987011
 0.53768582 0.49649621 0.50441464 0.4168353  0.50925989 0.46224085
 0.43904501 0.48992033 0.52363608 0.47523084 0.49944347 0.50495728
 0.51111161 0.49196451 0.51327946 0.49930577 0.54396564 0.48447745
 0.50751656 0.51488558 0.50173194 0.52058781 0.52450917 0.48625168
 0.48373974 0.46937661 0.43889769 0.5618763  0.5052914  0.53878447
 0.48502037 0.49318434 0.44208701 0.49287475 0.52239023 0.49357236
 0.49618717 0.45956227 0.49419739 0.48784295 0.49810573 0.49953436
 0.48786942 0.49196092 0.50162665 0.40086148 0.49623579 0.49330577
 0.55990727 0.4676051  0.54947615 0.54163637 0.50921208 0.48422713
 0.54843852 0.49771772 0.53071599 0.48388473 0.51435339 0.50039847
 0.5185499  0.50423751 0.54868315 0.43349956 0

  return fit_method(estimator, *args, **kwargs)
Testing:  64%|█████████████████████████████████████████████                         | 96/149 [30:32<16:44, 18.96s/it]

Mean cluster centers:
[[0.59069821 0.74168728 0.62103067 0.47193882 0.68446435 0.41047975
  0.47397019 0.37905344 0.45925417 0.46993349]
 [0.37881737 0.35149037 0.39314329 0.48883852 0.4612476  0.56118882
  0.51938671 0.56697491 0.49745146 0.49265705]]


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
Testing:  89%|█████████████████████████████████████████████████████████████▏       | 132/149 [41:45<05:17, 18.65s/it]

Mean cluster centers:
[[0.52881511 0.47159157 0.72018864 0.57138906 0.52921789 0.56740954
  0.36703902 0.45377664 0.54807298 0.46216845]
 [0.5035153  0.51140753 0.26158121 0.41995083 0.42757688 0.51828902
  0.63345135 0.58448202 0.43398429 0.57890726]]
Mean cluster labels:
[0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 0
 0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0
 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1]


  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.1


  return fit_method(estimator, *args, **kwargs)
Testing:  99%|████████████████████████████████████████████████████████████████████▌| 148/149 [46:49<00:18, 18.79s/it]

MSE: 1.2327837249114142


Testing: 100%|█████████████████████████████████████████████████████████████████████| 149/149 [47:08<00:00, 18.98s/it]



Optimization Results
 Technique Latency (ms) Throughput (samples/s) Memory (MB) Inference Success (%) Valid Code (%) Execution Success (%) Quality Score
Base Model     10057.68                   0.10      4208.8                 100.0           43.6                  56.9          65.6
   Pruning      5402.01                   0.19      4209.3                 100.0           39.6                  47.5          62.0

Results saved to optimization_results_20250708_183526/optimization_results_20250708_183526.csv
Visualization saved to optimization_results_20250708_183526/optimization_results_20250708_183526.png
Detailed metrics saved to optimization_results_20250708_183526/detailed_metrics_20250708_183526.json

Test pipeline completed successfully!
