## Install Requirements

In [None]:
# Install the packages named below; the SoluLeakDetector cell imports from langchain.
!pip install langchain openai
!pip install langchain_community

## Get Data

In [None]:
import pandas as pd
input_file = 'swe.parquet'  # full SWE-bench data from Hugging Face
# Destination of the filtered copy. It was previously undefined (NameError);
# the name matches the `file_path` read by the SoluLeakDetector cell.
output_file = 'filtered_swe_bench_data.parquet'

# Load the dataset
data = pd.read_parquet(input_file)
print(len(data))

# Filter the dataset (no filter is applied at the moment: every row is kept)
filtered_data = data

# Save the filtered dataset
filtered_data.to_parquet(output_file)

print(f"Filtered dataset saved to {output_file}.")


## SoluLeakDetector

In [None]:
import os
import pandas as pd
import json
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI

# System prompt for the leakage detector. It states the task, defines what
# counts as solution leakage, and gives three worked examples, each followed
# by the JSON verdict expected for it.
# The braces around the JSON examples are doubled ({{ }}) because this text is
# handed to SystemMessagePromptTemplate.from_template below, where single
# braces mark template variables (as in the human message template).
define_system_prompt = """
You are a solution leakage detection expert.

TASK:
Your task is to analyze GitHub issue descriptions (`problem_statement`) and related comments (`hints_text`) for solution leakage.

DEFINITION:
Solution leakage occurs when:
1. The solution is explicitly mentioned (e.g., code snippets or direct instructions).
2. The solution is subtly implied (e.g., explanatory text or hints that lead directly to a solution).

EXAMPLES:
Example 1:
Description:
I propose to add the following settings, with the following default values:

LANGUAGE_COOKIE_SECURE = False
LANGUAGE_COOKIE_HTTPONLY = False
LANGUAGE_COOKIE_SAMESITE = None
The default values maintain the current behavior.

These settings do not provide much security value, since the language is not secret or sensitive. This was also discussed briefly here: ​https://github.com/django/django/pull/8380#discussion_r112448195. The reasons I'd like to add them are:

Sometimes auditors require them.
I personally prefer to set them unless I have a reason *not* to.
Browsers are starting to strongly nudge toward HttpOnly and Secure when possible, e.g. ​https://webkit.org/blog/8613/intelligent-tracking-prevention-2-1/.

Expected Output:
{{
  "solution_leakage_detected": true,
  "reason": "The solution is explicitly provided in the description.",
  "extracted_solution": "LANGUAGE_COOKIE_SECURE = False, LANGUAGE_COOKIE_HTTPONLY = False, LANGUAGE_COOKIE_SAMESITE = None"
}}

Example 2:
Description:
Shape of coef_ wrong for linear_model.Lasso when using fit_intercept=False

Steps/Code to Reproduce
Example:

import numpy as np
from sklearn import linear_model

est_intercept = linear_model.Lasso(fit_intercept=True)
est_intercept.fit(np.c_[np.ones(3)], np.ones(3))
assert est_intercept.coef_.shape  == (1,)

est_no_intercept = linear_model.Lasso(fit_intercept=False)
est_no_intercept.fit(np.c_[np.ones(3)], np.ones(3))
assert est_no_intercept.coef_.shape  == (1,)

Expected Output:
{{
  "solution_leakage_detected": false,
  "reason": "The description identifies a bug but does not explicitly provide a solution.",
  "extracted_solution": null
}}

Example 3:
Description:
There is a typo in Poly3DCollection.__init__() that causes a TypeError exception whenever the function is called with shade=True.

matplotlib/lib/mpl_toolkits/mplot3d/art3d.py

Line 908 in f7a8cab

 if facecolors is None and edgecolors in None:
edgecolors in None should be edgecolors is None

Expected Output:
{{
  "solution_leakage_detected": true,
  "reason": "The solution is explicitly provided as a corrected code snippet.",
  "extracted_solution": "edgecolors in None should be edgecolors is None"
}}
"""

# Build the two-message chat prompt: the fixed system instructions followed by
# a human message with slots for the issue text and its comments.
template = ChatPromptTemplate.from_messages(
    messages=[
        SystemMessagePromptTemplate.from_template(define_system_prompt),
        HumanMessagePromptTemplate.from_template(
            "Analyze the following problem and comments for solution leakage.\n\nProblem Statement:\n{problem_statement}\n\nHints Text:\n{hints_text}"
        ),
    ]
)

# Read the OpenAI API key from the environment (None when the variable is unset)
key = os.getenv("OPENAI_API_KEY")

# Initialize the LLM with GPT-4; temperature=0 is requested for every call.
# The key argument previously referenced the undefined name `key1` (NameError).
llm = ChatOpenAI(model_name="gpt-4", temperature=0, openai_api_key=key)

# Create the pipeline: the formatted prompt is piped into the chat model
leakage_detection_pipeline = template | llm

# Function to run solution leakage detection
def detect_solution_leakage(problem_statement, hints_text):
    """Send one issue through the leakage-detection pipeline.

    Args:
        problem_statement: Text filled into the prompt's
            ``{problem_statement}`` slot.
        hints_text: Text filled into the prompt's ``{hints_text}`` slot.

    Returns:
        Whatever ``parse_response_content`` produces for the response's
        ``content``, or an empty dict when the response has no ``content``
        attribute.
    """
    response = leakage_detection_pipeline.invoke(
        {"problem_statement": problem_statement, "hints_text": hints_text}
    )
    # A response object without `content` carries nothing to parse.
    if not hasattr(response, "content"):
        return {}
    return parse_response_content(response.content)

# Function to parse the response content
def parse_response_content(content):
    """Parse the model's reply into a verdict dictionary.

    Parsing is attempted in three stages:

    1. The whole reply as a JSON object.
    2. The span from the first ``{`` to the last ``}`` as a JSON object, which
       covers replies wrapped in markdown code fences or surrounding prose.
    3. A line-by-line scan for ``name: value`` pairs. Only the text before the
       first colon is searched for a field name, so a value that merely
       contains the word "reason" cannot overwrite the reason field.

    Args:
        content: The reply text.

    Returns:
        A dict. For stages 1 and 2 it is the decoded JSON object. For stage 3
        it always holds ``raw_text`` (the unmodified reply) plus whichever of
        ``solution_leakage_detected`` (bool), ``reason`` and
        ``extracted_solution`` were found; values are stripped of surrounding
        quotes and trailing commas, and a literal ``null`` solution becomes
        ``None``.
    """

    def _load_object(text):
        # Only a JSON *object* is a usable verdict; scalars and arrays would
        # break callers that use `.get`, so they are treated as unparsed.
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError:
            return None
        return decoded if isinstance(decoded, dict) else None

    verdict = _load_object(content)
    if verdict is not None:
        return verdict

    # Retry on the outermost brace-delimited span (fenced or embedded JSON).
    start, end = content.find("{"), content.rfind("}")
    if 0 <= start < end:
        verdict = _load_object(content[start:end + 1])
        if verdict is not None:
            return verdict

    # Fall back to simple plain text parsing
    parsed_response = {"raw_text": content}
    for line in content.split("\n"):
        head, sep, tail = line.partition(":")
        if not sep:
            continue
        label = head.lower()
        value = tail.strip().rstrip(",").strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]

        if "solution_leakage_detected" in label:
            parsed_response["solution_leakage_detected"] = "true" in value.lower()
        elif "extracted_solution" in label:
            parsed_response["extracted_solution"] = None if value.lower() == "null" else value
        elif "reason" in label:
            parsed_response["reason"] = value
    return parsed_response

# Function to format results
def format_result(instance_id, problem_index, problem_statement, result, is_leakage_type):
    """Build the report entry for one analyzed instance.

    Args:
        instance_id: Stored under ``"Instance ID"``.
        problem_index: Stored under ``"Problem Index"``.
        problem_statement: Stored under ``"Problem Statement"``.
        result: Verdict dict; its ``reason`` and ``extracted_solution``
            entries are copied into the report.
        is_leakage_type: Stored under ``"Leakage Type"``.

    Returns:
        A dict with the six report fields. A ``reason`` or
        ``extracted_solution`` that is missing *or* ``None`` (e.g. a JSON
        ``null`` from the model) is replaced by its placeholder text, so the
        report never mixes placeholders with nulls.
    """
    reason = result.get("reason")
    extracted_solution = result.get("extracted_solution")
    return {
        "Instance ID": instance_id,
        "Problem Index": problem_index,
        "Leakage Type": is_leakage_type,
        "Problem Statement": problem_statement,
        "Reason": "No reason provided" if reason is None else reason,
        "Extracted Solution": (
            "No solution extracted" if extracted_solution is None else extracted_solution
        ),
    }

# Function to process a dataset and analyze rows
def _text_or_empty(value):
    """Return *value* as a string, mapping a missing cell (None or NaN) to ''."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value if isinstance(value, str) else str(value)


def _classify_leakage(raw_result):
    """Map a verdict dict to one of the three leakage-type labels."""
    detected = raw_result.get("solution_leakage_detected", False)
    if isinstance(detected, str):
        # A quoted "false" is a truthy string; read its text instead.
        detected = detected.strip().lower() == "true"
    if not detected:
        return "No Solution Leak"

    # `reason` may be absent or null; coerce so `.lower()` cannot fail.
    reason = str(raw_result.get("reason") or "").lower()
    direct_keywords = ("explicitly mentioned", "explicitly provided", "clear solution",
                       "direct instructions", "patch file", "code snippet")
    hint_keywords = ("hint", "subtly implied", "suggests")
    if any(kw in reason for kw in direct_keywords):
        return "Solution Leak - Direct"
    if any(kw in reason for kw in hint_keywords):
        return "Solution Leak - Hint"
    # A detected leak with no recognizable wording is counted as direct.
    return "Solution Leak - Direct"


def detect_and_organize_results_with_links(file_path, output_file, max_chars=7000):
    """Analyze every row of a parquet dataset and write a JSON report.

    Args:
        file_path: Parquet file with ``instance_id``, ``problem_statement``
            and ``hints_text`` columns.
        output_file: Path of the JSON report to write. It holds
            ``{"results": [...], "statistics": {...}}``.
        max_chars: Rows whose problem statement and hints together exceed
            this many characters are skipped (with a printed notice).

    Returns:
        The list of per-instance result dicts (the report's ``results``).

    Raises:
        ValueError: If a required column is missing.
    """
    from collections import Counter

    data = pd.read_parquet(file_path)

    required_columns = ('instance_id', 'problem_statement', 'hints_text')
    if any(column not in data.columns for column in required_columns):
        raise ValueError("The dataset must contain 'instance_id', 'problem_statement', and 'hints_text' columns.")

    results = []
    for idx, row in data.iterrows():
        instance_id = row['instance_id']
        # Missing text cells would make len() fail; treat them as empty.
        problem_statement = _text_or_empty(row['problem_statement'])
        hints_text = _text_or_empty(row['hints_text'])

        if len(problem_statement) + len(hints_text) > max_chars:  # Filter long instances
            print(f"Skipping instance {instance_id} due to excessive length.")
            continue

        raw_result = detect_solution_leakage(problem_statement, hints_text)
        if not isinstance(raw_result, dict):
            # Anything that is not a dict cannot be queried with `.get`.
            raw_result = {}

        is_leakage_type = _classify_leakage(raw_result)
        results.append(format_result(instance_id, idx + 1, problem_statement, raw_result, is_leakage_type))

    # Calculate statistics; every label is reported, including zero counts.
    counts = Counter(entry["Leakage Type"] for entry in results)
    stats = {
        label: counts[label]
        for label in ("Solution Leak - Direct", "Solution Leak - Hint", "No Solution Leak")
    }

    # Single write: results and statistics go out together.
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump({"results": results, "statistics": stats}, f, indent=4)

    return results

# Example Usage
if __name__ == "__main__":
    # Input dataset and the JSON report to produce.
    file_path, output_file = 'filtered_swe_bench_data.parquet', 'results.json'

    try:
        organized_results = detect_and_organize_results_with_links(file_path, output_file)
        print(f"Results saved to {output_file}")
    except Exception as err:
        # Any failure is reported as a one-line message rather than a traceback.
        print(f"Error: {err}")
