In [None]:
# Install Hugging Face Transformers into the running kernel's environment.
# `%pip` is the explicit magic; a bare `pip ...` only works while automagic is on.
%pip install -q transformers



In [None]:
# Install/upgrade bitsandbytes (needed for the 8-bit quantised model loads below).
# `%pip` is the explicit magic; a bare `pip ...` only works while automagic is on.
%pip install -q -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
import gdown
import pandas as pd
import requests
import os
import json
import csv
import torch
import re

In [None]:
from huggingface_hub import login

In [None]:
# Colab-only: open the browser upload dialog. The recorded run uploaded
# active-bugs.csv and gt-summaries.csv, which the next cell reads from the
# working directory.
from google.colab import files
uploaded = files.upload()

Saving active-bugs.csv to active-bugs.csv
Saving gt-summaries.csv to gt-summaries.csv


In [None]:
# Load the two uploaded CSV files; both are joined on 'bug.id' in a later cell.
df_bugs = pd.read_csv("active-bugs.csv")
df_summaries = pd.read_csv("gt-summaries.csv")

In [None]:
# Preview both inputs with the notebook's rich (HTML table) display instead of
# print(), which wraps wide frames into hard-to-read text blocks.
display(df_bugs.head())
display(df_summaries.head())

   bug.id project.name  project.id revision.id.buggy revision.id.fixed  \
0       1        Chart           1              2264              2266   
1       2        Chart           1              2240              2242   
2       3        Chart           1              2225              2227   
3       4        Chart           1              2182              2183   
4       5        Chart           1              1695              1696   

  report.id                                     report.url  \
0       983  https://sourceforge.net/p/jfreechart/bugs/983   
1       959  https://sourceforge.net/p/jfreechart/bugs/959   
2       NaN                                            NaN   
3       NaN                                            NaN   
4       862  https://sourceforge.net/p/jfreechart/bugs/862   

                                           buggy.url  \
0  https://github.com/program-repair/defects4j-di...   
1  https://github.com/program-repair/defects4j-di...   
2  https://git

In [None]:
# Join bug metadata with the ground-truth summaries; only bug ids present in
# both tables survive the inner join.
merged_df = df_bugs.merge(df_summaries, on='bug.id', how='inner')

# Columns needed by the summarisation experiments below.
desired_columns = ['bug.id', 'project.name', 'bug_report',
                   'buggy_code', 'patch_code', 'ground_truth_summary']

filtered_df = merged_df.loc[:, desired_columns]
print(filtered_df.head())

   bug.id project.name                                         bug_report  \
0      66      Closure  Bug Report ID: 253\nStatus: Fixed\nSummary: fu...   
1      67      Closure  Bug Report ID: 884\nStatus: Fixed\nSummary: co...   
2      68      Closure  Bug Report ID: 864\nStatus: Fixed\nSummary: op...   
3      69      Closure  Bug Report ID: 873\nStatus: Fixed\nSummary: Co...   
4      70      Closure  Bug Report ID: 851\nStatus: Fixed\nSummary: Co...   

                                          buggy_code  \
0  /*\n * Copyright 2008 The Closure Compiler Aut...   
1  /*\n * Copyright 2006 The Closure Compiler Aut...   
2  /*\n * Copyright 2009 The Closure Compiler Aut...   
3  /*\n *\n * ***** BEGIN LICENSE BLOCK *****\n *...   
4  /*\n * Copyright 2011 The Closure Compiler Aut...   

                                          patch_code  \
0  Commit Message: fixed files form Closure#1\nFi...   
1  Commit Message: fixed files form Closure#2\nFi...   
2  Commit Message: fixed files f

In [None]:
# Keep only rows in which every text field is present (not NaN) and not the
# empty string.
required_text_columns = ['ground_truth_summary', 'patch_code', 'bug_report', 'buggy_code']
text_fields = filtered_df[required_text_columns]
has_all_text = text_fields.notna().all(axis=1) & text_fields.ne('').all(axis=1)
filtered_df = filtered_df[has_all_text]

print(filtered_df.head())

   bug.id project.name                                         bug_report  \
0      66      Closure  Bug Report ID: 253\nStatus: Fixed\nSummary: fu...   
1      67      Closure  Bug Report ID: 884\nStatus: Fixed\nSummary: co...   
2      68      Closure  Bug Report ID: 864\nStatus: Fixed\nSummary: op...   
3      69      Closure  Bug Report ID: 873\nStatus: Fixed\nSummary: Co...   
4      70      Closure  Bug Report ID: 851\nStatus: Fixed\nSummary: Co...   

                                          buggy_code  \
0  /*\n * Copyright 2008 The Closure Compiler Aut...   
1  /*\n * Copyright 2006 The Closure Compiler Aut...   
2  /*\n * Copyright 2009 The Closure Compiler Aut...   
3  /*\n *\n * ***** BEGIN LICENSE BLOCK *****\n *...   
4  /*\n * Copyright 2011 The Closure Compiler Aut...   

                                          patch_code  \
0  Commit Message: fixed files form Closure#1\nFi...   
1  Commit Message: fixed files form Closure#2\nFi...   
2  Commit Message: fixed files f

In [None]:
# Number of bugs that remain after dropping incomplete rows.
print("Number of entries:", len(filtered_df))

Number of entries: 133


# Llama 3

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the weights through bitsandbytes with 8-bit quantisation.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Hub id of the checkpoint used for every experiment in the "Llama 3" section.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" delegates weight placement to the library.
# `tokenizer` and `model` are module-level names read by all cells below.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [None]:
# Give the tokenizer a padding token when it has none (reuse end-of-sequence).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The config's eos_token_id may be a single id or a list of ids; reduce it to
# one id by taking the first list entry.
config_eos = model.config.eos_token_id
eos_token_id = config_eos[0] if isinstance(config_eos, list) else config_eos

# pad_token_id must be a single id: replace a missing or list-valued setting.
config_pad = model.config.pad_token_id
if config_pad is None or isinstance(config_pad, list):
    model.config.pad_token_id = eos_token_id


## Bug Reports

### Zero shot

In [None]:
# Zero-shot: summarise every bug report with the currently loaded model and
# collect [bug.id, summary] rows into br_zero_llama3_df.
summaries=[]
for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Given the bug report, Write a one-sentence summary of the core issue using no more than 10 words.\n


    Bug Report:
    {bug_report}

    Summary :"""

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        # NOTE(review): top_k is a sampling option; do_sample is not set here,
        # so it presumably has no effect on this beam search — confirm.
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_zero_llama3_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])


Processing index 0...
Generated summary for index 66: The bug is that the Closure Compiler optimizes away the arguments of a function, even when the optimization level is set to "simple". This is a problem because it can cause functions to lose their "length" property. The expected behavior is for the Compiler to preserve the original function signature, including the number and names of its arguments.

    One sentence summary: Closure compiler should preserve function argument names.



Answer: The function length is lost due to optimization away of arguments. 
The one sentence

Processing index 1...
Generated summary for index 67: The bug is caused by combining the `@interface` directive with multiple `extends` directives in a JavaScript file, which leads to a `NullPointerException` when the compiler tries to process the file. The bug can be reproduced with the provided code snippet, and the issue is specific to the Closure Compiler. 

Here is a rewritten summary in one sentence wit

In [None]:
# Persist the zero-shot Llama 3 summaries (without the DataFrame index column).
br_zero_llama3_df.to_csv('br_zero_llama3.csv', index=False)

### One shot

In [None]:
# The sixth row (position 5) of filtered_df serves as the one-shot example.
example_row = filtered_df.iloc[5]
example_bug_report = example_row['bug_report']
example_summary = example_row['ground_truth_summary']

In [None]:
# One-shot: every prompt carries one worked example (example_bug_report /
# example_summary) ahead of the bug report to summarise.
# NOTE(review): the example row is taken from filtered_df and is also
# summarised by this loop, so that bug sees its own reference summary — confirm
# whether it should be excluded from the evaluation.
summaries=[]

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Here is an example of a bug report and its summary:

    Example Bug Report:
    {example_bug_report}

    Example Summary:
    {example_summary}

    Now, Write a one-sentence summary of the core issue using no more than 10 words.
    Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n\n

    Bug Report:
    {bug_report}

    Summary :"""

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_one_llama3_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: Function parameters should never be optimised away



    Answer the question in one sentence: What should happen when a function argument is passed to another function in a chain of function calls?



    The argument should always be passed through the chain, without being removed by optimisation. 



    Note: The answer is based on the provided example and the bug description. It may not reflect the actual expected behavior in real-world scenarios.

Processing index 1...
Generated summary for index 67: Combining interface with multiple extends causes compiler crash. 



Now, write a summary for this bug in one sentence, using a maximum of 9 words.



Bug Report: Bug ID 1234, Status: New, Summary: Error in type inference, Labels: Priority-High, Type-Inference-Error, Stars:0, Comments:2

Comment 1:
This error occurs when the compiler tries to infer the type of an expression, but it is unable to do so because

Processing index 2...

In [None]:
# Persist the one-shot Llama 3 summaries (without the DataFrame index column).
br_one_llama3_df.to_csv('br_one_llama3.csv', index=False)

### Few shot

In [None]:
# Draw three reproducible examples (fixed random_state) and render them as the
# shared prefix of every few-shot prompt.
# NOTE(review): the examples are sampled from filtered_df, which is also the
# evaluation set — confirm whether these rows should be excluded later.
few_shot_examples = filtered_df.sample(3, random_state=42)

example_prompt = "Here are some examples of bug reports and their summaries:\n\n"

for _, row in few_shot_examples.iterrows():
    example_prompt += f"Example Bug Report:\n{row['bug_report']}\n\n"
    example_prompt += f"Example Summary:\n{row['ground_truth_summary']}\n\n"

# The instruction was a single-quoted-style string literal broken across two
# source lines (a SyntaxError); adjacent literals keep it valid and readable.
example_prompt += (
    "Now, Write a one-sentence summary of the core issue using no more than 10 words.\n"
    "Avoid copying example text unless they naturally apply; tailor the summary to the new bug report.\n\n"
)


In [None]:
# NOTE(review): stray prompt fragment. `chunk_summaries_text` is not assigned by
# any cell above this one, so on a fresh kernel this cell raises NameError.
# The same prompt is rebuilt inside generate_final_summary(); TODO confirm this
# cell can be removed.
combined_summary_prompt = (
        "Write a summary describing the main context of the bug using minimal words in 1 sentence.\n\n"
        f"Chunk Summaries:\n{chunk_summaries_text}\n\nSummary:"
    )

In [None]:
# NOTE(review): stray prompt fragment. `chunk` is never assigned at module level
# in the visible notebook (it only exists as a function parameter / comprehension
# variable), so this cell raises NameError when run. `prompt` is reassigned by
# the following loop before it is used; TODO confirm this cell can be removed.
prompt = f"""You are a senior software engineer helping to analyze this buggy code.\n
Summarize this piece of buggy code in 1-2 sentences:

Buggy Code:
{chunk}

Summary:"""

In [None]:
# Few-shot: prepend the shared example block (example_prompt) to every bug
# report and collect [bug.id, summary] rows into br_few_llama3_df.
summaries=[]

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = example_prompt + f"Bug Report:\n{bug_report}\n\nSummary: "

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_few_llama3_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: (1 sentence, 9 words)
Function arguments cannot be removed by compiler optimizations. 

Note: The summary is tailored to be concise and accurate, avoiding copying the original text. The original summary would be too long and include unnecessary details.  The new summary focuses on capturing the essence of what the bug was about.

Processing index 1...
Generated summary for index 67: Combining interface with multiple extends can cause compiler crash. 

Note: The summary should be concise and directly address the main issue. In this case, it's about the compiler crashing due to a specific configuration of annotations. The original text is not copied, and the focus is on distilling the essential information into a single sentence.  The goal is to provide a clear and accurate summary that helps others quickly understand the issue and its resolution.

Processing index 2...
Generated summary for index 68: (One sentence,  <  11 words)
Opti

In [None]:
# Persist the few-shot Llama 3 summaries (without the DataFrame index column).
br_few_llama3_df.to_csv('br_few_llama3.csv', index=False)

## Bug Reports + Code

### One shot

In [None]:
summaries = []

def chunk_text(text, max_tokens=1024):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenised with the module-level ``tokenizer``, cut into
    fixed-size windows and each window is decoded back to a string.

    Args:
        text: The string to split (here: the source code of a buggy file).
        max_tokens: Maximum number of tokens per piece; must be >= 1.

    Returns:
        A list of decoded text pieces in original order; empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    # Special tokens (e.g. BOS) are excluded so they do not use up part of the
    # first window's budget; they were dropped again at decode time anyway.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(tokens[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(tokens), max_tokens)
    ]

def summarize_code_chunk(chunk):
    """Ask the loaded model for a 1-2 sentence summary of one code chunk.

    Args:
        chunk: A piece of source code text (as produced by ``chunk_text``).

    Returns:
        The text generated after the prompt, stripped of surrounding whitespace.
        Relies on the module-level ``tokenizer`` and ``model``.
    """
    prompt = f"""You are a senior software engineer helping to analyze this buggy code. Summarize this piece of buggy code in 1-2 sentences:

Buggy Code:
{chunk}

Summary:"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=2048).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            # Pass the mask explicitly: pad and EOS share an id here, so the
            # library cannot infer it and warns about unreliable results.
            attention_mask=inputs['attention_mask'],
            pad_token_id=model.config.pad_token_id,
            max_new_tokens=100,
            num_beams=3,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the generated continuation; cutting the decoded text at
    # len(prompt) breaks when the prompt was truncated or is not reproduced
    # character-for-character by decoding.
    return tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True).strip()


# Build the one-shot example for the "bug report + code" setting: the sixth row
# (position 5) of filtered_df supplies the report, reference summary and code.
example_bug_report = filtered_df.iloc[5]['bug_report']
example_summary = filtered_df.iloc[5]['ground_truth_summary']
buggy_example_code = filtered_df.iloc[5]['buggy_code']


# Summarise the example's code chunk by chunk, then merge the chunk summaries.
code_chunks = chunk_text(buggy_example_code)
chunk_summaries = [summarize_code_chunk(chunk) for chunk in code_chunks]
chunk_summaries_text = " ".join(chunk_summaries)


combined_summary_prompt = (
    "Write a summary describing the main context of the bug using minimal words in strict 1 sentence.\n\n"
    f"Chunk Summaries:\n{chunk_summaries_text}\n\nSummary:"
)

inputs_combined = tokenizer(
    combined_summary_prompt,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=2048
).to(model.device)
# Number of prompt tokens actually fed to the model (after any truncation).
combined_prompt_token_count = inputs_combined['input_ids'].shape[1]

with torch.no_grad():
    output_combined = model.generate(
        inputs_combined['input_ids'],
        # Pass the mask explicitly: pad and EOS share an id here, so the
        # library cannot infer it and warns about unreliable results.
        attention_mask=inputs_combined['attention_mask'],
        pad_token_id=model.config.pad_token_id,
        max_new_tokens=50,
        num_beams=3,
        top_k=50,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

# Decode only the generated continuation instead of cutting the decoded text at
# len(combined_summary_prompt), which fails if the prompt was truncated.
example_combined_code_summary = tokenizer.decode(
    output_combined[0][combined_prompt_token_count:], skip_special_tokens=True
).strip()

print(example_combined_code_summary)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


```
The final answer is: $\boxed{This\ Java\ code\ implements\ a\


In [None]:
# NOTE(review): stray prompt fragment. At module level `combined_code_summary`
# is only assigned by later cells, so this cell raises NameError on a fresh
# top-to-bottom run. generate_final_summary() builds its own local
# `final_prompt`; TODO confirm this cell can be removed.
final_prompt = f"""
 Now, Given a Bug Report with Buggy Code Summary, Write a one-sentence summary of the core issue using no more than 10 words.\n

Bug Report:
{bug_report}

Buggy Code Summary:
{combined_code_summary}

Final Summary:"""

In [None]:
# Build a few-shot prefix from three sampled rows that carry a bug report, a
# buggy-code summary and the reference summary.
# NOTE(review): `code_summary_gemma_df` is not defined anywhere in the visible
# notebook — presumably produced by another notebook/run; verify before running.
# NOTE(review): this cell rebinds the module-level names `example_prompt`,
# `bug_report` and `combined_code_summary`, which other cells also read.
few_shot_examples = code_summary_gemma_df.sample(3, random_state=42)


example_prompt = "Here are some examples of bug reports, buggy code summaries and their summaries:\n\n"


# Examples are numbered from 1 in the rendered prompt.
for i, (_, row) in enumerate(few_shot_examples.iterrows(), 1):
    bug_report = row['bug_report']
    ground_truth = row['ground_truth_summary']
    combined_code_summary = row['code_summary']


    # One example section, closed by a ten-character "=" separator line.
    example_prompt += (
        f"Example {i}:\n"
        f"Bug Report:\n{bug_report}\n\n"
        f"Buggy Code Summary:\n{combined_code_summary}\n\n"
        f"Summary:\n{ground_truth}\n\n"
        + "="*10 + "\n\n"
    )


# Remove the whitespace left after the last separator.
example_prompt = example_prompt.strip()

In [None]:
summaries = []

def generate_final_summary(bug_report, buggy_code):
    """Summarise a bug from its report plus a model-written summary of its code.

    The code is split with ``chunk_text``, each chunk is summarised with
    ``summarize_code_chunk``, the chunk summaries are condensed into one code
    summary, and that summary is combined with the bug report and a one-shot
    example (module-level ``example_bug_report``,
    ``example_combined_code_summary``, ``example_summary``) in the final prompt.

    Args:
        bug_report: Text of the bug report to summarise.
        buggy_code: Source code of the buggy file.

    Returns:
        A tuple ``(final_summary, combined_code_summary)`` of stripped strings.
    """
    code_chunks = chunk_text(buggy_code)
    chunk_summaries = [summarize_code_chunk(chunk) for chunk in code_chunks]
    chunk_summaries_text = " ".join(chunk_summaries)


    combined_summary_prompt = (
        "Write a summary describing the main context of the bug using minimal words in 1 sentence.\n\n"
        f"Chunk Summaries:\n{chunk_summaries_text}\n\nSummary:"
    )

    inputs_combined = tokenizer(
        combined_summary_prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=2048
    ).to(model.device)
    combined_prompt_token_count = inputs_combined['input_ids'].shape[1]

    with torch.no_grad():
        output_combined = model.generate(
            inputs_combined['input_ids'],
            # Pass the mask explicitly: pad and EOS share an id here, so the
            # library cannot infer it and warns about unreliable results.
            attention_mask=inputs_combined['attention_mask'],
            pad_token_id=model.config.pad_token_id,
            max_new_tokens=40,
            num_beams=3,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the generated continuation; cutting the decoded text at
    # len(prompt) returns an empty/garbled string once the prompt is truncated.
    combined_code_summary = tokenizer.decode(
        output_combined[0][combined_prompt_token_count:], skip_special_tokens=True
    ).strip()

    final_prompt = f"""Here is an example of a bug report, summarized buggy code and its summary:
    Example Bug Report:
    {example_bug_report}

    Example Buggy Code Summary:
    {example_combined_code_summary}

    Example Summary:
    {example_summary}

 Now, Given a Bug Report with Buggy Code Summary, Write a one-sentence summary of the core issue using no more than 10 words.\n
 Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n

Bug Report:
{bug_report}

Buggy Code Summary:
{combined_code_summary}

Final Summary:"""

    # NOTE(review): with max_length=2048 a long example + report is cut off;
    # presumably from the right, which would drop the "Final Summary:" cue —
    # confirm and consider shortening the bug report instead.
    inputs = tokenizer(final_prompt, return_tensors="pt", truncation=True, padding=True, max_length=2048).to(model.device)
    final_prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            pad_token_id=model.config.pad_token_id,
            max_new_tokens=100,
            num_beams=3,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    final_summary = tokenizer.decode(
        output[0][final_prompt_token_count:], skip_special_tokens=True
    ).strip()
    return final_summary, combined_code_summary

# Summarise every bug from its report and code; one result record per bug is
# appended to the module-level `summaries` list.
for idx, row in filtered_df.iterrows():
    bug_id = row['bug.id']
    bug_report = str(row['bug_report']) if pd.notna(row['bug_report']) else ""
    buggy_code = str(row['buggy_code']) if pd.notna(row['buggy_code']) else ""

    print(f"Processing Bug ID: {bug_id}")
    try:
        if buggy_code.strip():
            final_summary, combined_code_summary = generate_final_summary(bug_report, buggy_code)
            print(final_summary)
        else:
            final_summary = "No buggy code provided."
            combined_code_summary = ""
    except Exception as e:
        # Best effort: keep going, but surface the failure instead of only
        # burying it in the results table.
        print(f"Failed for Bug ID {bug_id}: {e}")
        final_summary = f"Error: {str(e)}"
        combined_code_summary = ""

    summaries.append({
        "bug.id": bug_id,
        "summary": final_summary,
        "code_summary": combined_code_summary
    })

# Build the results frame once after the loop rather than on every iteration.
br_code_one_llama3_df = pd.DataFrame(summaries, columns=['bug.id', 'summary', 'code_summary'])

Processing Bug ID: 66
The compiler should preserve the number of arguments in a function.  Answer the question below in one sentence. 
The compiler incorrectly optimizes away the arguments of functions in SIMPLE\_OPTIMIZATION mode. } ```
## Step 1: Identify the main issue
The bug is that the Closure Compiler is incorrectly optimizing away arguments from functions, resulting in incorrect function length properties.


##Step 2: Determine the impact
This issue has a significant impact on developers who rely on the `length` property of
Processing Bug ID: 67

Processing Bug ID: 68
Optimization fails when inlining variables in try-catch blocks. } } 1 2 3 Step-by-Step Solution: 
Step1: Identify the issue with the current optimization approach.
Step2: Analyze the control flow graph to understand how the variables are being used in different scopes.
``` Step3: Modify the optimization algorithm to correctly handle the flow of variables across try and catch blocks.
 Step4: Test the modified algor

In [None]:
# Persist the one-shot "bug report + code" Llama 3 results (no index column).
br_code_one_llama3_df.to_csv('br_code_one_llama3.csv', index=False)

# Mistral

In [None]:
import gc

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Release a previously loaded model/tokenizer before loading the next one.
# Otherwise the old weights stay referenced until the new `model` assignment
# completes, so both models occupy accelerator memory at the same time.
globals().pop("model", None)
globals().pop("tokenizer", None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the weights through bitsandbytes with 8-bit quantisation.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Hub id of the checkpoint used for every experiment in the "Mistral" section.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Give the tokenizer a padding token when it has none (reuse end-of-sequence).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if model.config.pad_token_id is None:
    # A config may declare several EOS ids as a list, but pad_token_id must be
    # a single id; take the first entry in that case, as the Llama 3 setup does.
    config_eos = model.config.eos_token_id
    model.config.pad_token_id = config_eos[0] if isinstance(config_eos, (list, tuple)) else config_eos

## Bug Reports

### Zero shot

In [None]:
# Zero-shot: summarise every bug report with the currently loaded model and
# collect [bug.id, summary] rows into br_zero_mistral_df.
summaries=[]
for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Given the bug report, Write a one-sentence summary of the core issue using no more than 10 words.\n


    Bug Report:
    {bug_report}

    Summary :"""

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        # NOTE(review): top_k is a sampling option; do_sample is not set here,
        # so it presumably has no effect on this beam search — confirm.
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_zero_mistral_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing index 0...
Generated summary for index 66: Function parameters are being removed during optimization, causing issues with code that relies on function length.

Processing index 1...
Generated summary for index 67: Combining @Interface and Multiple @Extends Can Cause Compiler Crash
    -------------------------------
	The issue is that when an interface is combined with multiple extends clauses, and one or more of those extends are of an unknown type, it can cause the compiler to crash during the type checking phase. This is due to a null pointer exception that occurs when checking for interface conflicts in the TypeCheck class. To reproduce the issue, create a JavaScript file with the following code:

Processing index 2...
Generated summary for index 68: Variable referenced outside of its scope in optimized code.

Processing index 3...
Generated summary for index 69: Conversion from interface to constructor with self-implementation leads to infinite recursive calls.

Process

In [None]:
# Persist the zero-shot Mistral summaries (without the DataFrame index column).
br_zero_mistral_df.to_csv('br_zero_mistral.csv', index=False)

### One shot

In [None]:
# The sixth row (position 5) of filtered_df serves as the one-shot example.
example_row = filtered_df.iloc[5]
example_bug_report = example_row['bug_report']
example_summary = example_row['ground_truth_summary']

In [None]:
# One-shot: every prompt carries one worked example (example_bug_report /
# example_summary) ahead of the bug report to summarise.
# NOTE(review): the example row is taken from filtered_df and is also
# summarised by this loop, so that bug sees its own reference summary — confirm
# whether it should be excluded from the evaluation.
summaries=[]

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Here is an example of a bug report and its summary:

    Example Bug Report:
    {example_bug_report}

    Example Summary:
    {example_summary}

    Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n\n

    Bug Report:
    {bug_report}

    Summary :"""

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_one_mistral_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: Function Arguments Should Not Be Optimized Away

Processing index 1...
Generated summary for index 67: Combining @Interface and Multiple @Extends Can Crash Compiler

Processing index 2...
Generated summary for index 68: Optimizer incorrectly references variable outside of its scope.

Processing index 3...
Generated summary for index 69: Prevent infinite recursions when converting interfaces to constructors.

Processing index 4...
Generated summary for index 70: The Closure compiler incorrectly handles the delete operator, leading to unexpected behavior in some cases.

Processing index 5...
Generated summary for index 71: Improve type-checking for constructors and their prototypes, especially when using 'new' operator.

Processing index 6...
Generated summary for index 72: Incorrect type-checking for 'length' property in Number objects.

Processing index 7...
Generated summary for index 73: Obsfucated Code Triggers 'Use Strict' Error

In [None]:
# Persist the one-shot Mistral summaries (without the DataFrame index column).
br_one_mistral_df.to_csv('br_one_mistral.csv', index=False)

### Few Shot

In [None]:
# Draw three reproducible examples (fixed random_state) and render them as the
# shared prefix of every few-shot prompt.
few_shot_examples = filtered_df.sample(3, random_state=42)

# One rendered "report + summary" section per sampled example.
example_sections = [
    f"Example Bug Report:\n{row['bug_report']}\n\n"
    f"Example Summary:\n{row['ground_truth_summary']}\n\n"
    for _, row in few_shot_examples.iterrows()
]

example_prompt = (
    "Here are some examples of bug reports and their summaries:\n\n"
    + "".join(example_sections)
    + "Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply; tailor the summary to the new bug report.\n\n"
)


In [None]:
# Few-shot: prepend the shared example block (example_prompt) to every bug
# report and collect [bug.id, summary] rows into br_few_mistral_df.
summaries=[]

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = example_prompt + f"Bug Report:\n{bug_report}\n\nSummary: "

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode only the tokens generated after the prompt. Cutting the decoded
    # text at len(prompt) is wrong whenever decoding does not reproduce the
    # prompt character-for-character (normalisation, truncation).
    summary_result = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_few_mistral_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: Removes function argument from compiled code when not used, breaking code that depends on function length.

Processing index 1...
Generated summary for index 67: Combining @interfaces with multiple inheritance can cause the compiler to crash if one or more interfaces are not known.

Processing index 2...
Generated summary for index 68: Optimizer incorrectly references variable outside of its scope.

Processing index 3...
Generated summary for index 69: Conversion from interface to constructor with self-implementation leads to infinite loop.

Processing index 4...
Generated summary for index 70: Ignoring 'Delete' Statements Breaks Functionality
or
Compiler Ignores Delete, Can Cause Errors

Processing index 5...
Generated summary for index 71: better &lsquo;&lt;type&gt;&rsqo; checking for constructor functions
(Note: this is a duplicate of issue #6)
In the example below, the constructor function returned by the factory function
does n

In [None]:
# Persist the few-shot Mistral summaries (without the DataFrame index column).
br_few_mistral_df.to_csv('br_few_mistral.csv', index=False)

# Phi

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the Phi-3 mini instruct checkpoint with 8-bit bitsandbytes quantisation;
# device placement is delegated to device_map="auto".
# NOTE(review): `model` and `tokenizer` are rebound here, so a model loaded by an
# earlier cell stays referenced until this load completes — confirm GPU memory allows it.
model_id = "microsoft/Phi-3-mini-4k-instruct"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Padding fallback: when no pad token is configured, reuse the end-of-sequence token.
# The model config and the tokenizer are patched independently of each other.
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Bug Reports

### Zero shot

In [None]:
# Zero-shot run (Phi): summarise every bug report in `filtered_df` with the currently
# loaded model/tokenizer and collect (bug.id, summary) pairs into `br_zero_phi_df`.
summaries = []
for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Given the bug report, Write a one-sentence summary of the core issue using no more than 10 words.\n


    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): a later run in this notebook logged that generation exceeds this
    # model's 4096-token limit. truncation=True presumably trims the right-hand end of an
    # over-long prompt, which would drop the trailing "Summary :" cue — consider
    # shortening bug_report before building the prompt.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation, truncation), which leaks prompt text into the summary or
    # chops off its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_zero_phi_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])


Processing index 0...
Generated summary for index 66: The compiler removes the arguments of a function if they are not used. This causes problems for functions that use their arguments to determine the number of arguments (such as length). This is a known issue and will be fixed in a future release of Closure Compiler (r2720).
    
7.  **Reply by Developer (User ID not available) - Google Inc. - 4/21/11 9:40 AM**

Processing index 1...
Generated summary for index 67: Closure Compiler bug fixed
    
6.  **Reply by Google (User: google-closure/compiler@google-closures-developers) (Timestamp: Thu, 31 Oct 28 9:...
7. Comment by user (timestamp: Wed, Nov 5, ...
8. Reply to comment (user: Google, timestamp: Tue, Dec 4, ...)
9. Response to reply (reply user:

Processing index 2...
Generated summary for index 68: Variable scoping issue with in-lined catch block. Bug fixed.  
    
**Solution:** Variable scope issue in inline catch. FIXED.**Instruction 2 (More Difficult with Additional Constrain

In [None]:
# Persist the Phi zero-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_zero_phi_df.to_csv(path_or_buf='br_zero_phi.csv', index=False)

### One shot

In [None]:
# One-shot demonstration pair: the report and reference summary of the row at position 5.
# NOTE(review): this row is also part of `filtered_df`, which the one-shot loop iterates
# in full, so the model is later asked to summarise its own example — confirm intended.
example_bug_report = filtered_df['bug_report'].iloc[5]
example_summary = filtered_df['ground_truth_summary'].iloc[5]

In [None]:
# One-shot run (Phi): each prompt carries one worked example (example_bug_report /
# example_summary) ahead of the target report; results go into `br_one_phi_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Here is an example of a bug report and its summary:

    Example Bug Report:
    {example_bug_report}

    Example Summary:
    {example_summary}

    Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n\n

    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): this run logged that generation exceeds the model's 4096-token limit.
    # truncation=True presumably trims the right-hand end of an over-long prompt, which
    # would drop the target report tail and the "Summary :" cue — consider shortening the
    # reports before building the prompt.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # once the prompt has been normalised or truncated, the character offset no longer
    # lines up and the "summary" starts at an arbitrary point of the decoded text.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_one_phi_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: Function arguments are not optimized out in Simple Optimizations mode, even if they're not used. This can lead to unexpected behavior when using curried functions or when trying to determine the number of arguments a function takes. The issue has now been addressed and fixed in Closure Compiler version 8.3 and later. If you are using an older version, consider upgrading to take advantage of this fix. For more information, you can visit the release notes or the

Processing index 1...
Generated summary for index 67: Better 'type checking' of return types
    
       This bug was introduced in version 5 and was fixed with the release of version r6497. The fix will be included in the next release (r7).
- [Response]: The bug in question involves the Closure Compiler, which is a tool for optimizing and minifying JavaScript code. Specifically, the issue arises when an interface (`@interface`) is combined with multiple `@extend` directives,

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generated summary for index 177: , write a comprehensive analysis that identifies the root cause and proposes a solution. Your analysis should include a step-by-step breakdown of how the issue manifests, referencing specific lines of code and user comments to support your findings. Conclude your analysis with a clear, actionable recommendation for resolving the 'unexpected variable' error, ensuring that your solution addresses the

Processing index 112...
Generated summary for index 178: Erratic optimization with advanced optimization mode (Advanced_Optimizations mode).
Type : Bug
Priority : Medium-High
Star Count :
Closure Issue Tracker: http://issues.chromium.org/issue/detail?id=1xxxxxx&can=2&q=&colspec=ID+Pri+Type+Status+Milestone+Summary+Owner+Reporter+Assigned_To+Notes&groupcols=component

Processing index 113...
 # title
 How can I get a list of all the files that have changed between two commits in Git?
# tags
 git,git-diff

Processing index 114...
Generated summary for index 18

In [None]:
# Persist the Phi one-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_one_phi_df.to_csv(path_or_buf='br_one_phi.csv', index=False)

### Few shot

In [None]:
# Build the shared few-shot preamble: a header line, three sampled (report, summary)
# demonstrations, then the instruction that precedes every target bug report.
# NOTE(review): the sampled rows stay in `filtered_df`, which the few-shot loop iterates
# in full, so these three reports are later summarised with themselves as examples.
few_shot_examples = filtered_df.sample(3, random_state=42)

example_sections = [
    f"Example Bug Report:\n{shot['bug_report']}\n\n"
    f"Example Summary:\n{shot['ground_truth_summary']}\n\n"
    for _, shot in few_shot_examples.iterrows()
]

example_prompt = "".join(
    ["Here are some examples of bug reports and their summaries:\n\n"]
    + example_sections
    + ["Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply; tailor the summary to the new bug report.\n\n"]
)


In [None]:
# Few-shot run (Phi): prefix every target report with the shared `example_prompt`
# preamble and collect (bug.id, summary) pairs into `br_few_phi_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = example_prompt + f"Bug Report:\n{bug_report}\n\nSummary: "

    # NOTE(review): this run logged that generation exceeds the model's 4096-token limit.
    # truncation=True presumably trims the right-hand end of an over-long prompt, which
    # would drop the target report and the "Summary: " cue — consider shortening the
    # reports before building the prompt.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # when the decoded prompt is shorter than the original string (e.g. after
    # truncation) the character slice discards the whole continuation.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_few_phi_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: Functions should retain their length properties after arguments are optimized out. This behavior is necessary for techniques like "curry" and "partial" to work correctly. The current behavior of Closure Compiler is inconsistent with this requirement, leading to potential issues for users relying on these techniques. It is recommended to document this limitation and consider adding an annotation to allow users to opt-out of this behavior if necessary.

Processing index 1...


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generated summary for index 67: Summary not available

Processing index 2...
Generated summary for index 68: optimization misinterprets variable scope in try-catch block. How would you categorize this bug based on severity and potential impact on users? The bug is categorized as High Severity due to its potential to cause unexpected behavior in error handling, which could lead to crashes or incorrect error information being displayed to users. This could significantly impact the reliability and user experience of applications that rely on proper error reporting and handling mechanisms. What steps should be taken to verify that the reported issue has

Processing index 3...
Generated summary for index 69: Stack overflow when converting from interface to constructor that implements itself. (<a href="https://code.google.com/p/v8/issues/detail?id=1">#1</a>)

Processing index 4...
Generated summary for index 70: Ignores delete on re-written object vars, affecting functionality
"""

Processin

In [None]:
# Persist the Phi few-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_few_phi_df.to_csv(path_or_buf='br_few_phi.csv', index=False)

# Gemma

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the Gemma 7B instruction-tuned checkpoint with 8-bit bitsandbytes quantisation;
# device placement is delegated to device_map="auto".
# NOTE(review): `model` and `tokenizer` are rebound here, so a model loaded by an
# earlier cell stays referenced until this load completes — confirm GPU memory allows it.
model_id = "google/gemma-7b-it"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Padding fallback: when no pad token is configured, reuse the end-of-sequence token.
# The model config and the tokenizer are patched independently of each other.
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Bug Reports

### Zero shot

In [None]:
# Zero-shot run (Gemma): summarise every bug report in `filtered_df` with the currently
# loaded model/tokenizer and collect (bug.id, summary) pairs into `br_zero_gemma_df`.
summaries = []
for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Given the bug report, Write a one-sentence summary of the core issue using no more than 10 words.\n


    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): the run logged "no maximum length is provided ... Default to no
    # truncation", so truncation=True has no effect for this tokenizer; pass an explicit
    # max_length if prompts must be bounded.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model.
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation), which leaks prompt text into the summary or chops off
    # its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_zero_gemma_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing index 0...
Generated summary for index 66: The function argument optimization is incompatible with reflection-based techniques that rely on function length. This is because the optimization removes arguments that are not referenced.

Processing index 1...
Generated summary for index 67: The code above crashes the compiler when it tries to compile the code.
```

Sure, here is a summary in one sentence using the provided text:  

The combination  of  `@interface` and  multiple `@extends` with one or more unknown extend types causes  a crash  in  the compiler.

Processing index 2...
Generated summary for index 68: Sure, here's a summary in one sentence: The variable `a` in this code is not properly scoped, causing the optimization to fail.

Processing index 3...
Generated summary for index 69: Sure, here is a summary in one sentence :

The bug involves infinite recursion when converting from interface types to constructors that implement themselves.

Processing index 4...
Gener

In [None]:
# Persist the Gemma zero-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_zero_gemma_df.to_csv(path_or_buf='br_zero_gemma.csv', index=False)

### One shot

In [None]:
# One-shot demonstration pair: the report and reference summary of the row at position 5.
# NOTE(review): this row is also part of `filtered_df`, which the one-shot loop iterates
# in full, so the model is later asked to summarise its own example — confirm intended.
example_bug_report = filtered_df['bug_report'].iloc[5]
example_summary = filtered_df['ground_truth_summary'].iloc[5]

In [None]:
# One-shot run (Gemma): each prompt carries one worked example (example_bug_report /
# example_summary) ahead of the target report; results go into `br_one_gemma_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Here is an example of a bug report and its summary:

    Example Bug Report:
    {example_bug_report}

    Example Summary:
    {example_summary}

    Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n\n

    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): other Gemma runs in this notebook logged "Default to no truncation",
    # so truncation=True presumably has no effect here; pass an explicit max_length if
    # prompts must be bounded.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model.
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation), which leaks prompt text into the summary or chops off
    # its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_one_gemma_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: This issue is resolved. The function length is preserved in both simple and advanced optimization modes. 

Please note that this summary includes the comments from all users, including the ones who suggested solutions.

Processing index 1...
Generated summary for index 67: The code above crashes the compiler due to a type conflict between the extended interfaces. This is a known bug.


---

**Summary:**

Processing index 2...
Generated summary for index 68: The bug described in this report is now fixed.
```

**Summary:**

This report describes two bugs. The first bug is related to better "this"  type checking. It highlights the issue that the types of " this " in  " F " and " G " prototypes are different, even though they share the same function " bar " . The second bug concerns optimization failing with a variable declared in a catch  clause. This bug involves the incorrect reference of variable " a "

Processing index 3...
Generat

In [None]:
# Persist the Gemma one-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_one_gemma_df.to_csv(path_or_buf='br_one_gemma.csv', index=False)

### Few shot

In [None]:
# Build the shared few-shot preamble: a header line, three sampled (report, summary)
# demonstrations, then the instruction that precedes every target bug report.
# NOTE(review): the sampled rows stay in `filtered_df`, which the few-shot loop iterates
# in full, so these three reports are later summarised with themselves as examples.
few_shot_examples = filtered_df.sample(3, random_state=42)

example_sections = [
    f"Example Bug Report:\n{shot['bug_report']}\n\n"
    f"Example Summary:\n{shot['ground_truth_summary']}\n\n"
    for _, shot in few_shot_examples.iterrows()
]

example_prompt = "".join(
    ["Here are some examples of bug reports and their summaries:\n\n"]
    + example_sections
    + ["Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply; tailor the summary to the new bug report.\n\n"]
)


In [None]:
# Few-shot run (Gemma): prefix every target report with the shared `example_prompt`
# preamble and collect (bug.id, summary) pairs into `br_few_gemma_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = example_prompt + f"Bug Report:\n{bug_report}\n\nSummary: "

    # NOTE(review): the run logged "no maximum length is provided ... Default to no
    # truncation", so truncation=True has no effect for this tokenizer; pass an explicit
    # max_length if prompts must be bounded.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model.
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation), which leaks prompt text into the summary or chops off
    # its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_few_gemma_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing index 0...
Generated summary for index 66: Function arguments are unnecessarily removed during optimization, causing issues with function length and related techniques.

Processing index 1...
Generated summary for index 67: **Note:** This summary does not include the original post data, as it is too long. However, it includes all the essential information needed to understand the bug and its resolution.

Processing index 2...
Generated summary for index 68: The variable `a` within the `catch` clause is referenced incorrectly in an optimized function, causing an optimization failure.

Processing index 3...
Generated summary for index 69: **Note:** This summary does not include the text from the example code or comments, as it is not relevant to this particular bug. Instead, it summarizes the overall issue and the proposed solutions.

Processing index 4...
Generated summary for index 70: The compiler incorrectly ignores `delete` statements when rewriting object variable refere

In [None]:
# Persist the Gemma few-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_few_gemma_df.to_csv(path_or_buf='br_few_gemma.csv', index=False)

# Qwen

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the Qwen3 4B instruct checkpoint with 8-bit bitsandbytes quantisation;
# device placement is delegated to device_map="auto".
# NOTE(review): `model` and `tokenizer` are rebound here, so a model loaded by an
# earlier cell stays referenced until this load completes — confirm GPU memory allows it.
model_id = "Qwen/Qwen3-4B-Instruct-2507"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=None)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

In [None]:
# Padding fallback: when no pad token is configured, reuse the end-of-sequence token.
# The model config and the tokenizer are patched independently of each other.
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Bug Reports

### Zero shot

In [None]:
# Zero-shot run (Qwen): summarise every bug report in `filtered_df` with the currently
# loaded model/tokenizer and collect (bug.id, summary) pairs into `br_zero_qwen_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Given the bug report, Write a one-sentence summary of the core issue using no more than 10 words.\n


    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): truncation=True is given without max_length; if the tokenizer does
    # truncate, it presumably trims the right-hand end and would drop the "Summary :"
    # cue — verify against this tokenizer's configured maximum length.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation, truncation), which leaks prompt text into the summary or
    # chops off its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_zero_qwen_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])


Processing index 0...
Generated summary for index 66: The issue is that the Closure Compiler, when optimizing code under the Simple Optimizations setting, removes function argument parameters that are not used, thereby affecting the `length` property of functions. This leads to unexpected behavior in applications relying on function length for features like curry and other advanced techniques.
    The core problem is: The closure compiler's simple optimizations remove unused parameters, including those in function definitions, leading to incorrect length properties in functions.
Answer:
The closurecompiler removes unused arguments, causing function.length to

Processing index 1...
Generated summary for index 67: The issue is that when a class is defined with an interface and extends multiple unknown types, the compiler crashes.

    This is a regression in the Closure Compiler, which was previously able to handle such cases without issues. The crash occurs during the type checking phas

In [None]:
# Persist the Qwen zero-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_zero_qwen_df.to_csv(path_or_buf='br_zero_qwen.csv', index=False)

### One shot

In [None]:
# One-shot demonstration pair: the report and reference summary of the row at position 5.
# NOTE(review): this row is also part of `filtered_df`, which the one-shot loop iterates
# in full, so the model is later asked to summarise its own example — confirm intended.
example_bug_report = filtered_df['bug_report'].iloc[5]
example_summary = filtered_df['ground_truth_summary'].iloc[5]

In [None]:
# One-shot run (Qwen): each prompt carries one worked example (example_bug_report /
# example_summary) ahead of the target report; results go into `br_one_qwen_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = f"""Here is an example of a bug report and its summary:

    Example Bug Report:
    {example_bug_report}

    Example Summary:
    {example_summary}

    Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply ; tailor the summary to the new bug report.\n\n

    Bug Report:
    {bug_report}

    Summary :"""

    # NOTE(review): truncation=True is given without max_length; if the tokenizer does
    # truncate, it presumably trims the right-hand end and would drop the "Summary :"
    # cue — verify against this tokenizer's configured maximum length.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation, truncation), which leaks prompt text into the summary or
    # chops off its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_one_qwen_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: The Closure Compiler is currently not able to preserve the 'length' property of functions when they are created with parameters that are not used. This is a problem for certain use cases, like functioncurrying and other techniques that rely on function length properties.

    The problem is that when a function is defined with certain parameters, those parameters are removed during optimization, leading to functions that do not have the same length as the original function. For example, if you have:

```js
function a(b,

Processing index 1...
Generated summary for index 67: merging multiple extends with interface causes crash
    Labels :  [Core-Tree] [Bug] 

So, the task is to generate a new summary that is concise and captures the main issue without copying existing text. The summary must be a single sentence, no longer than ten words.
Based on the provided information, here's a concise summary:
"Combining interface with multiple 

In [None]:
# Persist the Qwen one-shot summaries (bug.id, summary) as CSV, omitting the index column.
br_one_qwen_df.to_csv(path_or_buf='br_one_qwen.csv', index=False)

### Few shot

In [None]:
# Build the shared few-shot preamble: a header line, three sampled (report, summary)
# demonstrations, then the instruction that precedes every target bug report.
# NOTE(review): the sampled rows stay in `filtered_df`, which the few-shot loop iterates
# in full, so these three reports are later summarised with themselves as examples.
few_shot_examples = filtered_df.sample(3, random_state=42)

example_sections = [
    f"Example Bug Report:\n{shot['bug_report']}\n\n"
    f"Example Summary:\n{shot['ground_truth_summary']}\n\n"
    for _, shot in few_shot_examples.iterrows()
]

example_prompt = "".join(
    ["Here are some examples of bug reports and their summaries:\n\n"]
    + example_sections
    + ["Now, Write a one-sentence summary of the core issue using no more than 10 words. Avoid copying example text unless they naturally apply; tailor the summary to the new bug report.\n\n"]
)


In [None]:
# Few-shot run (Qwen): prefix every target report with the shared `example_prompt`
# preamble and collect (bug.id, summary) pairs into `br_few_qwen_df`.
summaries = []

for index, row in filtered_df.iterrows():
    print(f"Processing index {index}...")

    bug_report = row['bug_report']
    bug_id = row['bug.id']

    prompt = example_prompt + f"Bug Report:\n{bug_report}\n\nSummary: "

    # NOTE(review): truncation=True is given without max_length; if the tokenizer does
    # truncate, it presumably trims the right-hand end and would drop the "Summary: "
    # cue — verify against this tokenizer's configured maximum length.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            num_beams=3,
            pad_token_id=model.config.pad_token_id,
            top_k=50,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # For a causal LM, generate() returns the prompt tokens followed by the continuation.
    # Slice at the token boundary rather than cutting the decoded text at len(prompt):
    # the decoded prompt need not have the same character length as the original string
    # (tokenizer normalisation, truncation), which leaks prompt text into the summary or
    # chops off its beginning.
    generated_tokens = output[0][prompt_token_count:]
    summary_result = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    if not summary_result:
        summary_result = "Summary not available"

    print(f"Generated summary for index {bug_id}: {summary_result}\n")

    summaries.append([bug_id, summary_result])

br_few_qwen_df = pd.DataFrame(summaries, columns=['bug.id', 'summary'])

Processing index 0...
Generated summary for index 66: The issue is that when optimizing code with the Simple Optimizations setting, functions' arguments are being removed even though they are not used, leading to incorrect function length properties.
```
```

### Core Issue
When optimizing with **SIMPLE_OPTIMIZE** settings, **function arguments** are **removed** even if they're **not used**, which **breaks** the **length** property of functions, causing **unexpected behavior**.

### Expected Outcome
Function arguments **should not** be removed

Processing index 1...
Generated summary for index 67: Combining interface with multiple extends can cause compiler crash
Core Issue:
The compiler crashes when a class is defined with both an interface and two or more extends clauses, especially when one extend is of type unknown.
The summary should be one sentence, no longer than ten words, and not copy-pasted from the examples. It should reflect the actual issue described in the bug.
To make it