### Meta-Llama-3-8B-Instruct

In [None]:
import transformers
import pandas as pd
import os, sys
import time
import logging
import json
from tqdm import tqdm
import os
# Expose only GPU index 2 to this process; done ahead of the torch import below.
# Optional, left disabled: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch

# --- Experiment configuration ------------------------------------------------
project = 'spark'
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Output directory pattern; filled in later with (project, run).
llama3_folder = '../data/keywords/{}/llama3/run_{}'

# Instruction sent to the model. The two trailing "{}" slots receive the bug
# report's summary and description, in that order.
prompt_template = (
    "Identify keywords from the summary and description of the bug report "
    "that can be used to detect duplicates.\n\n"
    "Output format:\n"
    "Summary: [Selected Keywords]\n"
    "Description: [Selected Keywords]\n\n"
    "Summary: {}\n"
    "Description: {}\n\n"
)

# --- Model -------------------------------------------------------------------
# Text-generation pipeline for `model_id`; weights are requested in bfloat16
# and device placement is delegated to device_map="auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device_map="auto",
)

# --- Data --------------------------------------------------------------------
# Bug reports to process, plus the per-bug table whose 'run_flag' column is
# consulted later to decide which reports are skipped.
df = pd.read_csv(f'../data/raw/test_{project}.csv')
flag_content_df = pd.read_csv('../data/ablation/test_{}_flag_content.csv'.format(project))

# Keyword extraction with Llama 3.
#
# For each of five runs, every bug report in `df` whose run_flag is non-zero is
# sent to the model, and the prompt plus the model's reply are written to
# <llama3_folder>/<bug_id>.txt. Reports that already have an output file are
# skipped, so an interrupted job can simply be started again.

# Generation stops at the tokenizer's EOS token or at "<|eot_id|>". The list
# does not depend on the bug report, so it is built once instead of per row.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# bug_id -> run_flag, built once so the per-row check is a dict lookup rather
# than a scan of the whole flag table. When a bug_id is repeated the first row
# wins; a bug_id that is absent from the table raises KeyError.
run_flag_by_bug = (
    flag_content_df
    .drop_duplicates('bug_id')
    .set_index('bug_id')['run_flag']
    .to_dict()
)


def _llama3_keywords(summary, description):
    """Return the model's reply to the keyword prompt for one bug report.

    The filled-in prompt template is wrapped as a single user turn, rendered
    through the tokenizer's chat template, and passed to the pipeline with
    sampling disabled and a budget of 2048 new tokens.

    Args:
        summary: Value inserted into the template's "Summary" slot.
        description: Value inserted into the template's "Description" slot.

    Returns:
        The generated text with the echoed prompt removed.
    """
    messages = [
        {
            "role": "user",
            "content": prompt_template.format(summary, description),
        },
    ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=False,
        top_p=1,
    )
    # The pipeline output starts with the prompt itself; keep only what follows.
    return outputs[0]["generated_text"][len(prompt):]


for run in range(1, 6):
    run_dir = llama3_folder.format(project, run)
    os.makedirs(run_dir, exist_ok=True)

    for _idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']

        # Reports flagged 0 are excluded from this experiment.
        if run_flag_by_bug[bug_id] == 0:
            continue

        # Resume support: never regenerate an existing result.
        out_path = os.path.join(run_dir, f'{bug_id}.txt')
        if os.path.exists(out_path):
            continue

        summary = row['short_desc']
        description = row['description']

        try:
            response = _llama3_keywords(summary, description)
        except Exception as e:
            # Best-effort retry with the description cut to 2000 characters.
            # str() keeps the slice valid when the CSV cell was empty and
            # pandas delivered a float NaN instead of text. A failure of the
            # retry itself is allowed to propagate.
            print(e)
            response = _llama3_keywords(summary, str(description)[:2000])

        # The saved prompt always uses the full description, even if the model
        # only saw the truncated one. UTF-8 is forced so non-ASCII report text
        # does not depend on the machine's default encoding.
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(prompt_template.format(summary, description))
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(response)

### Phi-3-mini-128k-instruct

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

import os
# Expose only GPU index 2 to this process; done ahead of the torch import below.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
torch.random.manual_seed(42)  # fixed seed for torch's random number generator
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# --- Experiment configuration ------------------------------------------------
project = 'spark'

# Output directory pattern; filled in later with (project, run).
phi_folder = '../data/keywords/{}/phi/run_{}'

# Instruction sent to the model. The two trailing "{}" slots receive the bug
# report's summary and description, in that order.
prompt_template = (
    "Identify keywords from the summary and description of the bug report "
    "that can be used to detect duplicates.\n\n"
    "Output format:\n"
    "Summary: [Selected Keywords]\n"
    "Description: [Selected Keywords]\n\n"
    "Summary: {}\n"
    "Description: {}\n\n"
)

# --- Data --------------------------------------------------------------------
# Bug reports to process, plus the per-bug table whose 'run_flag' column is
# consulted later to decide which reports are skipped.
df = pd.read_csv(f'../data/raw/test_{project}.csv')
flag_content_df = pd.read_csv('../data/ablation/test_{}_flag_content.csv'.format(project))

# --- Model -------------------------------------------------------------------
# Model and tokenizer come from the same checkpoint; the model is placed on
# the CUDA device and loaded with remote code enabled.
phi_checkpoint = "microsoft/Phi-3-mini-128k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    phi_checkpoint,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(phi_checkpoint)


# Keyword extraction with Phi-3.
#
# For each of five runs, every bug report in `df` whose run_flag is non-zero is
# sent to the model, and the prompt plus the model's reply are written to
# <phi_folder>/<bug_id>.txt. Reports that already have an output file are
# skipped, so an interrupted job can simply be started again.

# The pipeline wrapper and the generation settings do not depend on the bug
# report, so they are created once instead of for every row and every retry.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 2048,
    "return_full_text": False,  # only the reply is returned, not the prompt
    "temperature": 0.0,
    "do_sample": False,
    "top_p": 1,
}

# bug_id -> run_flag, built once so the per-row check is a dict lookup rather
# than a scan of the whole flag table. When a bug_id is repeated the first row
# wins; a bug_id that is absent from the table raises KeyError.
run_flag_by_bug = (
    flag_content_df
    .drop_duplicates('bug_id')
    .set_index('bug_id')['run_flag']
    .to_dict()
)


def _phi_keywords(summary, description):
    """Return the model's reply to the keyword prompt for one bug report.

    The filled-in prompt template is sent to the pipeline as a single user
    turn, using the shared `generation_args`.

    Args:
        summary: Value inserted into the template's "Summary" slot.
        description: Value inserted into the template's "Description" slot.

    Returns:
        The 'generated_text' field of the first pipeline result.
    """
    messages = [
        {
            "role": "user",
            "content": prompt_template.format(summary, description),
        },
    ]
    output = pipe(messages, **generation_args)
    return output[0]['generated_text']


for run in range(1, 6):
    run_dir = phi_folder.format(project, run)
    os.makedirs(run_dir, exist_ok=True)

    for _idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']

        # Reports flagged 0 are excluded from this experiment.
        if run_flag_by_bug[bug_id] == 0:
            continue

        # Resume support: never regenerate an existing result.
        out_path = os.path.join(run_dir, f'{bug_id}.txt')
        if os.path.exists(out_path):
            continue

        summary = row['short_desc']
        description = row['description']

        try:
            response = _phi_keywords(summary, description)
        except Exception as e:
            # Best-effort retry with the description cut to 2000 characters.
            # str() keeps the slice valid when the CSV cell was empty and
            # pandas delivered a float NaN instead of text. A failure of the
            # retry itself is allowed to propagate.
            print(e)
            response = _phi_keywords(summary, str(description)[:2000])

        # The saved prompt always uses the full description, even if the model
        # only saw the truncated one. UTF-8 is forced so non-ASCII report text
        # does not depend on the machine's default encoding.
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(prompt_template.format(summary, description))
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(response)