### Meta-Llama-3-8B-Instruct

In [1]:
import transformers
import pandas as pd
import os, sys
import time
import logging
import json
from tqdm import tqdm
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import torch
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the text-generation pipeline once, with bfloat16 weights.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

prompt_template = "Identify keywords from the summary and description of the bug report that can be used to detect duplicates.\n\nOutput format:\nSummary: [Selected Keywords]\nDescription: [Selected Keywords]\n\nSummary: {}\nDescription: {}\n\n"
project = 'spark'

df = pd.read_csv('../data/raw/test_{}.csv'.format(project))
flag_content_df = pd.read_csv(f'../data/ablation/test_{project}_flag_content.csv')

llama3_folder = '../data/keywords/{}/llama3/run_{}'

# Tunables that used to be magic numbers scattered through the loop.
N_RUNS = 5                         # output folders run_1 .. run_5
MAX_NEW_TOKENS = 2048              # generation budget per bug report
FALLBACK_DESCRIPTION_CHARS = 2000  # description length used for the retry

# Generation stops on either the tokenizer's EOS token or Llama-3's
# end-of-turn token. The list never changes, so it is built once.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# bug_id -> run_flag, built once instead of scanning flag_content_df with a
# boolean mask for every row of every run. keep='first' mirrors the original
# `.values[0]` choice when a bug_id appears more than once.
# A bug_id missing from the flag file still fails loudly (KeyError here; the
# original raised IndexError).
run_flag_by_bug = (
    flag_content_df
    .drop_duplicates(subset='bug_id', keep='first')
    .set_index('bug_id')['run_flag']
    .to_dict()
)


def generate_llama3_response(text_generator, user_content, stop_token_ids,
                             max_new_tokens=2048):
    """Run one greedy chat completion and return only the generated text.

    Args:
        text_generator: the transformers text-generation pipeline to call; its
            ``tokenizer`` is used to apply the chat template.
        user_content: text sent as the single "user" message.
        stop_token_ids: token ids passed to the pipeline as ``eos_token_id``.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The pipeline's ``generated_text`` with the leading prompt removed.

    Raises:
        Whatever the tokenizer or pipeline raises (the captured logs show CUDA
        out-of-memory and tensor-size-mismatch errors on very long reports).
    """
    messages = [{"role": "user", "content": user_content}]
    prompt = text_generator.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    outputs = text_generator(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=stop_token_ids,
        do_sample=False,
        top_p=1,
    )
    # generated_text starts with the prompt; slice it off.
    return outputs[0]["generated_text"][len(prompt):]


# NOTE(review): decoding is greedy (do_sample=False), so the N_RUNS passes are
# presumably (near-)identical -- confirm that repeating them is intended.
for run in range(1, N_RUNS + 1):
    run_folder = llama3_folder.format(project, run)
    os.makedirs(run_folder, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']

        # Rows flagged 0 are excluded from generation.
        if run_flag_by_bug[bug_id] == 0:
            continue

        # Resume support: an existing output file means this bug is done.
        output_path = os.path.join(run_folder, f'{bug_id}.txt')
        if os.path.exists(output_path):
            continue

        full_content = prompt_template.format(row['short_desc'], row['description'])

        response = None
        try:
            response = generate_llama3_response(
                pipeline, full_content, terminators, MAX_NEW_TOKENS
            )
        except Exception as e:
            print(e)

        if response is None:
            # Retry OUTSIDE the except block: while the handler is active the
            # exception's traceback keeps the failed call's tensors alive, so
            # a retry in there starts with less free GPU memory.
            torch.cuda.empty_cache()
            # str() keeps the slice safe when the CSV cell is empty (NaN
            # float); for ordinary strings it is a no-op.
            short_content = prompt_template.format(
                row['short_desc'],
                str(row['description'])[:FALLBACK_DESCRIPTION_CHARS],
            )
            try:
                response = generate_llama3_response(
                    pipeline, short_content, terminators, MAX_NEW_TOKENS
                )
            except Exception as e:
                # Do not abort a multi-hour run for one report. No file is
                # written, so a later re-run will attempt this bug again.
                print(f'bug {bug_id} (run {run}) skipped after truncated retry failed: {e}')
                torch.cuda.empty_cache()
                continue

        # NOTE: the full prompt is recorded even when the truncated one was
        # actually sent (unchanged from the original file format).
        # Explicit UTF-8 so non-ASCII report text does not depend on the locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(full_content)
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(response)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 2841/2841 [00:00<00:00, 3936.35it/s]
100%|██████████| 2841/2841 [00:00<00:00, 4003.29it/s]
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  9%|▉         | 266/2841 [01:08<11:07,  3.86it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  9%|▉         | 267/2841 [01:10<11:19,  3.79it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  9%|▉         | 269/2841 [01:12<11:53,  3.60it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 10%|▉         | 272/2841 [01:13<12:05,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 10%|▉         | 273/2841 [01:15<12:55,  3.31it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 10%|▉         | 274/2841 [01:17<14:25,  2.97it/s]Setting `pad_token_id` to `eos_token

The size of tensor a (16384) must match the size of tensor b (16882) at non-singleton dimension 3


 51%|█████     | 1445/2841 [21:51<2:25:13,  6.24s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1448/2841 [22:15<2:42:35,  7.00s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1449/2841 [22:33<3:33:45,  9.21s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1450/2841 [22:50<4:09:41, 10.77s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1451/2841 [23:06<4:36:48, 11.95s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1454/2841 [23:24<3:27:27,  8.97s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1455/2841 [23:51<4:46:07, 12.39s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1456/2841 [24:16<5:51:37, 15.23s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

CUDA out of memory. Tried to allocate 9.90 GiB (GPU 0; 47.51 GiB total capacity; 30.98 GiB already allocated; 5.49 GiB free; 41.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


 56%|█████▌    | 1577/2841 [48:11<2:48:17,  7.99s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


The size of tensor a (32768) must match the size of tensor b (33145) at non-singleton dimension 3


 56%|█████▌    | 1578/2841 [49:36<6:22:16, 18.16s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1580/2841 [50:30<7:14:04, 20.65s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1581/2841 [51:23<9:03:37, 25.89s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1582/2841 [52:37<12:15:42, 35.06s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1583/2841 [53:46<14:46:46, 42.29s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1585/2841 [54:57<13:54:02, 39.84s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1586/2841 [56:20<17:02:54, 48.90s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 56%|█████▌    | 1591/2841 [57:19<9:12:28, 26.52s/it] Setting `pad_token_id` to `eos_token_id`:128009 for open-end

CUDA out of memory. Tried to allocate 10.98 GiB (GPU 0; 47.51 GiB total capacity; 34.14 GiB already allocated; 5.07 GiB free; 41.92 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


 62%|██████▏   | 1755/2841 [2:53:56<13:32:23, 44.88s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1756/2841 [2:54:55<14:17:26, 47.42s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1760/2841 [2:56:26<10:30:31, 35.00s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1761/2841 [2:57:27<11:41:00, 38.94s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1763/2841 [2:58:50<11:52:57, 39.68s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1765/2841 [2:59:53<11:07:47, 37.24s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1766/2841 [3:01:36<14:36:00, 48.89s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 62%|██████▏   | 1770/2841 [3:02:41<9:39:59, 32.49s/it] Setting `pad_token_id` to `eos_token_id`:

CUDA out of memory. Tried to allocate 13.19 GiB (GPU 0; 47.51 GiB total capacity; 37.52 GiB already allocated; 2.89 GiB free; 44.11 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


 73%|███████▎  | 2072/2841 [7:10:21<3:04:41, 14.41s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 73%|███████▎  | 2073/2841 [7:11:23<3:54:00, 18.28s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 73%|███████▎  | 2082/2841 [7:11:56<2:17:23, 10.86s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 73%|███████▎  | 2083/2841 [7:13:04<3:11:16, 15.14s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 73%|███████▎  | 2084/2841 [7:15:06<5:20:13, 25.38s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 73%|███████▎  | 2087/2841 [7:16:16<5:11:06, 24.76s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 74%|███████▎  | 2092/2841 [7:17:39<4:27:18, 21.41s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 74%|███████▎  | 2093/2841 [7:19:11<5:59:23, 28.83s/it]Setting `pad_token_id` to `eos_token_id`:128009 f

CUDA out of memory. Tried to allocate 139.25 GiB (GPU 0; 47.51 GiB total capacity; 25.14 GiB already allocated; 21.16 GiB free; 25.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


  3%|▎         | 73/2841 [39:24<79:06:22, 102.88s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 75/2841 [44:04<88:40:58, 115.42s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 76/2841 [48:34<111:46:16, 145.52s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 77/2841 [52:34<127:31:11, 166.09s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 79/2841 [56:26<112:44:51, 146.96s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 80/2841 [1:01:07<134:49:50, 175.80s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 88/2841 [1:04:52<54:10:16, 70.84s/it]  Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 89/2841 [1:08:21<66:01:22, 86.37s/it]Setting `pad_token_id` to `eos_token_id`:128009 for ope

### Phi-3-mini-128k-instruct

In [3]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import torch
torch.random.manual_seed(42)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
prompt_template = "Identify keywords from the summary and description of the bug report that can be used to detect duplicates.\n\nOutput format:\nSummary: [Selected Keywords]\nDescription: [Selected Keywords]\n\nSummary: {}\nDescription: {}\n\n"
project = 'spark'

df = pd.read_csv('../data/raw/test_{}.csv'.format(project))
flag_content_df = pd.read_csv(f'../data/ablation/test_{project}_flag_content.csv')

phi_folder = '../data/keywords/{}/phi/run_{}'

# Tunables that used to be magic numbers scattered through the loop.
N_RUNS = 5                         # output folders run_1 .. run_5
FALLBACK_DESCRIPTION_CHARS = 2000  # description length used for the retry

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# The pipeline wrapper and the generation settings do not depend on the bug
# report, so they are created once rather than on every loop iteration (and a
# second time inside the error handler).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 2048,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
    "top_p": 1,
}

# bug_id -> run_flag, built once instead of scanning flag_content_df with a
# boolean mask for every row of every run. keep='first' mirrors the original
# `.values[0]` choice when a bug_id appears more than once.
# A bug_id missing from the flag file still fails loudly (KeyError here; the
# original raised IndexError).
run_flag_by_bug = (
    flag_content_df
    .drop_duplicates(subset='bug_id', keep='first')
    .set_index('bug_id')['run_flag']
    .to_dict()
)


def generate_phi3_response(text_generator, user_content, generation_kwargs):
    """Send one user message through the pipeline and return its text output.

    Args:
        text_generator: the transformers text-generation pipeline to call.
        user_content: text sent as the single "user" chat message.
        generation_kwargs: keyword arguments forwarded to the pipeline call.

    Returns:
        ``generated_text`` of the first result returned by the pipeline.

    Raises:
        Whatever the pipeline raises (the captured logs show CUDA
        out-of-memory errors on very long reports).
    """
    messages = [{"role": "user", "content": user_content}]
    output = text_generator(messages, **generation_kwargs)
    return output[0]['generated_text']


# NOTE(review): decoding is greedy (do_sample=False), so the N_RUNS passes are
# presumably (near-)identical -- confirm that repeating them is intended.
for run in range(1, N_RUNS + 1):
    run_folder = phi_folder.format(project, run)
    os.makedirs(run_folder, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']

        # Rows flagged 0 are excluded from generation.
        if run_flag_by_bug[bug_id] == 0:
            continue

        # Resume support: an existing output file means this bug is done.
        output_path = os.path.join(run_folder, f'{bug_id}.txt')
        if os.path.exists(output_path):
            continue

        full_content = prompt_template.format(row['short_desc'], row['description'])

        response = None
        try:
            response = generate_phi3_response(pipe, full_content, generation_args)
        except Exception as e:
            print(e)

        if response is None:
            # Retry OUTSIDE the except block: while the handler is active the
            # exception's traceback keeps the failed call's tensors alive, so
            # a retry in there starts with less free GPU memory.
            torch.cuda.empty_cache()
            # str() keeps the slice safe when the CSV cell is empty (NaN
            # float); for ordinary strings it is a no-op.
            short_content = prompt_template.format(
                row['short_desc'],
                str(row['description'])[:FALLBACK_DESCRIPTION_CHARS],
            )
            try:
                response = generate_phi3_response(pipe, short_content, generation_args)
            except Exception as e:
                # Do not abort a multi-hour run for one report. No file is
                # written, so a later re-run will attempt this bug again.
                print(f'bug {bug_id} (run {run}) skipped after truncated retry failed: {e}')
                torch.cuda.empty_cache()
                continue

        # NOTE: the full prompt is recorded even when the truncated one was
        # actually sent (unchanged from the original file format).
        # Explicit UTF-8 so non-ASCII report text does not depend on the locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(full_content)
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(response)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 12%|█▏        | 343/2841 [08:05<4:33:51,  6.58s/it] 

CUDA out of memory. Tried to allocate 11.36 GiB. GPU 


 12%|█▏        | 345/2841 [08:15<4:13:31,  6.09s/it]

CUDA out of memory. Tried to allocate 11.36 GiB. GPU 


 12%|█▏        | 346/2841 [08:22<4:19:57,  6.25s/it]

CUDA out of memory. Tried to allocate 11.36 GiB. GPU 


 26%|██▋       | 751/2841 [31:49<5:58:47, 10.30s/it] 

CUDA out of memory. Tried to allocate 8.27 GiB. GPU 


 27%|██▋       | 753/2841 [31:56<4:20:43,  7.49s/it]

CUDA out of memory. Tried to allocate 8.28 GiB. GPU 


 51%|█████     | 1444/2841 [1:13:57<2:54:27,  7.49s/it] 

CUDA out of memory. Tried to allocate 32.83 GiB. GPU 


 55%|█████▌    | 1572/2841 [1:26:11<5:44:07, 16.27s/it] 

CUDA out of memory. Tried to allocate 18.48 GiB. GPU 


 56%|█████▌    | 1577/2841 [1:26:14<2:45:50,  7.87s/it]

CUDA out of memory. Tried to allocate 117.04 GiB. GPU 


 62%|██████▏   | 1754/2841 [1:36:02<20:01,  1.11s/it]  

CUDA out of memory. Tried to allocate 20.08 GiB. GPU 


 73%|███████▎  | 2068/2841 [1:56:02<07:37,  1.69it/s]  

CUDA out of memory. Tried to allocate 22.18 GiB. GPU 


 79%|███████▉  | 2255/2841 [2:06:56<32:59,  3.38s/it]  


KeyboardInterrupt: 