In [4]:
%%capture
!pip install transformers bitsandbytes datasets sentencepiece accelerate peft flash-attn wandb openai pqdm

In [5]:
!pip install -U typing_extensions
!pip install trl==0.9.6
!pip install dotenv

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Collecting trl==0.9.6
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl==0.9.6)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting docstring-parser>=0.15 (from tyro>=0.5.11->trl==0.9.6)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting rich>=11.1.0 (from tyro>=0.5.11->trl==0.9.6)
  Downloading rich-14.0.0-py3-none-any.whl.metadata (18 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.9.6)
  Downloading shtab-1.7.2-py3-none-any.whl.metadata (7.4 kB)
Collecting typeguard>=4.0.0 (from tyro>=0.5.11->trl==0.9.6)
  Downloading typeguard-4.4.2-py3-none-any.whl.metadata (3.8 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=11.1.0->tyro

In [1]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

import warnings
warnings.filterwarnings("ignore")

import trl
import torch
import datasets
import transformers

import pandas as pd
from random import randint
from datasets import Dataset, load_dataset, DatasetDict

from trl import SFTTrainer, setup_chat_format
from peft import LoraConfig, AutoPeftModelForCausalLM

import wandb
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline)

from huggingface_hub import login

import os
import json
from openai import OpenAI

In [2]:
print(f"PyTorch version       : {torch.__version__}")
print(f"Transformers version  : {transformers.__version__}")
print(f"TRL version           : {trl.__version__}")
print(f"CUDA available        : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version      : {torch.version.cuda}")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

login(
  token=HUGGINGFACE_TOKEN,
  add_to_git_credential=True
)

Token has not been saved to git credential helper.


PyTorch version       : 2.4.1+cu124
Transformers version  : 4.51.3
TRL version           : 0.9.6
CUDA available        : True
CUDA version      : 12.4
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [3]:
dataset = datasets.load_dataset("rlawltjd/korean-nl2bash")
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 8089
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("allganize/Llama-3-Alpha-Ko-8B-Instruct")
tokenizer.padding_side = 'right'

def get_chat_format(element):
    system_prompt = "You are a helpful programmer assistant that excels at changing Korean text to Bash."
    user_prompt = "Task: {instruction}"
    
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
            {"role": "assistant", "content": element["output"] + tokenizer.eos_token},
        ]
    }



# 데이터를 일괄적으로 대화 형식으로 변경하는 코드
dataset["train"] = dataset["train"].map(
    get_chat_format,
    remove_columns=["instruction", "output"],
    batched=False
)

split_dataset = dataset["train"].train_test_split(test_size=0.05)

dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})

dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

print(dataset["train"][345]["messages"])


Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

[{'content': 'You are a helpful programmer assistant that excels at changing Korean text to Bash.', 'role': 'system'}, {'content': 'Task: 모든 *.java 파일에서 StringBuffer 찾기', 'role': 'user'}, {'content': 'find . -type f -name "*.java" -exec grep -l StringBuffer {} \\;<|end_of_text|>', 'role': 'assistant'}]


In [5]:
dataset["train"], dataset["test"]

(Dataset({
     features: ['messages'],
     num_rows: 7684
 }),
 Dataset({
     features: ['messages'],
     num_rows: 405
 }))

In [6]:
dataset["train"][0]

{'messages': [{'content': 'You are a helpful programmer assistant that excels at changing Korean text to Bash.',
   'role': 'system'},
  {'content': 'Task: "txt"로 끝나지 않는 현재 디렉토리 트리의 모든 일반 파일을 제거합니다.',
   'role': 'user'},
  {'content': "find . -type f -not -name '*txt' -print0 | xargs -0 rm --<|end_of_text|>",
   'role': 'assistant'}]}

In [7]:
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Quantization config 세팅 -> 모델이 사용하는 vram을 최소화
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16 
)

model_id = "allganize/Llama-3-Alpha-Ko-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,                                     
    device_map="auto",                            
    attn_implementation="flash_attention_2",        
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config                  
)

tokenizer = AutoTokenizer.from_pretrained(model_id)   
tokenizer.padding_side = 'right'                      

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
peft_config = LoraConfig(
        lora_alpha=128,                            
        lora_dropout=0.05,                         
        r=256,                                     # Lora의 저차원 공간의 랭크를 지정. 랭크가 높을수록 모델의 표현력과과 계산 비용도 증가.
        bias="none",                               # Lora 적용 시 바이어스를 사용할지 여부를 설정. 
        target_modules=["q_proj", "o_proj",        # Lora를 적용할 모델의 모듈 리스트
                        "k_proj", "v_proj",
                        "up_proj", "down_proj",
                        "gate_proj",
                        ],
        task_type="CAUSAL_LM",                    
)


args = TrainingArguments(
    output_dir="code-llama-7b-text-to-bash-v3", 
    num_train_epochs=5,                   
    # max_steps=100,                          
    per_device_train_batch_size=1,         
    gradient_accumulation_steps=2,          
    gradient_checkpointing=True,            
    optim="adamw_torch_fused",              # 메모리 효율화할 수 있는 fused AdamW 옵티마이저 사용.
    logging_steps=10,                       
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-4,                     # 학습률 2e-4로 설정 (QLoRA 논문 기반).
    bf16=True,                              
    tf32=True,
    max_grad_norm=0.3,                      
    warmup_ratio=0.03,                      # 워밍업 비율 0.03으로 설정 (QLoRA 논문 기반).
    lr_scheduler_type="cosine",           
    push_to_hub=True,                       
    report_to="wandb",                      
)


max_seq_length = 4096

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,                     
    dataset_kwargs={
        "add_special_tokens": False,  
        "append_concat_token": False, 
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msoka27[0m ([33msoka27-hufs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
10,1.1981
20,0.8192
30,0.7532
40,0.7121
50,0.6914
60,0.6893
70,0.6568
80,0.5621
90,0.5511
100,0.5309


TrainOutput(global_step=340, training_loss=0.4238887878025279, metrics={'train_runtime': 1167.0027, 'train_samples_per_second': 0.583, 'train_steps_per_second': 0.291, 'total_flos': 1.3663491814588416e+17, 'train_loss': 0.4238887878025279, 'epoch': 5.0})

In [11]:
model.push_to_hub("rlawltjd/code-llama3-7B-text-to-bash-v3")
tokenizer.push_to_hub("rlawltjd/code-llama3-7B-text-to-bash-v3")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.39G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rlawltjd/code-llama3-7B-text-to-bash-v3/commit/b05b3e5da175eb06daf42ac5436387b7e55d91e8', commit_message='Upload tokenizer', commit_description='', oid='b05b3e5da175eb06daf42ac5436387b7e55d91e8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rlawltjd/code-llama3-7B-text-to-bash-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='rlawltjd/code-llama3-7B-text-to-bash-v3'), pr_revision=None, pr_num=None)

In [12]:
del model
del trainer
torch.cuda.empty_cache()

In [14]:
peft_model_id = "./code-llama-7b-text-to-bash-v3"

model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo

In [15]:
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

prompt = pipe.tokenizer.apply_chat_template(
    eval_dataset[rand_idx]["messages"][:2], 
    tokenize=False, 
    add_generation_prompt=True
    )

outputs = pipe(prompt, 
               max_new_tokens=256, 
               do_sample=False, 
               temperature=0.1, 
               top_k=50, 
               top_p=0.1, 
               eos_token_id=pipe.tokenizer.eos_token_id, 
               pad_token_id=pipe.tokenizer.pad_token_id
               )

print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}".replace("<|im_end|>", ""))
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
eval_dataset[rand_idx]['messages'][2]['content'].replace("<|end_of_text|>", "") == outputs[0]['generated_text'][len(prompt):].strip()

Generating train split: 0 examples [00:00, ? examples/s]

Query:
Task: 'in.txt' 파일에서 정규 표현식 "+\S\+"를 검색하고, 쉼표(',')로 새 줄을 대체하여 일치하는 항목을 출력합니다.
Original Answer:
grep -o "+\S\+" in.txt | tr '\n' ','<|end_of_text|>
Generated Answer:
cat in.txt | grep -o ",+\S\+"


False

In [32]:
question = '"aa.txt"를 삭제하시오.'

messages = [
    {"role": "system", "content": "너는 bash 명령어 생성 전문가야."},
    {"role": "user", "content": question}
]

prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# 생성
outputs = pipe(
    prompt,
    max_new_tokens=64,
    do_sample=False,
    temperature=0.1,
    top_k=50,
    top_p=0.1,
    eos_token_id=pipe.tokenizer.eos_token_id,
    pad_token_id=pipe.tokenizer.pad_token_id
)

# 결과 출력
generated_code = outputs[0]['generated_text'][len(prompt):].strip()
print("Generated Bash Command:\n", generated_code)


Generated Bash Command:
 rm aa.txt


In [16]:
from tqdm import tqdm

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(
        sample["messages"][:2],
        tokenize=False,
        add_generation_prompt=True)
    outputs = pipe(prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    return (sample["messages"][1]["content"], predicted_answer, sample["messages"][2]["content"])

success_rate = []
number_of_eval_samples = 405

sampled_eval_dataset = eval_dataset.shuffle(seed=42).select(range(405))
for test_data in tqdm(sampled_eval_dataset):
    success_rate.append(evaluate(test_data))

  2%|▏         | 9/405 [00:11<09:02,  1.37s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 405/405 [06:04<00:00,  1.11it/s]


In [17]:
with open("./success_rate-v3.txt", "w") as f:
    for result in success_rate:
        f.write(str(result) + "\n")

In [18]:
generated_result = [temp[1] == temp[2].replace("<|end_of_text|>", "") for temp in success_rate]

In [19]:
accuracy = sum(generated_result)/len(generated_result)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 12.59%


In [20]:
success_rate = []
with open("success_rate-v3.txt", "r") as f:
    for line in f:
        success_rate.append(eval(line))

In [21]:
openai_evaluation = [(temp[0], temp[1], temp[2].replace("<|end_of_text|>", "")) for temp in success_rate]

In [22]:
openai_evaluation[1]

("Task: '/var/www' 디렉토리 트리 아래의 모든 디렉토리를 찾되, '/var/www/web-release-data'와 '/var/www/web-development-data' 디렉토리와 그들의 하위 디렉토리는 제외해라.",
 'find /var/www -type d \\(! -wholename "/var/www/web-release-data/*" -a! -wholename "/var/www/web-development-data/*" \\)',
 'find /var/www -type d \\( ! -wholename "/var/www/web-release-data/*"  ! -wholename "/var/www/web-development-data/*" \\)')

In [23]:
client = OpenAI()

def one_compare_bash_semantics(problem_description, generated_query, ground_truth_query):
    # ChatGPT에게 물어볼 프롬프트 작성
    prompt = f"""다음 문제와 두 Bash 코드가 의미적으로 동일한 결과를 반환하는지 판단해주세요:

    문제 설명: {problem_description}

    생성된 코드:
    {generated_query}

    정답 코드:
    {ground_truth_query}

    두 코드가 문제에 대해 의미적으로 동일한 결과를 반환한다면 "Yes"라고 대답하고,
    그렇지 않다면 "No"라고 대답한 후 차이점을 설명해주세요.
    코드의 구조나 사용된 함수가 다르더라도 결과가 같다면 의미적으로 동일하다고 판단해주세요."""

    # ChatGPT API 호출
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # 또는 사용 가능한 최신 모델
        messages=[
            {"role": "system", "content": "You are a helpful assistant that compares the semantic meaning of Bash codes in the context of a given problem."},
            {"role": "user", "content": prompt}
        ]
    )

    # ChatGPT의 응답 추출
    answer = response.choices[0].message.content.strip()

    # 결과 처리
    is_correct = 1 if answer.lower().startswith("yes") else 0
    explanation = answer[3:] if is_correct == 1 else answer[2:]

    # JSON 형식으로 결과 반환
    result = {
        "answer": is_correct,
        "explanation": explanation.strip()
    }

    return json.dumps(result, ensure_ascii=False)

# 사용 예시

problem = openai_evaluation[1][0]
truth = openai_evaluation[1][1]
generated = openai_evaluation[1][2]

result = one_compare_bash_semantics(problem, generated, truth)
print(result)

{"answer": 0, "explanation": "차이점 설명:\n두 코드의 `find` 명령어 사용에서 조건의 결합 방식에 차이가 있습니다.\n\n1. **생성된 코드**:\n   ```bash\n   find /var/www -type d \\( ! -wholename \"/var/www/web-release-data/*\"  ! -wholename \"/var/www/web-development-data/*\" \\)\n   ```\n   이 코드는 중괄호 내에서 두 개의 조건을 나열하고 있습니다. 각 조건은 다른 디렉토리를 제외시키기 위해 `! -wholename` 절을 사용합니다. 그러나 이 경우 두 조건이 `AND`로 결합되지 않고 독립적으로 평가됩니다. 여기서 두 조건이 모두 true이어야 최종 조건이 true가 되지 않아, 원하는 결과가 아닐 수 있습니다.\n\n2. **정답 코드**:\n   ```bash\n   find /var/www -type d \\(! -wholename \"/var/www/web-release-data/*\" -a ! -wholename \"/var/www/web-development-data/*\" \\)\n   ```\n   이 코드는 `-a` (AND) 연산자를 사용하여 두 조건을 명시적으로 결합하고 있습니다. 즉, 이 코드는 두 디렉토리에 대해 모두 제외되어야 최종 조건이 true가 되도록 설정되어 있습니다.\n\n결과적으로, 생성된 코드에서는 한 조건이 참일 경우 다른 조건이 어떤 값을 가질지에 상관없이 결과가 영향을 받을 수 있지만, 정답 코드에서는 두 조건이 모두 만족해야 최종적으로 포함되지 않게 됩니다. 따라서, 두 코드는 의미적으로 동일한 결과를 반환하지 않습니다."}


In [24]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [26]:
import os
import json
from pathlib import Path
from openai import OpenAI
from pqdm.processes import pqdm

client = OpenAI()
import re

def extract_json_from_markdown(text):
    if text.strip().startswith("```json"):
        text = re.sub(r"^```json\s*", "", text.strip())
        text = re.sub(r"\s*```$", "", text.strip())
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        text = text.replace('\\', '\\\\')  # \ → \\
        return json.loads(text)


def compare_bash_semantics(idx):
    save_path = f"./results-v3/result_{idx}.json"
    if Path(save_path).exists():
        print("이미 처리된 파일입니다.")
        with open(save_path, "r", encoding="utf-8") as f:
            return json.load(f)  
    else:
        item = openai_evaluation[idx]
        problem_description, generated_query, ground_truth_query = item

        prompt = f"""다음 문제와 두 Bash 코드가 의미적으로 동일한 결과를 반환하는지 판단해주세요:

        문제 설명: {problem_description}

        생성된 코드:
        {generated_query}

        정답 코드:
        {ground_truth_query}

        두 코드가 문제에 대해 의미적으로 동일한 결과를 반환한다면 answer에 "1"라고 대답하고,
        그렇지 않다면 "0"라고 대답한 후 차이점을 explanation에 적으세요.
        코드의 구조나 사용된 함수가 다르더라도 결과가 같다면 의미적으로 동일하다고 판단해주세요."""

        response = client.chat.completions.create(
            model="gpt-4o", 
            messages=[
                {"role": "system", "content": """You are a helpful assistant that compares the semantic meaning of Bash codes in the context of a given problem.
                반드시 아래 형식으로만 응답하세요:
                {
                    "answer": "...",
                    "explanation": "..."
                }
                """},
                {"role": "user", "content": prompt}
            ]
        )

        try:
            raw_content = response.choices[0].message.content
            parsed = extract_json_from_markdown(raw_content)
        
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(parsed, f, ensure_ascii=False, indent=4)
        
            return parsed
        
        except Exception as e:
            print(f"[{idx}] OpenAI 응답 파싱 실패: {e}")
            print(f"[{idx}] 원본 응답 내용: {raw_content!r}")
        
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump({
                    "answer": "0",
                    "explanation": f"파싱 실패: {str(e)}\n원본 응답: {raw_content}"
                }, f, ensure_ascii=False, indent=4)
        
            return {
                "answer": "0",
                "explanation": f"파싱 실패: {str(e)}\n원본 응답: {raw_content}"
            }



# generated_result에 인덱스 추가
indexed_openai_evaluation = list(range(len((openai_evaluation))))

# pqdm을 사용하여 병렬 처리
results = pqdm(indexed_openai_evaluation, compare_bash_semantics, n_jobs=40)

QUEUEING TASKS | :   0%|          | 0/405 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/405 [00:00<?, ?it/s]

[112] OpenAI 응답 파싱 실패: Expecting ',' delimiter: line 3 column 135 (char 155)
[112] 원본 응답 내용: '{\n    "answer": "0",\n    "explanation": "The generated code and the correct code are not semantically equivalent. The generated code uses \'find . -name \\"*.jpg\\" -type f -exec du -h \'{}\' \\+ | tail -n 1\', which finds all jpg files and gets individual sizes, outputting the human-readable size of the last file. The correct code \'find . -type f -iname \'*.jpg\' -print0 | du -c --files0-from=-\' computes the total size of all jpg files together. The generated code results in a single file size, whereas the correct code results in the total size, which is the answer to the problem."\n}'
[220] OpenAI 응답 파싱 실패: Expecting ',' delimiter: line 3 column 52 (char 72)
[220] 원본 응답 내용: '```json\n{\n    "answer": "0",\n    "explanation": "The first code \'find. -name \\"file.ext\\" -execdir pwd \\\';\\\'\' has a syntax error because there is no space between \'find\' and \'.\', making it an invalid c

COLLECTING RESULTS | :   0%|          | 0/405 [00:00<?, ?it/s]

In [27]:
results[:10]

[{'answer': '0',
  'explanation': '"생성된 코드"는 \'-iname\' 옵션을 사용하여 대소문자를 구분하지 않고 "Tecmint"라는 이름의 디렉토리를 찾습니다. 이는 \'tecmint\', \'TECMINT\' 등과 같은 이름도 모두 찾게 됩니다. 반면, "정답 코드"는 \'-name\' 옵션을 사용하여 대소문자를 구분하여 정확히 \'Tecmint\'라는 이름의 디렉토리만을 찾습니다. 따라서 두 코드는 문제에 대해 의미적으로 동일한 결과를 반환하지 않습니다.'},
 {'answer': '1',
  'explanation': 'Both codes use the `find` command to search for directories under /var/www while excluding certain subdirectories. The only difference between the two is the presence of the `-a` operator in the generated code. In the context of `find`, conditions within parentheses are implicitly ANDed together, making the `-a` operator redundant. Thus, both codes are functionally equivalent in achieving the task of excluding directories with names starting with /var/www/web-release-data and /var/www/web-development-data.'},
 {'answer': '1',
  'explanation': "Both codes find all regular files under the specified directory and change their permissions to 640. The difference between ';' and '+' 

In [28]:
len(results)

405

In [29]:
json_result = []
for result in results:
    json_result.append(result)

df = pd.DataFrame(json_result)

df["answer"] = df["answer"].map(lambda x : int(x))

after_accuracy = df["answer"].sum() / len(df["answer"])
print(f"Accuracy: {after_accuracy*100:.2f}%")

Accuracy: 47.90%
