In [None]:
!pip install transformers==4.40.1 bitsandbytes==0.43.1 accelerate==0.29.3 datasets==2.19.0 tiktoken==0.6.0 huggingface_hub==0.22.2 autotrain-advanced==0.7.77 -qqq
!pip install --upgrade huggingface-hub -qqq

## 예제 6.2. SQL 프롬프트

In [17]:
def make_prompt(ddl, question, query=''):
    """Assemble the text-to-SQL prompt for a single example.

    Args:
        ddl: Table definition text placed under the ``### DDL:`` heading.
        question: Question text placed under the ``### Question:`` heading.
        query: Text placed directly after the ``### SQL:`` heading. With the
            default empty string the prompt ends right after that heading's
            newline.

    Returns:
        The assembled prompt string.
    """
    instruction = "당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요."
    # Empty entries become the blank lines that separate the sections.
    sections = [
        instruction,
        "",
        "### DDL:",
        f"{ddl}",
        "",
        "### Question:",
        f"{question}",
        "",
        "### SQL:",
        f"{query}",
    ]
    return "\n".join(sections)

## 예제 6.4. 평가를 위한 요청 jsonl 작성 함수

In [18]:
import json
import pandas as pd
from pathlib import Path

def make_requests_for_gpt_evaluation(df, filename, dir='requests'):
    """Write one chat-completion request per DataFrame row to a JSONL file.

    Every request targets the "gpt-4o-mini" model with a JSON-object response
    format and carries a single system message asking whether ``gen_sql``
    does the same job as ``gt_sql`` for the given DDL and question.

    Args:
        df: DataFrame whose rows provide the 'context', 'question', 'answer'
            and 'gen_sql' values.
        filename: Name of the JSONL file, created inside ``dir``.
        dir: Output directory; created (with parents) when missing.

    Returns:
        None. The requests are written to ``Path(dir, filename)``, one JSON
        object per line.
    """
    # exist_ok=True removes the check-then-create race and keeps re-runs idempotent.
    Path(dir).mkdir(parents=True, exist_ok=True)

    instruction = 'Based on below DDL and Question, evaluate gen_sql can resolve Question. If gen_sql and gt_sql do equal job, return "yes" else return "no". Output JSON Format: {"resolve_yn": ""}'
    prompts = []
    for _, row in df.iterrows():
        # The 8-space indent after each newline is part of the prompt text; keep it as is.
        prompts.append(
            instruction
            + f"\n        DDL: {row['context']}"
            + f"\n        Question: {row['question']}"
            + f"\n        gt_sql: {row['answer']}"
            + f"\n        gen_sql: {row['gen_sql']}"
        )

    jobs = [
        {
            "model": "gpt-4o-mini",
            "response_format": {"type": "json_object"},
            "messages": [{"role": "system", "content": prompt}],
        }
        for prompt in prompts
    ]
    # json.dumps escapes non-ASCII by default, so the explicit encoding only pins the
    # file format instead of relying on the platform default.
    with open(Path(dir, filename), "w", encoding="utf-8") as f:
        for job in jobs:
            f.write(json.dumps(job) + "\n")

## 예제 6.5. 비동기 요청 명령

In [15]:
from dotenv import load_dotenv
import os
load_dotenv()

# load_dotenv() exports the entries of a .env file into os.environ, so the key does not need
# to be copied again; re-assigning os.environ from os.getenv() only raised an opaque TypeError
# when the key was missing. Fail early with an actionable message instead.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError("OPENAI_API_KEY is not set; define it in the environment or in a .env file.")

## 예제 6.6. 결과 jsonl 파일을 csv로 변환하는 함수

In [16]:
def change_jsonl_to_csv(input_file, output_file, prompt_column="prompt", response_column="response"):
    """Convert a results JSONL file into a two-column CSV and DataFrame.

    Each non-blank line is parsed as a JSON array. The prompt is read from
    ``record[0]['messages'][0]['content']`` and the response from
    ``record[1]['choices'][0]['message']['content']``.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the CSV file to write (without the index).
        prompt_column: Column name for the extracted prompts.
        response_column: Column name for the extracted responses.

    Returns:
        The DataFrame that was written to ``output_file``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError, TypeError: If a record does not have the layout
            described above.
    """
    prompts = []
    responses = []
    with open(input_file, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing on them.
            if not line.strip():
                continue
            # Parse once per line; the record holds both the request and the response.
            record = json.loads(line)
            prompts.append(record[0]['messages'][0]['content'])
            responses.append(record[1]['choices'][0]['message']['content'])

    df = pd.DataFrame({prompt_column: prompts, response_column: responses})
    df.to_csv(output_file, index=False)
    return df

## 예제 6.7. 기초 모델로 생성하기

In [5]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

def make_inference_pipeline(model_id):
    """Build a text-generation pipeline for ``model_id`` loaded in 4-bit.

    Args:
        model_id: Model identifier or path passed to ``from_pretrained`` for
            both the tokenizer and the causal language model.

    Returns:
        A ``transformers`` "text-generation" pipeline wrapping the loaded
        model and tokenizer.
    """
    # Imported locally so this definition does not depend on the cell's import line.
    from transformers import BitsAndBytesConfig

    # Passing load_in_4bit directly to from_pretrained is deprecated; the same settings
    # are supplied through a BitsAndBytesConfig object instead.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=quantization_config,
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return pipe

# Checkpoint used for the baseline generation (the base model, before fine-tuning).
model_id = 'beomi/Yi-Ko-6B'
hf_pipe = make_inference_pipeline(model_id)

# Hand-written sample prompt: instruction, DDL, question, then an empty "### SQL:" section
# left for the model to complete.
example = """당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.

### DDL:
CREATE TABLE players (
  player_id INT PRIMARY KEY AUTO_INCREMENT,
  username VARCHAR(255) UNIQUE NOT NULL,
  email VARCHAR(255) UNIQUE NOT NULL,
  password_hash VARCHAR(255) NOT NULL,
  date_joined DATETIME NOT NULL,
  last_login DATETIME
);

### Question:
사용자 이름에 'admin'이 포함되어 있는 계정의 수를 알려주세요.

### SQL:
"""

# Generate a completion for the sample prompt; the cell displays the pipeline's return value.
# NOTE(review): max_length=512 presumably bounds prompt + generated tokens together — confirm
# against the transformers generation docs (max_new_tokens would bound only the completion).
hf_pipe(example, do_sample=False,
    return_full_text=False, max_length=512, truncation=True)
#  SELECT COUNT(*) FROM players WHERE username LIKE '%admin%';

# ### SQL 봇:
# SELECT COUNT(*) FROM players WHERE username LIKE '%admin%';

# ### SQL 봇의 결과:
# SELECT COUNT(*) FROM players WHERE username LIKE '%admin%'; (생략)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[{'generated_text': "SELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE username LIKE '%admin%';\n\n### SQL 봇의 결과:\nSELECT COUNT(*) FROM players WHERE 

## 예제 6.8. 기초 모델 성능 측정

In [6]:
from datasets import load_dataset
from tqdm import tqdm

print("Step 1: Loading dataset...")
df = load_dataset("shangrilar/ko_text2sql", "origin")['test'].to_pandas()
print("Dataset loaded successfully.")

print("\nStep 2: Generating prompts...")
# Build the whole 'prompt' column in one pass instead of growing it cell by cell with
# df.loc[idx, ...] inside an iterrows() loop.
df['prompt'] = [
    make_prompt(context, question)
    for context, question in tqdm(
        zip(df['context'], df['question']), total=len(df), desc="Generating prompts"
    )
]
print("Prompts generated successfully.")

print("\nStep 3: Generating SQL queries...")
# One pipeline call over all prompts; each result is a list whose first entry holds the text.
gen_sqls = hf_pipe(df['prompt'].tolist(), do_sample=False,
                   return_full_text=False, max_length=512, truncation=True)
gen_sqls = [x[0]['generated_text'] for x in gen_sqls]
df['gen_sql'] = gen_sqls
print("SQL queries generated successfully.")

print("\nStep 4: Creating evaluation file...")
# Written under the default 'requests' directory of make_requests_for_gpt_evaluation.
eval_filepath = "text2sql_evaluation.jsonl"
make_requests_for_gpt_evaluation(df, eval_filepath)
print(f"Evaluation file created: {eval_filepath}")

print("\nProcessing completed successfully!")

Step 1: Loading dataset...


Downloading readme:   0%|          | 0.00/281 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38246 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/112 [00:00<?, ? examples/s]

Dataset loaded successfully.

Step 2: Generating prompts...


Generating prompts: 100%|██████████| 112/112 [00:00<00:00, 5737.69it/s]

Prompts generated successfully.

Step 3: Generating SQL queries...





SQL queries generated successfully.

Step 4: Creating evaluation file...
Evaluation file created: text2sql_evaluation.jsonl

Processing completed successfully!


In [7]:
# Run the GPT-4o-mini evaluation.
# Invokes api_request_parallel_processor.py (expected in the working directory) on
# requests/{eval_filepath}, pointing it at the chat-completions endpoint; results are
# presumably appended to results/{eval_filepath} — confirm against the script.
# NOTE(review): the results/ directory may need to exist beforehand — confirm.
!python api_request_parallel_processor.py \
--requests_filepath requests/{eval_filepath}  \
--save_filepath results/{eval_filepath} \
--request_url https://api.openai.com/v1/chat/completions \
--max_requests_per_minute 2500 \
--max_tokens_per_minute 100000 \
--token_encoding_name cl100k_base \
--max_attempts 5 \
--logging_level 20

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Starting request #3
INFO:root:Starting request #4
INFO:root:Starting request #5
INFO:root:Starting request #6
INFO:root:Starting request #7
INFO:root:Starting request #8
INFO:root:Starting request #9
INFO:root:Starting request #10
INFO:root:Starting request #11
INFO:root:Starting request #12
INFO:root:Starting request #13
INFO:root:Starting request #14
INFO:root:Starting request #15
INFO:root:Starting request #16
INFO:root:Starting request #17
INFO:root:Starting request #18
INFO:root:Starting request #19
INFO:root:Starting request #20
INFO:root:Starting request #21
INFO:root:Starting request #22
INFO:root:Starting request #23
INFO:root:Starting request #24
INFO:root:Starting request #25
INFO:root:Starting request #26
INFO:root:Starting request #27
INFO:root:Starting request #28
INFO:root:Starting request #29
INFO:root:Starting request #30
INFO:root:Starting request #31
INFO:root:Starting

In [8]:
# Convert the raw evaluation results to CSV, then pull the verdict out of each JSON reply.
base_eval = change_jsonl_to_csv(
    input_file=f"results/{eval_filepath}",
    output_file="results/yi_ko_6b_eval.csv",
    prompt_column="prompt",
    response_column="resolve_yn",
)
base_eval['resolve_yn'] = base_eval['resolve_yn'].map(lambda raw: json.loads(raw)['resolve_yn'])
# Count the rows whose verdict is "yes".
num_correct_answers = len(base_eval[base_eval['resolve_yn'] == 'yes'])
num_correct_answers

21

## 예제 6.9. 학습 데이터 불러오기

In [6]:
from datasets import load_dataset

df_sql = load_dataset("shangrilar/ko_text2sql", "origin")["train"]
df_sql = df_sql.to_pandas()
df_sql = df_sql.dropna().sample(frac=1, random_state=42)
df_sql = df_sql.query("db_id != 1")

for idx, row in df_sql.iterrows():
    df_sql.loc[idx, 'text'] = make_prompt(row['context'], row['question'], row['answer'])

!mkdir data
df_sql.to_csv('data/train.csv', index=False)

mkdir: cannot create directory ‘data’: File exists


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 예제 6.10. 미세 조정 명령어

In [13]:
import torch
import gc

# Run garbage collection first so that unreachable objects (and any tensors they hold)
# are released before the CUDA cache is emptied.
gc.collect()
# Release cached GPU memory that is no longer in use.
# NOTE(review): objects still referenced from the kernel (e.g. hf_pipe) presumably keep their
# GPU memory; delete them first if the goal is to free it before training.
torch.cuda.empty_cache()

0

In [21]:
# Checkpoint to fine-tune and the AutoTrain project name, interpolated into the command below.
base_model = 'beomi/gemma-ko-2b'
finetuned_model = 'gemma-ko-2b-text2sql'

# Launch AutoTrain LLM training on the data in data/ using the `text` column, with
# LoRA (r=16, alpha=32, dropout=0.05), int4 quantization, fp16 mixed precision and the SFT trainer.
# NOTE(review): with --batch-size 8 and --gradient-accumulation 8 the effective batch is
# presumably 64 sequences per optimizer step — confirm against the AutoTrain docs.
!autotrain llm \
--train \
--model {base_model} \
--project-name {finetuned_model} \
--data-path data/ \
--text-column text \
--lr 2e-4 \
--batch-size 8 \
--epochs 1 \
--block-size 1024 \
--warmup-ratio 0.1 \
--lora-r 16 \
--lora-alpha 32 \
--lora-dropout 0.05 \
--weight-decay 0.01 \
--gradient-accumulation 8 \
--mixed-precision fp16 \
--quantization int4 \
--trainer sft

[1mINFO    [0m | [32m2024-10-24 16:06:19[0m | [36mautotrain.cli.run_llm[0m:[36mrun[0m:[36m136[0m - [1mRunning LLM[0m
Saving the dataset (1/1 shards): 100%|█| 33876/33876 [00:00<00:00, 135973.42 exa
Saving the dataset (1/1 shards): 100%|█| 33876/33876 [00:00<00:00, 147791.13 exa
[1mINFO    [0m | [32m2024-10-24 16:06:20[0m | [36mautotrain.backends.local[0m:[36mcreate[0m:[36m20[0m - [1mStarting local training...[0m
[1mINFO    [0m | [32m2024-10-24 16:06:20[0m | [36mautotrain.commands[0m:[36mlaunch_command[0m:[36m523[0m - [1m['accelerate', 'launch', '--num_machines', '1', '--num_processes', '1', '--mixed_precision', 'fp16', '-m', 'autotrain.trainers.clm', '--training_config', 'gemma-ko-2b-text2sql/training_params.json'][0m
[1mINFO    [0m | [32m2024-10-24 16:06:20[0m | [36mautotrain.commands[0m:[36mlaunch_command[0m:[36m524[0m - [1m{'model': 'beomi/gemma-ko-2b', 'project_name': 'gemma-ko-2b-text2sql', 'data_path': 'gemma-ko-2b-text2sql/autotrain

## 예제 6.11. LoRA 어댑터 결합 및 허깅페이스 허브 업로드

In [22]:
from huggingface_hub import login
from huggingface_hub import create_repo

# Authenticate with the Hugging Face Hub using the token from the environment.
# NOTE(review): os.getenv returns None when HuggingFace_API_KEY is unset — confirm how
# login() should behave in that case.
login(token=os.getenv("HuggingFace_API_KEY"))
repo_id = "wooseok0303/gemma-ko-2b-text2sql"
# exist_ok=True keeps the cell re-runnable: without it create_repo fails once the repo exists.
create_repo(repo_id, private=True, exist_ok=True)  # private=False for public repo
# # When a Trainer was used
# gen_sqls.push_to_hub(repo_id)
# # When the model was trained manually
# gen_sqls.push_to_hub(repo_id)
# tokenizer.push_to_hub(repo_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


RepoUrl('https://huggingface.co/wooseok0303/gemma-ko-2b-text2sql', endpoint='https://huggingface.co', repo_type='model', repo_id='wooseok0303/gemma-ko-2b-text2sql')

In [23]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel

base_model = 'beomi/gemma-ko-2b'
model_name = base_model
finetuned_model = "./gemma-ko-2b-text2sql"  # or the actual local path where the adapter was saved
device_map = {"": 0}
# Single definition of the upload target so the model and the tokenizer cannot diverge.
hub_repo_id = "wooseok0303/gemma-ko-2b-text2sql"

# Load the base model in fp16. It gets its own name so that `base_model` keeps holding the
# checkpoint id string instead of being rebound to a model object.
base_model_weights = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Attach the LoRA adapter from the local path, then merge it into the base weights.
model = PeftModel.from_pretrained(base_model_weights, finetuned_model)
model = model.merge_and_unload()

# Tokenizer setup: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Upload the merged model and the tokenizer to the Hugging Face Hub.
# NOTE(review): use_temp_dir=False presumably stages the files in a local directory, which
# needs free disk space — confirm against the push_to_hub docs.
model.push_to_hub(hub_repo_id, use_temp_dir=False)
tokenizer.push_to_hub(hub_repo_id, use_temp_dir=False)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:  81%|########  | 3.99G/4.95G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [10]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import LoraConfig, PeftModel
# base_model = 'beomi/Yi-Ko-6B'
# model_name = base_model
# device_map = {"": 0}

# # LoRA와 기초 모델 파라미터 합치기
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map=device_map,
# )
# model = PeftModel.from_pretrained(base_model, finetuned_model)
# model = model.merge_and_unload()

# # 토크나이저 설정
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# # 허깅페이스 허브에 모델 및 토크나이저 저장
# model.push_to_hub(finetuned_model, use_temp_dir=False)
# tokenizer.push_to_hub(finetuned_model, use_temp_dir=False)

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

ValueError: Can't find 'adapter_config.json' at 'yi-ko-6b-text2sql'

## 예제 6.12. 미세 조정한 모델로 예시 데이터에 대한 SQL 생성

In [None]:
# Rebuild the generation pipeline around the fine-tuned text2sql checkpoint.
# NOTE(review): this checkpoint differs from the gemma-ko-2b model trained in this
# notebook — confirm it is the one intended for evaluation.
model_id = "shangrilar/yi-ko-6b-text2sql"
hf_pipe = make_inference_pipeline(model_id)

# Generate a completion for the sample prompt; the cell displays the pipeline's return value.
hf_pipe(
    example,
    do_sample=False,
    return_full_text=False,
    max_length=1024,
    truncation=True,
)
# SELECT COUNT(*) FROM players WHERE username LIKE '%admin%';

## 예제 6.13. 미세 조정한 모델 성능 측정

In [None]:
# Generate SQL for every test prompt with the current hf_pipe.
gen_sqls = hf_pipe(df['prompt'].tolist(), do_sample=False,
                   return_full_text=False, max_length=1024, truncation=True)
# Each result is a list whose first entry holds the generated text.
gen_sqls = [x[0]['generated_text'] for x in gen_sqls]
# Overwrites the 'gen_sql' column of df with this run's generations.
df['gen_sql'] = gen_sqls

# Write the evaluation requests JSONL file.
ft_eval_filepath = "text2sql_evaluation_finetuned.jsonl"
make_requests_for_gpt_evaluation(df, ft_eval_filepath)

# Run the GPT-based evaluation (the model name is set in each request by
# make_requests_for_gpt_evaluation).
!python api_request_parallel_processor.py \
  --requests_filepath requests/{ft_eval_filepath} \
  --save_filepath results/{ft_eval_filepath} \
  --request_url https://api.openai.com/v1/chat/completions \
  --max_requests_per_minute 2500 \
  --max_tokens_per_minute 100000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20

In [None]:
# Write the fine-tuned results to their own CSV: the previous target,
# "results/yi_ko_6b_eval.csv", is the file the base-model evaluation is saved to, so reusing
# it overwrote the baseline results.
ft_eval = change_jsonl_to_csv(f"results/{ft_eval_filepath}", "results/yi_ko_6b_text2sql_eval.csv", "prompt", "resolve_yn")
# Each response is a JSON string; keep only its "resolve_yn" verdict.
ft_eval['resolve_yn'] = ft_eval['resolve_yn'].apply(lambda x: json.loads(x)['resolve_yn'])
# Count the rows whose verdict is "yes".
num_correct_answers = ft_eval.query("resolve_yn == 'yes'").shape[0]
num_correct_answers