In [1]:
!pip install -q transformers datasets accelerate bitsandbytes wandb peft trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.3/318.3 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:

import huggingface_hub
huggingface_hub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
import time
import wandb
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from peft import LoraConfig, get_peft_model, LoraConfig, TaskType

# wandb 초기화
wandb.init(project="Hanghae99_week8_advanced", name="gemma-2-2b-it_lightweight_experiment")

# 하이퍼파라미터/설정값
MODEL_NAME = "google/gemma-2-2b-it"
lora_r = 8

data_path = "corpus.json"

with open(data_path, "r", encoding="utf-8") as f:
    corpus = json.load(f)

dataset = Dataset.from_dict({
    "question": [item["question"] for item in corpus],
    "answer": [item["answer"] for item in corpus]
})

train_split = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = train_split['train']
eval_dataset = train_split['test']

# ===== 모델과 토크나이저 로드 (양자화 적용) =====
print(f"Loading model {MODEL_NAME} with 8bit quantization...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 모델 파라미터 수 로깅
n_params = sum(p.numel() for p in model.parameters())
wandb.log({f"{MODEL_NAME}_num_params": n_params})
print(f"{MODEL_NAME} parameters: {n_params}")

# ===== 평가용 함수 정의 =====
def evaluate_inference_speed(model, tokenizer, prompt, num_runs=3):
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    times = []
    for i in range(num_runs):
        start = time.time()
        _ = generator(prompt, max_length=100)
        elapsed = time.time() - start
        times.append(elapsed)
        print(f"Run {i+1}: {elapsed:.3f} sec")
    avg_time = sum(times) / len(times)
    return avg_time

# ===== 사전 평가 (Fine-tuning 전 평가) =====
# 정량 평가
instruction = "How does the VIX (Volatility Index) influence trading decisions in the stock market"

print("\nPerforming quantitative evaluation (pre fine-tuning)...")

# 정량적 평가: 추론 속도 측정
avg_time = evaluate_inference_speed(model, tokenizer, instruction)
wandb.log({f"{MODEL_NAME}_avg_inference_time_pre": avg_time})
print(f"Avg Inference Time for {MODEL_NAME} (pre): {avg_time:.3f} sec")

# 정성적 평가: 동일 instruction에 대한 응답 생성
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print("\nPerforming qualitative evaluation (pre fine-tuning)...")
output = generator(instruction, max_length=250)
response_text = output[0]['generated_text']
wandb.log({f"{MODEL_NAME}_response_pre": response_text})
print(f"Response from {MODEL_NAME} (pre):\n{response_text}")
print("-" * 60)

# ===== Fine-tuning을 위한 준비 =====
def formatting_prompts_func(data):
    prompt_list = []
    for i in range(len(data['question'])):
        prompt_list.append(f"""<bos><start_of_turn>user
{data['question'][i]}<end_of_turn>
<start_of_turn>model
{data['answer'][i]}<end_of_turn><eos>""")
    return prompt_list


response_template = "<start_of_turn>model"

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# ===== LoRA 적용 =====

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_r,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
)


model = get_peft_model(model, lora_config)

# ===== Fine-tuning 설정 및 실행 =====
training_config = SFTConfig(
    output_dir=f"./gemma-2-2b-it_output",
    save_total_limit=1,
    logging_steps=100,
    eval_steps=100,
    max_steps=1000,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=100,
    save_strategy="steps",
    logging_strategy="steps",
    evaluation_strategy="steps",
)

trainer = SFTTrainer(
    model=model,
    args=training_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

print(f"\nStarting fine-tuning for {MODEL_NAME} ...")
train_start_time = time.time()
trainer.train()
train_end_time = time.time()
total_training_time = (train_end_time - train_start_time) / 60
wandb.log({"train/total_training_time_min": total_training_time})
print(f"Fine-tuning for {MODEL_NAME} completed.")

# ===== 9. Fine-tuning 후 재평가 =====
print("\nPerforming quantitative evaluation (post fine-tuning)...")
avg_time_post = evaluate_inference_speed(model, tokenizer, instruction)
wandb.log({f"{MODEL_NAME}_avg_inference_time_post": avg_time_post})
print(f"Avg Inference Time for {MODEL_NAME} (post): {avg_time_post:.3f} sec")

print("\nPerforming qualitative evaluation (post fine-tuning)...")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output_post = generator(instruction, max_length=250)
response_text_post = output_post[0]['generated_text']
wandb.log({f"{MODEL_NAME}_response_post": response_text_post})
print(f"Response from {MODEL_NAME} (post):\n{response_text_post}")
print("-" * 60)

# ===== 10. 마무리 =====
wandb.finish()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcode-sugartoast[0m ([33mcode-sugartoast-sugartoast[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Loading model google/gemma-2-2b-it with 8bit quantization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


google/gemma-2-2b-it parameters: 1602203904

Performing quantitative evaluation (pre fine-tuning)...
Run 1: 5.494 sec
Run 2: 4.382 sec


Device set to use cuda:0


Run 3: 4.429 sec
Avg Inference Time for google/gemma-2-2b-it (pre): 4.768 sec

Performing qualitative evaluation (pre fine-tuning)...
Response from google/gemma-2-2b-it (pre):
How does the VIX (Volatility Index) influence trading decisions in the stock market?

The VIX, or the CBOE Volatility Index, is a measure of the market's expectation of future volatility. It's often referred to as the "fear index" because it tends to rise when investors are worried about the market's direction. 

Here's how the VIX influences trading decisions:

**1. Reflecting Market Sentiment:**

* **High VIX:** A high VIX indicates high market volatility and fear. This often leads to a "flight to safety" as investors seek out less risky assets like bonds or gold. This can cause stock prices to decline as investors become more cautious.
* **Low VIX:** A low VIX suggests low market volatility and calm. This can lead to increased risk-taking as investors feel more confident about the market's direction.

**2. Pre



Applying formatting function to train dataset:   0%|          | 0/5732 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/5732 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5732 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5732 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/1433 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1433 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1433 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1433 [00:00<?, ? examples/s]


Starting fine-tuning for google/gemma-2-2b-it ...


It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss,Validation Loss
100,1.3442,1.074635
200,1.0458,1.027626
300,1.0029,0.999971
400,0.9908,0.9822
500,0.9806,0.965628
600,0.9615,0.956706
700,0.9499,0.945482
800,0.9261,0.941877
900,0.9269,0.936278
1000,0.9385,0.934797


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausa

Fine-tuning for google/gemma-2-2b-it completed.

Performing quantitative evaluation (post fine-tuning)...
Run 1: 2.743 sec
Run 2: 2.614 sec


Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'Jam

Run 3: 2.728 sec
Avg Inference Time for google/gemma-2-2b-it (post): 2.695 sec

Performing qualitative evaluation (post fine-tuning)...
Response from google/gemma-2-2b-it (post):
How does the VIX (Volatility Index) influence trading decisions in the stock market?
The VIX measures market expectations of volatility, and traders use it to gauge market sentiment and anticipate potential price movements.
------------------------------------------------------------


0,1
eval/loss,█▆▄▃▃▂▂▁▁▁
eval/mean_token_accuracy,▁▃▅▅▆▇▇███
eval/runtime,▂▃▁▅█▇▇▇▇█
eval/samples_per_second,▇▆█▄▁▂▂▂▂▁
eval/steps_per_second,▇▆█▄▁▂▂▂▂▁
google/gemma-2-2b-it_avg_inference_time_post,▁
google/gemma-2-2b-it_avg_inference_time_pre,▁
google/gemma-2-2b-it_num_params,▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██████

0,1
eval/loss,0.9348
eval/mean_token_accuracy,0.72032
eval/runtime,48.8275
eval/samples_per_second,29.348
eval/steps_per_second,7.352
google/gemma-2-2b-it_avg_inference_time_post,2.69495
google/gemma-2-2b-it_avg_inference_time_pre,4.76813
google/gemma-2-2b-it_num_params,1602203904
google/gemma-2-2b-it_response_post,How does the VIX (Vo...
google/gemma-2-2b-it_response_pre,How does the VIX (Vo...
