# 目的
gemma epoch5

ref: 
- https://github.com/unslothai/unsloth
- https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing#scrollTo=6bZsfBuZDeCL
- https://huggingface.co/blog/mlabonne/sft-llama3
- https://huggingface.co/docs/trl/sft_trainer#accelerate-fine-tuning-2x-using-unsloth

In [1]:
# pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
# -> だめ

# pip uninstall torch
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
# -> OK

In [2]:
# path setting
EXP_NAME = "e015-causal"
MODEL_NAME = "unsloth/gemma-2-9b-it-bnb-4bit"
# MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"

DATA_PATH = "data"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1].replace('.', '-')}"
OUTPUT_PATH = f"output/causal/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"{OUTPUT_PATH}/trained_model"
ENV_PATH = "env_file"

# experiment parameter
DEBUG = False
TRAINING = True
UPLOAD_DATA_TO_S3 = True
UPLOAD_DATA_TO_KAGGLE = True
REMOVE_LOCAL_FILE = False
WANDB = True

# USE_FOLD = 0
# USE_DATA_RATE = 1.0
# VALID_DATA_SIZE = 3000

# model parameter
TRAINING_MAX_LENGTH = 512
SEED = 42
EPOCH = 5
LR = 2e-04
TRAIN_BS = 1  # 2  # 4  # 16
GRAD_ACC_STEP = 128 // TRAIN_BS  # 仮想的なバッチサイズはTRAIN_BS * GRAD_ACC_STEPとなる
EVAL_BS = 1  # 2  # 4  # 16
# NUM_LABELS = 3

FREEZE_LAYERS = 16

# rola parameter
LORA_R = 16
LORA_ALPHA = LORA_R * 2
LORA_BIAS = "none"

RESUME_FROM_CHECKPOINT = False  # 途中から再開する場合はTrueにする

In [3]:
!nvidia-smi

Wed Oct 16 23:15:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090 Ti     On  |   00000000:05:00.0 Off |                  Off |
| 31%   51C    P8             24W /  350W |       4MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
!python --version

Python 3.10.14


In [5]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/reranker":
        print("VastAi! Reranker")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/retriever":
        print("VastAi! Retriever")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/causal":
        print("VastAi! Causal")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}":
        print("VastAi!")
        return base_path
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
OUTPUT_PATH = resolve_path(OUTPUT_PATH)
print(OUTPUT_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)
ENV_PATH = resolve_path(ENV_PATH)
print(ENV_PATH)

/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../data
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../output/causal/e015-causal
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../output/causal/e015-causal/trained_model
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../env_file


In [6]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name or "." in dataset_name:
        raise Exception("datasetの名称に'_'、'.'の使用は禁止です")


validate_dataset_name(DATASET_NAME)

# import

In [7]:
import os
import random

import polars as pl
import numpy as np

import wandb

import torch
from datasets import Dataset
from transformers import TrainingArguments

from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
NUM_PROC = 16  # os.cpu_count()

In [10]:
import torch
import transformers
import datasets
import trl

assert torch.__version__ == "2.4.1+cu121"
assert transformers.__version__ == "4.45.1"
assert datasets.__version__ == "3.0.0"
assert trl.__version__ == "0.11.1"

In [11]:
# Seed the same seed to all
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [12]:
from dotenv import load_dotenv

load_dotenv(f"{ENV_PATH}/.env")

True

# Wandb

In [13]:
if WANDB:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msinchir0[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


'wandb'

# Model Tokenizer Load

In [13]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=TRAINING_MAX_LENGTH,
    dtype=None,
    load_in_4bit=True,
)

# Data Import & Preprocess

In [14]:
train_long = pl.read_csv(f"{DATA_PATH}/train_long.csv").filter(
    (pl.col("CorrectAnswer") != pl.col("AnswerAlphabet"))
    & (pl.col("MisconceptionId").is_not_null())
)

In [15]:
if DEBUG:
    train_long = train_long.head(100)

In [37]:
# TODO: CoT(think step by step)をどうFine-Tuningに反映させるべきかを調べる
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the misconception behind Incorrect Answer.

### Input:
Question: {question_text}
Incorrect Answer: {incorrect_answer_text}
Correct Answer: {correct_answer_text}
Construct Name: {construct_name}
Subject Name: {subject_name}

### Response:
{misconception_name}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    texts = []
    for (
        question_text,
        incorrect_answer_text,
        correct_answer_text,
        construct_name,
        subject_name,
        misconception_name,
    ) in zip(
        examples["QuestionText"],
        examples["AnswerText"],  # incorrect_answer_text
        examples["CorrectAnswerText"],
        examples["ConstructName"],
        examples["SubjectName"],
        examples["MisconceptionName"],
    ):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = (
            alpaca_prompt.format(
                question_text=question_text,
                incorrect_answer_text=incorrect_answer_text,
                correct_answer_text=correct_answer_text,
                construct_name=construct_name,
                subject_name=subject_name,
                misconception_name=misconception_name,
            )
            + EOS_TOKEN
        )
        texts.append(text)
    return {
        "text": texts,
    }


train = Dataset.from_polars(train_long).map(
    formatting_prompts_func,
    batched=True,
)

NameError: name 'tokenizer' is not defined

In [18]:
print(train[0]["text"])

In [19]:
print(train[10]["text"])

# Split Train Valid

In [36]:
train, valid = (
    train.filter(lambda x: x["QuestionId"] % 3 != 0, num_proc=NUM_PROC),
    train.filter(lambda x: x["QuestionId"] % 3 == 0, num_proc=NUM_PROC),
)

NameError: name 'train' is not defined

# Add Extra Data

In [31]:
train_generate = pl.read_csv(f"{DATA_PATH}/e001-generate.csv").rename(
    {"IncorrectAnswerText": "AnswerText"}
)

In [32]:
train_generate.head(2)

QuestionText,ConstructName,SubjectName,CorrectAnswerText,AnswerText,MisconceptionName
str,str,str,str,str,str
"""If you have a triangle with tw…","""Use the angle sum property of …","""Geometry""","""\(40^\circ\)""","""\(50^\circ\)""","""Does not know that angles in a…"
"""What is the product of \( \fra…","""Multiply fractions by applying…","""Fractions""","""\( \frac{4}{15} \)""","""\( \frac{5}{12} \)""","""Uses dividing fractions method…"


In [35]:
train

NameError: name 'train' is not defined

In [None]:
train

In [34]:
tmp = pl.concat(
    [train_long.select(pl.col(train_generate.columns)), train_generate], how="vertical"
)

QuestionText,ConstructName,SubjectName,CorrectAnswerText,AnswerText,MisconceptionName
str,str,str,str,str,str
"""\[ 3 \times 2+4-5 \] Where do …","""Use the order of operations to…","""BIDMAS""","""\( 3 \times(2+4)-5 \)""","""Does not need brackets""","""Confuses the order of operatio…"
"""Simplify the following, if pos…","""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""\( -1 \)""","""\( t \)""","""Incorrectly cancels what they …"
"""Simplify the following, if pos…","""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""\( -1 \)""","""\( 1 \)""","""Incorrectly cancels what they …"
"""Simplify the following, if pos…","""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""\( -1 \)""","""Does not simplify""","""Does not recognise when one pa…"
"""What is \( \mathbf{3 . 5 1 6 3…","""Round numbers to two decimal p…","""Rounding to Decimal Places""","""\( 3.52 \)""","""\( 3.51 \)""","""Rounds down instead of up"""
…,…,…,…,…,…
"""What is the result of \( x^3 \…","""Multiplying powers with the sa…","""Exponents and Powers""","""\( x^7 \)""","""\( x^{12} \)""","""When multiplying numbers with …"
"""Which of the following numbers…","""Identify and understand cube n…","""Properties of Numbers""","""\( 8 \)""","""\( 10 \)""","""Does not know what a cube numb…"
"""Which is greater, 2% of 1000 o…","""Compare percentages of differe…","""Percentage Comparison""","""10% of 150""","""2% of 1000""","""Believes that any percentage o…"
"""Which of the following is a cu…","""Identify the structure of poly…","""Algebraic Expressions""","""D) \( x^3 - 5x + 2 \)""","""A) \( x^3 + x^2 + x \)""","""Believes a cubic expression sh…"


# Model Predict Check Before Training

In [21]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func_predict(examples):
    texts = []
    for (
        question_text,
        incorrect_answer_text,
        correct_answer_text,
        construct_name,
        subject_name,
    ) in zip(
        examples["QuestionText"],
        examples["AnswerText"],  # incorrect_answer_text
        examples["CorrectAnswerText"],
        examples["ConstructName"],
        examples["SubjectName"],
    ):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = (
            alpaca_prompt.format(
                question_text=question_text,
                incorrect_answer_text=incorrect_answer_text,
                correct_answer_text=correct_answer_text,
                construct_name=construct_name,
                subject_name=subject_name,
                misconception_name="",
            )
            + EOS_TOKEN
        )
        texts.append(text)
    return {
        "text": texts,
    }


valid_for_predict = valid.map(
    formatting_prompts_func_predict,
    batched=True,
)

Map:   0%|          | 0/1460 [00:00<?, ? examples/s]

In [22]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


def predict(valid: Dataset, idx: int, model):
    inputs = tokenizer(
        valid["text"][idx],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
    print("\n### Answer Misconception Name:")
    print(valid["MisconceptionName"][idx])

In [23]:
predict(valid_for_predict, 0, model)

In [24]:
predict(valid_for_predict, 1, model)

In [25]:
predict(valid_for_predict, 2, model)

# Model

In [26]:
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        # "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
        # "embed_tokens",
        # "lm_head",
    ],
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=SEED,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
    layers_to_transform=[i for i in range(32) if i >= FREEZE_LAYERS],
)

In [27]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584)
        (layers): ModuleList(
          (0-15): 16 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
              (rotary_emb): GemmaFixedRotaryEmbedding()
            )
            (mlp): Gemma2MLP(
              (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
              (act_fn): Pytorc

In [28]:
training_arguments = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACC_STEP,
    eval_accumulation_steps=GRAD_ACC_STEP,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=1,
    logging_steps=2,
    seed=SEED,
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_restarts",
    report_to=REPORT_TO,
    run_name=EXP_NAME,
    load_best_model_at_end=True,
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
)

In [29]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=valid,
    dataset_text_field="text",
    max_seq_length=TRAINING_MAX_LENGTH,
    dataset_num_proc=NUM_PROC,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_arguments,
)

Map (num_proc=16):   0%|          | 0/2910 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/1460 [00:00<?, ? examples/s]

In [30]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [31]:
MODEL_OUTPUT_PATH

'output/causal/e015-causal/trained_model'

In [32]:
if TRAINING:
    trainer_stats = trainer.train()
else:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_OUTPUT_PATH + "/lora_model",
        max_seq_length=TRAINING_MAX_LENGTH,
        dtype=None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
        load_in_4bit=True,  # Use 4bit quantization to reduce memory usage. Can be False.
    )

Step,Training Loss,Validation Loss
11,2.3683,2.030697
22,1.1672,1.111559
33,0.9754,0.957927
44,0.9155,0.904422
55,0.8787,0.869549
66,0.8314,0.844981
77,0.8441,0.828598
88,0.7938,0.818241
99,0.8215,0.81398
110,0.7854,0.813335


# Predict Valid

In [33]:
predict(valid_for_predict, 0, model)

In [34]:
predict(valid_for_predict, 1, model)

In [35]:
predict(valid_for_predict, 2, model)

# Save

In [36]:
model.save_pretrained(f"{MODEL_OUTPUT_PATH}/lora_model")  # Local saving
tokenizer.save_pretrained(f"{MODEL_OUTPUT_PATH}/lora_model")

('output/causal/e015-causal/trained_model/lora_model/tokenizer_config.json',
 'output/causal/e015-causal/trained_model/lora_model/special_tokens_map.json',
 'output/causal/e015-causal/trained_model/lora_model/tokenizer.model',
 'output/causal/e015-causal/trained_model/lora_model/added_tokens.json',
 'output/causal/e015-causal/trained_model/lora_model/tokenizer.json')

# AWSへのアップロード

In [37]:
# S3へのアップロード
if not DEBUG and UPLOAD_DATA_TO_S3:
    # uninstall
    !sudo rm /usr/bin/aws
    !sudo rm /usr/bin/aws_completer
    !sudo rm -rf /usr/local/aws-cli

    # install
    !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    !unzip -o -qq awscliv2.zip
    !sudo ./aws/install --update

    # upload
    output_name = MODEL_OUTPUT_PATH.split("/")[-1]
    os.system(
        f"aws s3 cp --recursive {MODEL_OUTPUT_PATH} s3://{COMPETITION_NAME}/trained_model/{output_name}"
    )

In [38]:
# ダウンロード（参考）
# !sudo rm /usr/bin/aws
# !sudo rm /usr/bin/aws_completer
# !sudo rm -rf /usr/local/aws-cli

# !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# !unzip -o -qq awscliv2.zip
# !sudo ./aws/install --update

# !aws s3 cp --recursive s3://automated-essay-scoring/trained_model/e005-regression /notebooks/automated_essay_scoring/trained_models/e005-regression

# Kaggle Datasetへのupload

In [39]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system(f"cp {ENV_PATH}/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [40]:
if not DEBUG and UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

# ローカルからのデータの削除

In [41]:
# if not DEBUG and REMOVE_LOCAL_FILE:
#     # ローカルからは削除
#     os.system(f"rm -rf {MODEL_OUTPUT_PATH}")

In [42]:
if WANDB:
    wandb.finish()

In [43]:
print("finish Notebook!")