# Prepare Model

In [1]:
# model type (must be huggingface format)
# model_id = "meta-llama/Llama-2-7b-chat-hf"
model_id = "meta-llama/Llama-2-7b-chat-hf"
model_name = "llama2-7b-chat-adapter"

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Prepare Dataset

In [3]:
eval_prompt = """
Column Names are limited to the following:
name, description, team, type, age, location, year, city, rank, status, state, category,
weight, code, club, artist, result, position, country, notes, class, company, album, symbol,
address, duration, format, county, day, gender, industry, language, sex, product, jockey,
region, area, service, teamName, order, isbn, fileSize, grades, publisher, plays, origin,
elevation, affiliation, component, owner, genre,  manufacturer, brand, family, credit, depth,
classification, collection, species, command, nationality, currency, range, affiliate,
birthDate, ranking, capacity, birthPlace, person, creator, operator, religion, education,
requirement, director, sales, continent, organisation
Do not use any column names aside from these.

No pre-amble. Answer is in the following format: answer

Given the following relational table: {}
Guess the column name

Output: {}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for input, output in zip(inputs, outputs):
        # print(input, output)
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = eval_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

from datasets import load_dataset
dataset = load_dataset("sadpineapple/data-profile", split = "train")
train_dataset = dataset.map(formatting_prompts_func, batched = True,)



# Test Raw Model

In [4]:
input_text = eval_prompt.format("1994,2002,2004,2006,2009,2009,2010,2010,2011,2013,2014,2014,2015,2016", "")
model_input = tokenizer(input_text, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100, pad_token_id = tokenizer.eos_token_id)[0], skip_special_tokens=True))


Column Names are limited to the following:
name, description, team, type, age, location, year, city, rank, status, state, category,
weight, code, club, artist, result, position, country, notes, class, company, album, symbol,
address, duration, format, county, day, gender, industry, language, sex, product, jockey,
region, area, service, teamName, order, isbn, fileSize, grades, publisher, plays, origin,
elevation, affiliation, component, owner, genre,  manufacturer, brand, family, credit, depth,
classification, collection, species, command, nationality, currency, range, affiliate,
birthDate, ranking, capacity, birthPlace, person, creator, operator, religion, education,
requirement, director, sales, continent, organisation
Do not use any column names aside from these.

No pre-amble. Answer is in the following format: answer

Given the following relational table: 1994,2002,2004,2006,2009,2009,2010,2010,2011,2013,2014,2014,2015,2016
Guess the column name

Output: 
name

Explanation: The co

# Fine Tune Model with PEFT Adapter

In [5]:
model.train()

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8 model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = 'text',
    max_seq_length = 4096,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 40,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

# Start training
trainer.train()

Map (num_proc=2):   0%|          | 0/74141 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,3.2968
2,3.1149
3,2.8507
4,3.1539
5,3.1585
6,2.6676
7,2.6021
8,2.7468
9,2.671
10,2.5185


TrainOutput(global_step=40, training_loss=1.4316592156887054, metrics={'train_runtime': 213.9155, 'train_samples_per_second': 0.748, 'train_steps_per_second': 0.187, 'total_flos': 2093226128547840.0, 'train_loss': 1.4316592156887054, 'epoch': 0.0021580502016428156})

# Test Fine-tuned Model

In [7]:
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))


Column Names are limited to the following:
name, description, team, type, age, location, year, city, rank, status, state, category,
weight, code, club, artist, result, position, country, notes, class, company, album, symbol,
address, duration, format, county, day, gender, industry, language, sex, product, jockey,
region, area, service, teamName, order, isbn, fileSize, grades, publisher, plays, origin,
elevation, affiliation, component, owner, genre,  manufacturer, brand, family, credit, depth,
classification, collection, species, command, nationality, currency, range, affiliate,
birthDate, ranking, capacity, birthPlace, person, creator, operator, religion, education,
requirement, director, sales, continent, organisation
Do not use any column names aside from these.

No pre-amble. Answer is in the following format: answer

Given the following relational table: 1994,2002,2004,2006,2009,2009,2010,2010,2011,2013,2014,2014,2015,2016
Guess the column name

Output: 
age

Explanation: The age

# Save Adapter

In [9]:
# save locally
# model.save_pretrained(output_dir)

# push to huggingface (use specific token and repo name)
model.push_to_hub(f"sadpineapple/{model_name}", token="")

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sadpineapple/llama2-7b-chat-adapter/commit/43890244e0fafb0c5b8bbf4ffb4ae5dc898a51bc', commit_message='Upload model', commit_description='', oid='43890244e0fafb0c5b8bbf4ffb4ae5dc898a51bc', pr_url=None, pr_revision=None, pr_num=None)