In [None]:
import json
import pandas as pd
# import numpy as np

from tqdm import tqdm
from random import sample 
from random import randint
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

In [None]:
# Root directory for the dataset files this notebook reads and writes; train()
# also writes its training-log CSV here. Paths are built by plain string
# concatenation (f"{dir_root}..."), so the trailing slash is required.
dir_root = "Data/"

## Train and Test

In [None]:
# Load the raw QA entries. The cells below read the "questions", "answare"
# and "name" keys of each entry.
# JSON is UTF-8 by specification; pin the encoding so that reading does not
# depend on the platform's default locale.
with open(f"{dir_root}qaa.json", "r", encoding="utf-8") as f:
    data = json.load(f)
len(data)

In [None]:
# Total number of questions over all entries, and 20% of that total
# (truncated to an integer) as the hold-out budget.
count_of_question = sum(map(lambda entry: len(entry["questions"]), data))
count_of_test = int(count_of_question * 0.2)
(count_of_question, count_of_test)

In [None]:
# Summary statistics of the number of questions per entry.
pd.DataFrame(list(map(lambda entry: len(entry["questions"]), data))).describe()

### Alpaca Style

In [None]:
# Build the test split: hold out the LAST question of each of the first
# `count_of_test` entries. Entries with a single question are skipped.
#
# `count_of_test` is derived from the number of questions, not the number of
# entries, so it can exceed len(data). Slicing clamps to the available
# entries instead of raising IndexError the way data[i] would.
test_records = [
    {
        "instruction": entry["questions"][-1],
        "output": entry["answare"],
        "input": entry["name"],
    }
    for entry in data[:count_of_test]
    if len(entry["questions"]) > 1
]

# Parallel lists are kept because later cells read len(test_input).
test_instruction = [record["instruction"] for record in test_records]
test_output = [record["output"] for record in test_records]
test_input = [record["input"] for record in test_records]

pd.DataFrame({"instruction" : test_instruction, "output" : test_output, "input" : test_input}).to_json(f"{dir_root}qaa_test.json", orient="records")
len(test_input), len(test_output), len(test_instruction), count_of_test

In [None]:
# Build the training split: one record per (question, answer, name) triple.
#
# The test cell holds out the last question only for entries that are among
# the first `count_of_test` AND have more than one question. Only those
# entries lose their last question here; every other entry contributes all of
# its questions. Dropping the last question unconditionally would leave those
# questions in neither split, so train + test would not add up to
# count_of_question.
train_input = []
train_output = []
train_instruction = []
for idx, entry in enumerate(data):
    questions = entry["questions"]
    held_out = idx < count_of_test and len(questions) > 1
    kept_questions = questions[:-1] if held_out else questions
    for question in kept_questions:
        train_instruction.append(question)
        train_output.append(entry["answare"])
        train_input.append(entry["name"])

pd.DataFrame({"instruction" : train_instruction, "output" : train_output, "input" : train_input}).to_json(f"{dir_root}qaa_train.json", orient="records")
len(train_input), len(train_output), len(train_instruction)

In [None]:
# Sanity check: number of records across both splits next to the total
# number of questions in the raw data.
(len(test_input) + len(train_input), count_of_question)

## Prepare Training

In [None]:
def generate_prompt(data_point):
    """Render one record as an Alpaca-style training prompt.

    Adapted from https://github.com/tloen/alpaca-lora.

    Args:
        data_point: Mapping with "instruction", "input" and "output" keys.

    Returns:
        The prompt text. The "### Input:" section is included only when
        data_point["input"] is non-empty; otherwise the shorter
        instruction-only template is used.
    """
    # The with-input template has always carried 8 spaces of stray indentation
    # on the line after the preamble. It is kept (explicitly, so editors cannot
    # strip it) so prompts stay byte-identical to those used in earlier runs.
    legacy_pad = " " * 8

    # Branch on the *input* field, as upstream alpaca-lora does. Branching on
    # "instruction" rendered an empty "### Input:" section for records without
    # input, and dropped the input of records with an empty instruction.
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{legacy_pad}
### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

In [None]:
def generate_prompt_context(data_point):
    """Render one record as an [INST]-wrapped prompt with a context section.

    Reads the "instruction", "input", "context" and "output" keys of
    data_point and returns the complete prompt text, which ends with a
    newline after the response.
    """
    preamble = (
        "[INST]Below is an instruction that describes a task! "
        "Write a response that appropriately completes the request. "
        "To answer the instruction, use the context if available!"
    )
    # One list item per output line. The trailing spaces after
    # "### Instruction:" and "### Context:" are part of the template.
    prompt_lines = [
        preamble,
        "### Instruction: ",
        f"{data_point['instruction']}",
        "",
        "### Input:",
        f"{data_point['input']}",
        "",
        "### Context: ",
        f"{data_point['context']}",
        "[/INST]",
        "",
        "### Response:",
        f"{data_point['output']}",
        "",  # yields the final newline
    ]
    return "\n".join(prompt_lines)

In [None]:
# Preview the prompt rendered for the first training record.
# `train_base` was not defined anywhere in the notebook, so this cell failed
# on a fresh kernel. It is now loaded from the training file written above.
with open(f"{dir_root}qaa_train.json", "r", encoding="utf-8") as f:
    train_base = json.load(f)
generate_prompt(train_base[0])

In [None]:
def train(model, version, data_files, gp, max_length=256, seed=None):
    """Fine-tune a causal language model on prompt-formatted JSON records.

    Args:
        model: Model id or path passed to ``from_pretrained`` for both the
            tokenizer and the model weights.
        version: Run label. Checkpoints are written under
            ``Models/{version}/`` and the training log to
            ``{dir_root}log_train_history_{version}.csv``.
        data_files: Mapping passed to ``load_dataset("json", ...)``. It must
            provide a "train" and a "test" split.
        gp: Callable that turns one record into its prompt text.
        max_length: Token length every prompt is truncated and padded to.
        seed: Optional seed for the dataset shuffle. ``None`` leaves the
            shuffle unseeded.

    Side effects:
        Writes checkpoints, the final model and its tokenizer to
        ``Models/{version}/endpoint``, and the trainer's log history to a CSV
        under the notebook-level ``dir_root``.
    """
    output_dir = f"Models/{version}/"
    endpoint_dir = f"{output_dir}endpoint"

    tokenizer = AutoTokenizer.from_pretrained(model)
    # Borrow EOS as the padding token only when the tokenizer ships without
    # one, so that a tokenizer's own pad token is not overwritten.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Loaded: TOKENIZER")

    def tokenize(data_point):
        # Render the prompt, then truncate/pad it to a fixed length.
        return tokenizer(
            gp(data_point),
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    dataset = load_dataset("json", data_files=data_files)
    dataset = dataset.shuffle(seed=seed).map(tokenize)
    print("Loaded: DATA")

    # Use a separate name for the loaded network so the `model` argument
    # (an id or path) is not rebound to a different kind of object.
    lm = AutoModelForCausalLM.from_pretrained(model, device_map="auto")
    print("Loaded: MODEL")

    train_arguments = TrainingArguments(
        # 4 samples per device x 16 accumulation steps = 64 samples per
        # optimizer step on each device.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=16,
        warmup_steps=2,
        num_train_epochs=5, # paper uses 3
        learning_rate=2e-4,
        logging_steps=1,
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3)

    trainer = Trainer(
        model=lm,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        args=train_arguments,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    print("Loaded: TRAINER")

    # Turn off the model's generation cache flag before training.
    lm.config.use_cache = False
    trainer.train(resume_from_checkpoint=False)

    # Save the tokenizer next to the weights so the endpoint directory can be
    # loaded on its own.
    lm.save_pretrained(endpoint_dir)
    tokenizer.save_pretrained(endpoint_dir)
    print("saved: MODEL")

    log_history = pd.DataFrame(trainer.state.log_history)
    log_history.to_csv(f"{dir_root}log_train_history_{version}.csv")
    print("saved: HISTORY")

## GPT-J-6B

In [None]:
# Fine-tune GPT-J 6B on the Alpaca-style train/test files.
train(
    model="EleutherAI/gpt-j-6B",
    version="gptj-6b-v20231214",
    data_files={
        "train": f"{dir_root}qaa_train.json",
        "test": f"{dir_root}qaa_test.json",
    },
    gp=generate_prompt,
)

## opt-6.7b

In [None]:
# Fine-tune OPT 6.7B on the Alpaca-style train/test files.
train(
    model="facebook/opt-6.7b",
    version="opt-6.7b-v20231214",
    data_files={
        "train": f"{dir_root}qaa_train.json",
        "test": f"{dir_root}qaa_test.json",
    },
    gp=generate_prompt,
)

## Llama-7b

In [None]:
# Fine-tune LLaMA 7B on the Alpaca-style train/test files.
train(
    model="huggyllama/llama-7b",
    version="llama-7b-2-v20231217",
    data_files={
        "train": f"{dir_root}qaa_train.json",
        "test": f"{dir_root}qaa_test.json",
    },
    gp=generate_prompt,
)

In [None]:
# Fine-tune LLaMA 7B with the context-aware prompt.
# NOTE: the *_context.json files are not written by any cell visible in this
# notebook; they must already exist under dir_root.
train(
    model="huggyllama/llama-7b",
    version="llama-7b-context-v20231217",
    data_files={
        "train": f"{dir_root}qaa_train_context.json",
        "test": f"{dir_root}qaa_test_context.json",
    },
    gp=generate_prompt_context,
)

## Llama-2-7b

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably needed because the meta-llama checkpoints trained
# below require authenticated access — confirm. Consider moving this import
# into the imports cell at the top of the notebook.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Fine-tune Llama-2 7B (HF format) on the Alpaca-style train/test files.
train(
    model="meta-llama/Llama-2-7b-hf",
    version="llama-2-7b-hf-v20231217",
    data_files={
        "train": f"{dir_root}qaa_train.json",
        "test": f"{dir_root}qaa_test.json",
    },
    gp=generate_prompt,
)

## Llama-2-7b-hf with context

In [None]:
# Fine-tune Llama-2 7B (HF format) with the context-aware prompt.
# NOTE: the *_context.json files are not written by any cell visible in this
# notebook; they must already exist under dir_root.
train(
    model="meta-llama/Llama-2-7b-hf",
    version="llama-2-7b-hf-context-v20231218",
    data_files={
        "train": f"{dir_root}qaa_train_context.json",
        "test": f"{dir_root}qaa_test_context.json",
    },
    gp=generate_prompt_context,
)