In [None]:
# SETUP Gorilla Raft Dependencies

In [None]:
# Run the repository's setup script from the notebook's working directory.
# Later cells invoke scripts under `.gorilla/raft/`, so this presumably fetches
# that checkout — TODO confirm against setup_raft.sh.
! ./setup_raft.sh

In [None]:
! pip install dotenv pandas mdc openai datasets transformers PyPDF2 langchain_experimental langchain_openai azure-identity coloredlogs

In [None]:
import os
from dotenv import load_dotenv

# LOAD SETTINGS
# load_dotenv() returns False when it did not set any variable (e.g. the file is
# missing or empty). Surface that here instead of failing later in the shell cells
# that read the model settings from the environment. Only a warning is printed,
# because the variables may legitimately be provided by the parent environment.
if not load_dotenv("config.env"):
    print("WARNING: no settings were loaded from config.env; relying on variables already present in the environment")

print(f"Using multi-model: {os.getenv('OPENAI_BASE_URL')}")

In [None]:
# --- Run configuration -------------------------------------------------------
# Dataset identifier and output flavour.
# NOTE: `format` shadows the Python builtin of the same name; it is kept because
# the formatting cells pass it to the shell as `$format`.
ds_name, format = "surfing", "chat"

# Source document, exported through the environment so `!` cells can read $DOC_PATH.
os.environ["DOC_PATH"] = "sample_data/surfing/Surfing-Wikipedia.pdf"

# Fractions of the generated samples assigned to the train / validation splits;
# the remainder is left for evaluation.
finetuning_train_split, finetuning_valid_split = 0.8, 0.1
# Number of training samples targeted for fine-tuning (feeds the QA threshold).
finetuning_threshold = 65
raft_questions = 2

print("Training Document Path:", os.environ.get("DOC_PATH"))

In [None]:
import pandas as pd

# MAP DATASET LOCATION
# Everything for this dataset lives under "dataset/<name>".
ds_path = "dataset/" + ds_name
ds_output_file = ds_path + ".jsonl"

# Exported so the `!` shell cells can read it as $DATASET_OUTPUT_PATH.
os.environ.update({"DATASET_OUTPUT_PATH": ds_path})

print("Dataset name:", ds_name)
print("Dataset path:", ds_path)

In [None]:
from math import ceil

# Total number of QA samples to generate so that the training share alone
# (finetuning_train_split of the total) reaches finetuning_threshold; rounded up.
_required_total = finetuning_threshold / finetuning_train_split
qa_threshold = ceil(_required_total)

print(f"QA threshold: {qa_threshold}")

In [None]:
!python3 .gorilla/raft/raft.py \
    --datapath $DOC_PATH \
    --output $DATASET_OUTPUT_PATH \
    --doctype pdf \
    --chunk_size 512 \
    --questions 2 \
    --distractors 3 \
    --embedding_model $EMBEDDING_MODEL_ID \
    --completion_model $TEACHER_MODEL_ID

In [None]:
# Arrow file read by the formatting step; exported for `!` shell cells.
raft_arrow_file = f"{ds_path}/data-00000-of-00001.arrow"
os.environ["RAFT_ARROW_FILE"] = raft_arrow_file

# All derived files share the "<ds_path>-files/<ds_name>" prefix.
_files_prefix = f"{ds_path}-files/{ds_name}"

# Full dataset: generic JSONL and the HF-format export (the latter is also exported).
dataset_path = _files_prefix + "-full.jsonl"
dataset_path_hf = _files_prefix + "-hf.full.jsonl"
os.environ["DATASET_PATH_HF"] = dataset_path_hf

# HF-format splits.
dataset_path_hf_train, dataset_path_hf_valid, dataset_path_hf_eval = (
    f"{_files_prefix}-hf.{split_name}.jsonl" for split_name in ("train", "valid", "eval")
)

# Fine-tuning-format splits (train and validation only).
dataset_path_ft_train, dataset_path_ft_valid = (
    f"{_files_prefix}-ft.{split_name}.jsonl" for split_name in ("train", "valid")
)

print(f"Reading arrow file {raft_arrow_file}")

In [None]:
# Convert the RAFT arrow output into the `hf` output format.
# $RAFT_ARROW_FILE and $DATASET_PATH_HF are environment variables exported through
# os.environ in the path-definition cell.
! python .gorilla/raft/format.py \
    --input $RAFT_ARROW_FILE \
    --output $DATASET_PATH_HF \
    --output-format hf

In [None]:
# Load the full HF-format export (one JSON record per line) and preview it.
hf_full_df = pd.read_json(path_or_buf=dataset_path_hf, lines=True)
hf_full_df.head(n=5)

In [None]:
from IPython.display import display, Markdown
from random import randint

def _as_md_literal(text, *markers):
    """Wrap each marker found in `text` in backticks so Markdown renders it verbatim.

    Markers are replaced one after another, in the order given.
    """
    for marker in markers:
        text = text.replace(marker, f"`{marker}`")
    return text


# Show a fixed sample so the rendered output is reproducible. The index is clamped
# to the last row so datasets with fewer than three samples do not raise IndexError.
sample_idx = min(2, len(hf_full_df) - 1)
sample = hf_full_df.iloc[sample_idx]

# instruction_md is prepared alongside the other fields but is not part of the
# rendered summary below.
instruction_md = _as_md_literal(sample.instruction, "<DOCUMENT>", "</DOCUMENT>")
oracle_context_md = _as_md_literal(sample.oracle_context, "<DOCUMENT>", "</DOCUMENT>")
sample_answer_md = _as_md_literal(sample.cot_answer, "<ANSWER>", "##begin_quote##", "##end_quote##")
display(Markdown(f"""
## Oracle Context
{oracle_context_md}

## Question
{sample.question}

## CoT Answer
{sample_answer_md}
"""))

In [None]:
# split dataset into 80%/10%/10%
import numpy as np

# Split the rows, in their existing order, into train / validation / evaluation
# partitions at the configured fractions. Plain positional slicing is used instead
# of np.split, which on a DataFrame goes through the deprecated DataFrame.swapaxes.
samples_count = len(hf_full_df)
train_end = int(finetuning_train_split * samples_count)
valid_end = int((finetuning_train_split + finetuning_valid_split) * samples_count)
splits = [train_end, valid_end]
print(f"Splitting dataset at {splits}")

hf_train_df = hf_full_df.iloc[:train_end]
hf_valid_df = hf_full_df.iloc[train_end:valid_end]
hf_eval_df = hf_full_df.iloc[valid_end:]

# Sanity check: the three partitions cover every row exactly once.
assert len(hf_train_df) + len(hf_valid_df) + len(hf_eval_df) == samples_count

# Persist each partition as JSON Lines (one record per line).
hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

In [None]:
! python .gorilla/raft/format.py \
    --input $dataset_path_hf_train \
    --input-type jsonl \
    --output $dataset_path_ft_train \
    --output-format $format \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

In [None]:
! python .gorilla/raft/format.py \
    --input $dataset_path_hf_valid \
    --input-type jsonl \
    --output $dataset_path_ft_valid \
    --output-format $format \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

In [None]:
# Read back the fine-tuning validation file (JSON Lines) and preview two records.
dataset_path_ft_valid_df = pd.read_json(path_or_buf=dataset_path_ft_valid, lines=True)
dataset_path_ft_valid_df.head(n=2)

In [None]:
# Preview two records of the HF-format evaluation split (JSON Lines).
pd.read_json(path_or_buf=dataset_path_hf_eval, lines=True).head(n=2)