In [None]:
# Set up Gorilla RAFT dependencies

In [None]:
# Run the repository's setup script from the notebook's working directory.
# NOTE(review): presumably this fetches the Gorilla RAFT tooling into `.gorilla/`,
# which later cells invoke — confirm against setup_raft.sh.
! ./setup_raft.sh

In [None]:
! pip install dotenv pandas mdc openai datasets transformers PyPDF2 langchain_experimental langchain_openai azure-identity coloredlogs

In [None]:
import os
from math import ceil
from dotenv import load_dotenv

# ---------------------------------------------------------------------
# LOAD CONFIGURATION
# ---------------------------------------------------------------------
# Load settings from config.env; they are read back through os.getenv below
# and by the `!` shell commands in later cells.
load_dotenv("config.env")

# ---------------------------------------------------------------------
# DATASET CONFIGURATION
# ---------------------------------------------------------------------
ds_name = os.getenv("DATASET_NAME")
ds_file = os.getenv("DATASET_FILE")

# Fail fast: without these two settings every derived path below would contain
# the literal text "None" and the failure would only surface later, inside raft.py.
missing_settings = [
    key
    for key, value in (("DATASET_NAME", ds_name), ("DATASET_FILE", ds_file))
    if not value
]
if missing_settings:
    raise ValueError(
        "Missing required setting(s): "
        + ", ".join(missing_settings)
        + ". Define them in config.env or in the environment."
    )

# Exported so the shell cell that runs raft.py can reference $DATAFILE_PATH
os.environ["DATAFILE_PATH"] = f"sample_data/{ds_name}/{ds_file}"

# Define dataset output paths (exported as $DATASET_OUTPUT_PATH for shell cells)
ds_path = f"dataset/{ds_name}"
os.environ["DATASET_OUTPUT_PATH"] = ds_path

# ---------------------------------------------------------------------
# TRAINING PARAMETERS
# ---------------------------------------------------------------------
finetuning_train_split = 0.8
finetuning_valid_split = 0.1
finetuning_threshold = 65
raft_questions = 2
# Total number of QA samples needed so that the train fraction alone
# still reaches `finetuning_threshold` samples.
qa_threshold = ceil(finetuning_threshold / finetuning_train_split)

# ---------------------------------------------------------------------
# PRINT CONFIGURATION SUMMARY
# ---------------------------------------------------------------------
print(
    f"""
═══════════════════════════════════════════════════════════════════════
    RAFT Synthetic Dataset Generation - Configuration Overview
═══════════════════════════════════════════════════════════════════════

MODEL & ROUTER
──────────────────────────────────────────────
 Multi-Model Router URL  : {os.getenv('OPENAI_BASE_URL')}
 Teacher Model           : {os.getenv('TEACHER_MODEL_ID')}
 Embedding Model         : {os.getenv('EMBEDDING_MODEL_ID')}

DATASET SETUP
──────────────────────────────────────────────
 Dataset Name            : {ds_name}
 Dataset File            : {ds_file}
 Training Document Path  : {os.getenv('DATAFILE_PATH')}
 Output Dataset Path     : {ds_path}

TRAINING PARAMETERS
──────────────────────────────────────────────
 Finetuning Train Split  : {finetuning_train_split}
 Finetuning Valid Split  : {finetuning_valid_split}
 Finetuning Threshold    : {finetuning_threshold}
 QA Threshold (derived)  : {qa_threshold}
 Questions per Chunk     : {raft_questions}

═══════════════════════════════════════════════════════════════════════
"""
)

In [None]:
!python3 .gorilla/raft/raft.py \
    --datapath $DATAFILE_PATH \
    --output $DATASET_OUTPUT_PATH \
    --doctype pdf \
    --chunk_size 512 \
    --questions 2 \
    --distractors 3 \
    --embedding_model $EMBEDDING_MODEL_ID \
    --completion_model $TEACHER_MODEL_ID

In [None]:
# Derive every intermediate and final file location from the dataset output path.
files_prefix = f"{ds_path}-files/{ds_name}"

# Raw RAFT output and its full HF JSONL conversion; both are exported so shell
# cells can refer to them as $RAFT_ARROW_FILE and $DATASET_PATH_HF.
raft_arrow_file = f"{ds_path}/data-00000-of-00001.arrow"
dataset_path_hf = f"{files_prefix}-hf.full.jsonl"
os.environ.update(RAFT_ARROW_FILE=raft_arrow_file, DATASET_PATH_HF=dataset_path_hf)

# HF-format splits (synthetic data)
dataset_path_hf_train, dataset_path_hf_valid, dataset_path_hf_eval = (
    f"{files_prefix}-hf.{split}.jsonl" for split in ("train", "valid", "eval")
)

# Finetuning-format splits
dataset_path_ft_train, dataset_path_ft_valid = (
    f"{files_prefix}-ft.{split}.jsonl" for split in ("train", "valid")
)

# Summary of the files the following cells will read and write
summary_lines = [
    "",
    "Intermediate Dataset Files",
    "--------------------------",
    f"RAFT arrow file        : {raft_arrow_file}",
    "",
    "HF JSONL (synthetic data)",
    f"  Full dataset         : {dataset_path_hf}",
    f"  Train split          : {dataset_path_hf_train}",
    f"  Valid split          : {dataset_path_hf_valid}",
    f"  Eval split           : {dataset_path_hf_eval}",
    "",
    "Finetuning JSONL (final RAFT-style)",
    f"  Train split          : {dataset_path_ft_train}",
    f"  Valid split          : {dataset_path_ft_valid}",
    "",
]
print("\n".join(summary_lines))


In [None]:
# Convert the RAFT arrow file ($RAFT_ARROW_FILE) to the full HF-format JSONL
# ($DATASET_PATH_HF). Both variables are exported to the environment by the
# preceding path-definition cell, so the shell resolves them.
! python .gorilla/raft/format.py \
    --input $RAFT_ARROW_FILE \
    --output $DATASET_PATH_HF \
    --output-format hf

In [None]:
# Load the full HF-format JSONL written by the previous step and show its first rows
import pandas as pd

hf_full_df = pd.read_json(path_or_buf=dataset_path_hf, lines=True)
hf_full_df.head()

In [None]:
# Display one fixed sample from the HF dataset to inspect structure and content
from IPython.display import display, Markdown
from random import randint  # not used in this cell; kept in case other cells rely on it

# Preferred row is index 2; clamp to the last row so datasets with fewer than
# three samples still render instead of raising IndexError.
sample_idx = min(2, len(hf_full_df) - 1)
sample = hf_full_df.iloc[sample_idx]

# Wrap the RAFT marker tags in backticks so Markdown shows them literally
# rather than treating them as HTML-like tags.
instruction_md = sample.instruction.replace("<DOCUMENT>", "`<DOCUMENT>`").replace("</DOCUMENT>", "`</DOCUMENT>`")  # prepared but not rendered below
oracle_context_md = sample.oracle_context.replace("<DOCUMENT>", "`<DOCUMENT>`").replace("</DOCUMENT>", "`</DOCUMENT>`")
sample_answer_md = sample.cot_answer.replace("<ANSWER>", "`<ANSWER>`").replace("##begin_quote##", "`##begin_quote##`").replace("##end_quote##", "`##end_quote##`")

display(Markdown(f"""
## Oracle Context
{oracle_context_md}

## Question
{sample.question}

## CoT Answer
{sample_answer_md}
"""))

In [None]:
# Split the HF JSONL dataset into train/valid/eval splits and write them to disk
import numpy as np  # no longer needed by this cell; kept in case other cells rely on `np`

# Row boundaries: [0, train_cut) -> train, [train_cut, valid_cut) -> valid, rest -> eval
samples_count = len(hf_full_df)
train_cut = int(finetuning_train_split * samples_count)
valid_cut = int((finetuning_train_split + finetuning_valid_split) * samples_count)
splits = [train_cut, valid_cut]

print(
    f"""
Splitting HF dataset
--------------------
Total samples : {samples_count}
Train split   : 0 -> {train_cut}        -> {dataset_path_hf_train}
Valid split   : {train_cut} -> {valid_cut} -> {dataset_path_hf_valid}
Eval split    : {valid_cut} -> {samples_count} -> {dataset_path_hf_eval}
"""
)

# Positional slicing yields the same three consecutive row ranges as
# np.split(hf_full_df, splits), without going through DataFrame.swapaxes,
# which recent pandas versions deprecate.
hf_train_df = hf_full_df.iloc[:train_cut]
hf_valid_df = hf_full_df.iloc[train_cut:valid_cut]
hf_eval_df = hf_full_df.iloc[valid_cut:]

# Every row must land in exactly one split.
assert len(hf_train_df) + len(hf_valid_df) + len(hf_eval_df) == samples_count, (
    "train/valid/eval splits do not cover the full dataset"
)

hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

In [None]:
! python .gorilla/raft/format.py \
    --input $dataset_path_hf_train \
    --input-type jsonl \
    --output $dataset_path_ft_train \
    --output-format $format \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

In [None]:
! python .gorilla/raft/format.py \
    --input $dataset_path_hf_valid \
    --input-type jsonl \
    --output $dataset_path_ft_valid \
    --output-format $format \
    --output-completion-prompt-column text\
    --output-completion-completion-column ground_truth

In [None]:
# Inspect finetuning and eval splits
from IPython.display import display

dataset_path_ft_valid_df = pd.read_json(dataset_path_ft_valid, lines=True)

# A cell only auto-renders its final expression, so both previews are displayed
# explicitly; otherwise the finetuning preview would be silently discarded.
display(dataset_path_ft_valid_df.head(2))
display(pd.read_json(dataset_path_hf_eval, lines=True).head(2))