In [1]:
from typing import List
from datasets import load_dataset
from pyprojroot import here
import yaml
from functools import partial

In [4]:
# Load the application configuration into a plain dict.
# The path is relative, so it is resolved against the kernel's current working directory.
# An explicit encoding keeps parsing independent of the platform's default locale encoding.
with open("configs/config.yml", encoding="utf-8") as cfg:
    # SECURITY NOTE: yaml.safe_load is the loader recommended for untrusted input;
    # FullLoader is kept here because the file is a local project config.
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

In [5]:
# Echo the parsed configuration so the loaded values are visible in the cell output.
app_config

{'raw_data_dir': {'dir': 'data/raw',
  'technical_support_pdf_dir': 'data/raw/technical support.pdf'},
 'json_dir': {'dir': 'data/json',
  'technical_support_qa': 'data/json/technical_support_qa.json',
  'product_user_manual_instruction_response': 'data/json/product_user_manual.json'},
 'interim_dir': {'dir': 'data/interim',
  'cubetriangle_qa': 'data/interim/output.json',
  'cubetriangle_instruction_response': 'data/interim/cubetriangle_instruction_response.jsonl'},
 'model_dir': {'llama2_7b': 'models/converted_llama_models/llama-2-7b',
  'llama2_7b_chat': 'models/converted_llama_models/llama-2-7b-chat'},
 'finetuned_model_dir': 'models/fine_tuned_models/CubeTriangle_{}_{}_{}',
 'llama_cfg': {'max_seq_len': 512, 'max_batch_size': 6},
 'data_type': 'qa_in_input_ids_qa_in_label',
 'llm_function_caller': {'gpt_model': 'gpt-35-turbo-16k',
  'temperature': 0,
  'system_role': 'You are a helpful CubeTriangle chatbot. Your goal is to interact with customers and treat them respectfully. Greet

In [7]:
# Module-level defaults consumed by the function definitions further down.
# They are used as default argument values, so they are captured when the `def`
# cells run; re-running this cell afterwards does not change those defaults
# until the function cells are re-run as well.

# Path of the interim Q&A JSON file, taken verbatim from the config
# (a relative path, resolved against the current working directory).
cubetriangle_qa_interim_dir = str(
    app_config["interim_dir"]["cubetriangle_qa"])
# Disabled: project-root-anchored path for the instruction/response data set.
#cubetriangle_instruction_response_interim_dir = str(
    #here(app_config["interim_dir"]["cubetriangle_instruction_response"]))
# Upper bound on the number of tokens kept per example.
# NOTE(review): the config's llama_cfg.max_seq_len is 512 while this cap is 2048 —
# confirm which limit is intended for fine-tuning.
tokenizer_max_length = 2048

In [10]:
def tokenize_the_data(examples,
                      tokenizer,
                      tokenizer_max_length: int = tokenizer_max_length,
                      column_names: List = ["question", "answer"],
                      data_type: str = "cubetriangle"
                      ):
    """Tokenize the first example of a batch, keeping at most ``tokenizer_max_length`` tokens.

    Only element 0 of each column is read, so the function is meant to be
    driven with one example per batch.

    Args:
        examples: Mapping from column name to a sequence of values
            (presumably the dict-of-lists batch that ``Dataset.map`` passes
            when ``batched=True`` — verify against the caller).
        tokenizer: Callable tokenizer exposing ``pad_token``, ``eos_token`` and
            ``truncation_side`` attributes (presumably a Hugging Face
            tokenizer). It is mutated: ``pad_token`` is set to ``eos_token``
            and ``truncation_side`` is set to ``"left"``.
        tokenizer_max_length: Maximum number of tokens to keep; longer inputs
            are truncated from the left, i.e. the end of the text is kept.
        column_names: For ``data_type="cubetriangle"``, the two columns whose
            first values are concatenated (in order, without a separator) to
            form the text.
        data_type: ``"cubetriangle"`` to read ``column_names``, or
            ``"guanaco"`` to read the ``"text"`` column.

    Returns:
        The tokenizer's output for the single text, requested as NumPy
        tensors (``return_tensors="np"``).

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
    """
    if data_type == "cubetriangle":
        text = examples[column_names[0]][0] + examples[column_names[1]][0]
    elif data_type == "guanaco":
        text = examples["text"][0]
    else:
        raise ValueError(
            "Invalid data_type. Supported values are 'cubetriangle' and 'guanaco'.")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"

    # A single truncating pass is enough: for one text, truncating to
    # min(untruncated_length, tokenizer_max_length) gives the same ids as
    # truncating to tokenizer_max_length, so no separate length-measuring
    # tokenization is needed.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=tokenizer_max_length,
    )

In [9]:
def prepare_cubetrianlge_qa_dataset(tokenizer,
                                    tokenizer_max_length: int = tokenizer_max_length,
                                    column_names: List = [
                                        "question", "answer"],
                                    data_dir: str = cubetriangle_qa_interim_dir,
                                    data_type: str = "cubetriangle"
                                    ):
    """Load a JSON data set and attach tokenized inputs plus a ``labels`` column.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize_the_data``.
        tokenizer_max_length: Token cap forwarded to ``tokenize_the_data``.
        column_names: Column names forwarded to ``tokenize_the_data``.
        data_dir: Path handed to ``load_dataset`` as ``data_files``.
        data_type: Data layout selector forwarded to ``tokenize_the_data``.

    Returns:
        The mapped data set with an extra ``labels`` column holding the same
        values as its ``input_ids`` column.
    """
    raw_dataset = load_dataset("json", data_files=data_dir, split="train")
    print("Raw dataset shape:", raw_dataset)

    # Pin every argument except the batch itself, which `map` supplies.
    fixed_arguments = dict(
        tokenizer=tokenizer,
        tokenizer_max_length=tokenizer_max_length,
        column_names=column_names,
        data_type=data_type,
    )
    tokenize_single = partial(tokenize_the_data, **fixed_arguments)

    # One example per batch: the tokenizing function reads only element 0.
    map_options = {"batched": True, "batch_size": 1, "drop_last_batch": True}
    tokenized = raw_dataset.map(tokenize_single, **map_options)

    return tokenized.add_column("labels", tokenized["input_ids"])