In [None]:
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "importlib",
#     "pandas",
#     "udi-grammar-py",
# ]
# ///

## Imports + Top-Level Variables

In [None]:
# Standard library.
import importlib
import json
import sys

# Put the working directory on the module search path BEFORE importing the
# local pipeline modules: a path entry appended after those imports would come
# too late to help resolve them. The membership check stops re-runs of this
# cell from stacking duplicate '.' entries on sys.path.
if '.' not in sys.path:
    sys.path.append('.')

# Third-party.
from udi_grammar_py import Chart, Op, rolling
import pandas as pd

# Local pipeline modules, kept in their original import order.
import template_generation
import schema_generation
import template_expansion
import paraphraser
import convert_for_finetuning

# --- Pipeline switches read by the cells below ---

# Set to True to upload the training data to Hugging Face
# (forwarded as `push_to_hub` to convert_for_finetuning.convert).
UPLOAD_TO_HUGGINGFACE = False

# Paraphrasing is time consuming; set to False to test the rest of the
# pipeline without it.
PERFORM_PARAPHRASING = True

# If True, only cached data is used for paraphrasing (forwarded to
# paraphraser.paraphrase). Only matters when PERFORM_PARAPHRASING is True.
ONLY_CACHED = True

In [None]:
# Re-import the module so edits to template_generation.py are picked up
# without restarting the kernel.
importlib.reload(template_generation)

# Build the template-level frame. Its row count is captured right away because
# later cells rebind `df`, and the sanity-check cell reports this number.
df = template_generation.generate()
template_question_count = len(df.index)

df.head()

## Update the data schema based on the files in the ./datasets folder

In [None]:
# Re-import the module so edits to schema_generation.py are picked up without
# restarting the kernel; the reload must happen before main() is called.
importlib.reload(schema_generation)
# Per the section heading, main() refreshes the data schema from the files in
# ./datasets. Presumably it writes ./datasets/schema.json, which a later cell
# reads — TODO confirm against schema_generation.py.
schema_generation.main()

## Contextualize the template training data by putting in real entity names and fields if they satisfy the constraints.

In [None]:
# Re-import the module so edits to template_expansion.py are picked up without
# restarting the kernel.
importlib.reload(template_expansion)

# Read the schema with an explicit encoding: JSON text is UTF-8, whereas
# open() without `encoding` falls back to the platform's locale encoding.
with open('./datasets/schema.json', encoding='utf-8') as f:
    schema_list = json.load(f)

# The expansion does not need the file handle, so it runs once the file is
# closed. NOTE: this rebinds `df` from the template frame to the expanded
# frame; re-run the template-generation cell before re-running this one,
# otherwise the already-expanded rows are passed to expand() again.
df = template_expansion.expand(df, schema_list)

df.head()

## The paraphraser uses an LLM framework to paraphrase each `query_base` into several variants


In [None]:
# Re-import the module so edits to paraphraser.py are picked up without
# restarting the kernel.
importlib.reload(paraphraser)

# Row count before paraphrasing; reported by the sanity-check cell.
expanded_question_count = len(df.index)

if not PERFORM_PARAPHRASING:
    # Paraphrasing skipped: reuse the base query verbatim and fill the
    # expertise/formality columns with -1 (presumably a "not paraphrased"
    # placeholder — confirm against paraphraser.py).
    df['query'] = df['query_base']
    df['expertise'] = -1
    df['formality'] = -1
else:
    df = paraphraser.paraphrase(df, ONLY_CACHED)

# Row count after the (optional) paraphrasing step.
paraphrased_question_count = len(df.index)
df.head()


## Sanity Check output

In [None]:
from IPython.display import Markdown, display

# Render a one-line Markdown summary of the row count recorded after each
# pipeline stage (template generation, expansion, paraphrasing).
display(
    Markdown(
        f"### Generated **{template_question_count:,}** templates and expanded to **{expanded_question_count:,}** questions and paraphrased to **{paraphrased_question_count:,}**."
    )
)

## Export as json object

In [None]:
import os

# DataFrame.to_json does not create missing parent directories, so make sure
# ./out exists first (e.g. on a fresh checkout); exist_ok keeps re-runs harmless.
os.makedirs('./out', exist_ok=True)

# orient='records' writes a JSON array containing one object per row.
df.to_json('./out/training_data.json', orient='records')

## Upload data to Hugging Face after converting the data frame into the format expected for fine-tuning

In [None]:
# Re-import the module so edits to convert_for_finetuning.py are picked up
# without restarting the kernel.
importlib.reload(convert_for_finetuning)

# Read the grammar schema with an explicit encoding: JSON text is UTF-8,
# whereas open() without `encoding` falls back to the platform's locale encoding.
with open('./datasets/UDIGrammarSchema.json', encoding='utf-8') as grammar_file:
    grammar_schema = json.load(grammar_file)

# The conversion does not need the file handle, so it runs once the file is
# closed. `schema_list` is the parsed ./datasets/schema.json loaded by the
# template-expansion cell; `push_to_hub` follows the UPLOAD_TO_HUGGINGFACE flag.
convert_for_finetuning.convert(
    df,
    schema_list,
    grammar_schema,
    './out/finetuning_data.json',
    './out/huggingface/',
    push_to_hub=UPLOAD_TO_HUGGINGFACE,
)
