In [199]:
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "importlib",
#     "pandas",
#     "udi-grammar-py",
# ]
# ///

## Imports + Top-Level Variables

In [200]:
import importlib
import json
import sys
from pathlib import Path

import pandas as pd
from udi_grammar_py import Chart, Op, rolling

# Extend the module search path before importing the local pipeline modules below.
sys.path.append('.')

import convert_for_finetuning
import paraphraser
import schema_generation
import template_expansion
import template_generation

# Pipeline switches read by the cells further down.
UPLOAD_TO_HUGGINGFACE = False  # Forwarded as push_to_hub to convert_for_finetuning.convert; set to True to upload the training data to Hugging Face.
PERFORM_PARAPHRASING = False  # Paraphrasing is time consuming; when False, query_base is copied into query unchanged so the rest of the pipeline is quicker to test.

In [201]:
# Pick up on-disk edits to template_generation without restarting the kernel.
importlib.reload(template_generation)

# Build the template frame that the later cells expand and paraphrase.
df = template_generation.generate()

# Row count of the template frame, reported by the sanity-check cell at the end.
template_question_count = len(df)

df.head()

Unnamed: 0,query_template,constraints,spec_template,query_type,creation_method
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template


## Update the data schema based on the files in the ./datasets folder

In [202]:
# Re-import schema_generation so edits to the module on disk take effect without restarting the kernel.
importlib.reload(schema_generation)
# Run the module's entry point. NOTE(review): presumably this rewrites ./datasets/schema.json,
# which the template expansion cell reads next — confirm in schema_generation.
schema_generation.main()

## Contextualize the template training data by putting in real entity names and fields if they satisfy the constraints.

In [203]:
# Pick up on-disk edits to template_expansion without restarting the kernel.
importlib.reload(template_expansion)

# Read the schema with an explicit UTF-8 encoding instead of the platform
# default, so non-ASCII content loads the same way on every OS.
with open('./datasets/schema.json', encoding='utf-8') as schema_file:
    schema_list = json.load(schema_file)

# Expand after the file has been closed; only the parsed schema_list is needed.
# NOTE: this rebinds df, so re-running this cell on its own passes the
# already-expanded frame back into expand(). Re-run the template generation
# cell first to start from the raw templates.
df = template_expansion.expand(df, schema_list)

df.head()

Unnamed: 0,query_template,constraints,spec_template,query_type,creation_method,query_base,spec,solution,dataset_schema
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by weight_u...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by sex?","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by serum_cr...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by rh_blood...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by patholog...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap


## The paraphraser will use an LLM framework to paraphrase the query_base into several options


In [204]:
# Pick up on-disk edits to paraphraser without restarting the kernel.
importlib.reload(paraphraser)

# Row count before paraphrasing, reported by the sanity-check cell at the end.
expanded_question_count = len(df)

if not PERFORM_PARAPHRASING:
    # Paraphrasing skipped: reuse the base query as the final query and fill
    # the two paraphrase attribute columns with the placeholder value -1.
    df['query'] = df['query_base']
    df['expertise'] = -1
    df['formality'] = -1
else:
    df = paraphraser.paraphrase(df)

# Row count after the (optional) paraphrasing step.
paraphrased_question_count = len(df)
df.head()


Unnamed: 0,query_template,constraints,spec_template,query_type,creation_method,query_base,spec,solution,dataset_schema,query,expertise,formality
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by weight_u...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap,"How many donors are there, grouped by weight_u...",-1,-1
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by sex?","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap,"How many donors are there, grouped by sex?",-1,-1
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by serum_cr...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap,"How many donors are there, grouped by serum_cr...",-1,-1
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by rh_blood...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap,"How many donors are there, grouped by rh_blood...",-1,-1
0,"How many <E> are there, grouped by <F:n>?","[F.c * 2 < E.c, F.c < 4]","{""source"": {""name"": ""<E>"", ""source"": ""<E.url>""...",question,template,"How many donors are there, grouped by patholog...","{""source"": {""name"": ""donors"", ""source"": ""./dat...","{'E': {'entity': 'donors', 'url': './data/hubm...",hubmap,"How many donors are there, grouped by patholog...",-1,-1


## Export as json object

In [205]:
# Make sure the output directory exists; to_json does not create missing
# parent directories and would fail on a fresh checkout without ./out.
Path('./out').mkdir(parents=True, exist_ok=True)

# Write the frame as a JSON array with one object per row.
df.to_json('./out/training_data.json', orient='records')

## Upload data to Hugging Face after converting the data frame into the format expected for fine-tuning

In [206]:
# Pick up on-disk edits to convert_for_finetuning without restarting the kernel.
importlib.reload(convert_for_finetuning)

# Read the grammar schema with an explicit UTF-8 encoding instead of the
# platform default, so non-ASCII content loads the same way on every OS.
with open('./datasets/UDIGrammarSchema.json', encoding='utf-8') as grammar_file:
    grammar_schema = json.load(grammar_file)
    # schema_list comes from the template expansion cell above.
    # The upload only happens when UPLOAD_TO_HUGGINGFACE is True.
    convert_for_finetuning.convert(
        df,
        schema_list,
        grammar_schema,
        './out/finetuning_data.json',
        './out/huggingface/',
        push_to_hub=UPLOAD_TO_HUGGINGFACE,
    )


Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 1709.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 1759.66 examples/s]


## Sanity Check output

In [207]:
from IPython.display import Markdown, display

# Summarise the three row counts captured by the earlier cells as a Markdown heading.
summary_text = f"### Generated **{template_question_count:,}** templates and expanded to **{expanded_question_count:,}** questions and paraphrased to **{paraphrased_question_count:,}**."
display(Markdown(summary_text))

### Generated **1** templates and expanded to **86** questions and paraphrased to **86**.