In [1]:
%pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from openai import OpenAI
import os
import glob
import json
import random

In [3]:
# Builds all the data into a single JSONL file
def collectTQLFiles(input_directory, output_file):
    """Write one chat-format record per .tql file into a single JSONL file.

    Each output line is a JSON object with a single "messages" key holding a
    fixed system prompt, a fixed user prompt, and an assistant message whose
    content is the text of one .tql file exactly as read.

    Files are processed in sorted path order so repeated runs produce the same
    output, and files that are empty or whitespace-only are skipped because
    they would yield a record with no assistant content.

    Args:
        input_directory: Directory searched (non-recursively) for "*.tql" files.
        output_file: Path of the JSONL file to create; an existing file is
            overwritten.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant, returning correct TQL content."
    }

    user_message = {
        "role": "user",
        "content": "Please share the TQL content."
    }

    # glob returns paths in arbitrary order; sort for reproducible output.
    tql_paths = sorted(glob.glob(os.path.join(input_directory, "*.tql")))

    with open(output_file, "w", encoding="utf-8") as out_f:
        for tql_file in tql_paths:
            with open(tql_file, "r", encoding="utf-8") as f:
                tql_content = f.read()

            # A blank file would produce an example with nothing to learn from.
            if not tql_content.strip():
                continue

            record = {
                "messages": [
                    system_message,
                    user_message,
                    {
                        "role": "assistant",
                        "content": tql_content
                    }
                ]
            }
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")


input_dir = "/Users/Shantanu/downloads/txt2tql"
# The file holds one JSON object per line, so it carries the .jsonl extension;
# this is also the name the validation call later in this cell works with.
output_file = "tql_data.jsonl"

collectTQLFiles(input_dir, output_file)

def validate_jsonl(file_path):
    """Check that every line of a file parses as JSON and print the outcome.

    Prints "JSON Lines file is valid." when every line parses. On the first
    line that fails, prints the 1-based line number within the file together
    with the decoder's message and stops checking. Any other error (for
    example a missing file) is reported with a generic message. Nothing is
    returned.

    Args:
        file_path: Path of the JSON Lines file to check; it is read as UTF-8.
    """
    try:
        # Decode explicitly as UTF-8 instead of the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    # Each line is parsed on its own, so the decoder's position
                    # is relative to that line; report the line in the file.
                    print(f"Invalid JSON in line {line_number}: {e}")
                    return
        print("JSON Lines file is valid.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Validate the path that was just written instead of a separately typed name,
# so the check cannot drift from the file actually produced above.
validate_jsonl(output_file)

JSON Lines file is valid.


In [4]:
# More usefully, generates training, validation, and test splits
def combine_files(input_dir):
    """Load the stripped text of every non-empty .tql file in a directory.

    Args:
        input_dir: Directory searched (non-recursively) for "*.tql" files.

    Returns:
        A list with one whitespace-stripped string per file; files whose
        content is empty after stripping are left out.

    Raises:
        FileNotFoundError: If the directory contains no .tql files.
    """
    tql_paths = glob.glob(os.path.join(input_dir, "*.tql"))
    if len(tql_paths) == 0:
        raise FileNotFoundError(f"No .tql files found in directory: {input_dir}")

    stripped_texts = []
    for tql_path in tql_paths:
        with open(tql_path, "r", encoding="utf-8") as handle:
            stripped_texts.append(handle.read().strip())

    # Drop files that contained nothing but whitespace.
    return [text for text in stripped_texts if text]


def split_corpus(corpus, train_ratio, val_ratio, test_ratio, seed=None):
    """Shuffle a copy of the corpus and split it into train/validation/test lists.

    Args:
        corpus: Sequence of samples. It is copied before shuffling, so the
            caller's list keeps its original order.
        train_ratio: Fraction of samples assigned to the training set.
        val_ratio: Fraction of samples assigned to the validation set.
        test_ratio: Fraction of samples assigned to the test set. The test set
            is whatever remains after the first two slices, so the three
            ratios are required to sum to 1 for this value to be honoured.
        seed: Optional seed for a reproducible shuffle. When None, the shared
            ``random`` module state is used.

    Returns:
        A tuple ``(train_set, val_set, test_set)`` of lists.

    Raises:
        ValueError: If any ratio is negative or the ratios do not sum to 1.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(f"Split ratios must be non-negative, got {ratios}")
    # Small tolerance so float sums such as 0.7 + 0.15 + 0.15 are accepted.
    if abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(f"Split ratios must sum to 1, got {ratios}")

    # Shuffle a copy: mutating the argument would silently reorder the
    # caller's list and make re-running the split depend on prior runs.
    shuffled = list(corpus)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_set = shuffled[:train_end]
    val_set = shuffled[train_end:val_end]
    test_set = shuffled[val_end:]

    return train_set, val_set, test_set

def write_split(output_dir, filename, data):
    """Save a list of samples to ``output_dir/filename``, blank-line separated.

    The output directory is created if it does not exist, and an existing
    file with the same name is overwritten.

    Args:
        output_dir: Directory that will contain the file.
        filename: Name of the file to write inside ``output_dir``.
        data: Iterable of strings; samples are joined with two newlines.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)
    separator = "\n\n"  # one empty line between consecutive samples
    with open(destination, "w", encoding="utf-8") as sink:
        sink.write(separator.join(data))

# Build the corpus, split it 60/20/20, and write each split to its own file.
corpus = combine_files(input_dir)
train_set, val_set, test_set = split_corpus(corpus, 0.6, 0.2, 0.2)
for split_filename, split_samples in (
    ("train.tql", train_set),
    ("val.tql", val_set),
    ("test.tql", test_set),
):
    write_split("/Users/Shantanu/Documents/GitHub/txt2tql", split_filename, split_samples)

In [8]:
# ignore this chunk: this is just to test the API
# Read the key from the environment so the secret never lives in the notebook.
# Any key that was previously pasted into this cell should be treated as leaked
# and revoked in the OpenAI dashboard.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# completion = client.chat.completions.create(
#   model="gpt-4o-mini",
#   store=True,
#   messages=[
#     {"role": "user", "content": "write a haiku about ai"}
#   ]
# )
# print(completion.choices[0].message);

In [None]:
def generate_tql_queries(client, tql_content, num_queries=1, model="gpt-3.5-turbo"):
    """Ask a chat model for natural-language questions that a TQL query answers.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``,
            such as an ``OpenAI`` client.
        tql_content: Text of the TQL query to describe.
        num_queries: Number of questions requested in the prompt.
        model: Name of the chat model to call; defaults to the model this
            function previously hard-coded.

    Returns:
        The content of the first returned choice, or None if the request or
        the response handling raised an exception (the error is printed).
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert at generating natural language queries that would map to TQL queries. Generate realistic user questions that would require this TQL to answer them."},
                {"role": "user", "content": f"Given this TQL query:\n\n{tql_content}\n\nGenerate {num_queries} natural language question(s) that this TQL query would answer."}
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this query.
        print(f"Error generating queries: {e}")
        return None

def process_tql_files_with_queries(input_dir, output_file, client, queries_per_tql=3):
    """Generate natural-language queries for each .tql file and save them as JSONL.

    For every "*.tql" file in ``input_dir`` the stripped file text is passed to
    ``generate_tql_queries``. When that call returns a truthy value, one JSON
    line is written with the keys "tql", "generated_queries" and "file_name".
    Files for which it returns a falsy value are left out of the output.

    Files are visited in sorted path order so repeated runs write records in
    the same order, and files that are empty after stripping are skipped
    before any request is made.

    Args:
        input_dir: Directory searched (non-recursively) for "*.tql" files.
        output_file: Path of the JSONL file to create; an existing file is
            overwritten.
        client: Client object forwarded to ``generate_tql_queries``.
        queries_per_tql: Number of questions requested per TQL file.
    """
    # glob returns paths in arbitrary order; sort for reproducible output.
    tql_paths = sorted(glob.glob(os.path.join(input_dir, "*.tql")))

    with open(output_file, "w", encoding="utf-8") as out_f:
        for tql_file in tql_paths:
            with open(tql_file, "r", encoding="utf-8") as f:
                tql_content = f.read().strip()

            # Nothing to describe: avoid spending a request on an empty query.
            if not tql_content:
                continue

            # Generate queries for this TQL
            queries = generate_tql_queries(client, tql_content, queries_per_tql)

            if queries:
                record = {
                    "tql": tql_content,
                    "generated_queries": queries,
                    "file_name": os.path.basename(tql_file)
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Example usage (with your API key stored securely)
# client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))  # Get API key from environment variable
input_dir = "/Users/Shantanu/downloads/txt2tql"
output_file = "tql_with_queries.jsonl"

# Keyword arguments make it explicit which value feeds which parameter.
process_tql_files_with_queries(
    input_dir=input_dir,
    output_file=output_file,
    client=client,
    queries_per_tql=3,
)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable