In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pprint
import subprocess

from IPython.display import display, Markdown
from sympy import symbols, Function

import sys
sys.path.append('../')
from functions import gpt

In [2]:
# Domain-context bullet that is spliced verbatim into the LLM prompts below.
purpose = "- The data is for expressions related to neural ordinary differential equation and holographic quantum chromodynamics"

# Identifier of the annotated expression; it becomes part of the output file name.
annotation_name = 'a3'
# Raw string so the LaTeX backslashes stay literal: in a normal string "\f"
# (as in "\frac") is the form-feed escape and "\p" / "\m" are invalid escapes.
math_text = r"""a_{i}(t)=\frac{\partial \mathcal{L}}{\partial x_{i}(t)}"""

paper_short_name = 'hashimoto'
# Variable name under which the dataset is written into the generated script.
dataset_name = 'dataset'

# Path components used to locate this paper's working directory.
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"
work_bucket = "AdS-CFT"
project_folder = "diygenomics-projects"
sub_category = "math"

# Model name passed to gpt.chat_create, and the file name of the generated script.
model = 'gpt-4'
output_file = f'{paper_short_name}_{annotation_name}.py'

# A1 and A2 feed A4 and A8; A3 feeds A5, A6, and A7

In [3]:
# Root of the data tree, read from the DATA_PATH environment variable
# (None when the variable is not set).
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's ``mathpix/generated_code`` directory.

    Args:
        *args: Optional path components appended after ``generated_code``.

    Returns:
        str: The joined path; with no arguments, the directory itself.

    Raises:
        RuntimeError: If the DATA_PATH environment variable was not set when
            ``data_path`` was read. Without this check ``os.path.join`` would
            fail with an opaque ``TypeError`` about ``NoneType``.
    """
    if data_path is None:
        raise RuntimeError(
            "The DATA_PATH environment variable is not set; "
            "set it to the root data directory before running this notebook."
        )
    return os.path.join(
        data_path,
        project_folder,
        sub_category,
        work_bucket,
        base_name,
        'mathpix',
        'generated_code',
        *args,
    )

In [4]:
# Ensure the generated-code directory (file_path() with no extra components)
# exists; exist_ok=True makes re-running this cell a no-op.
os.makedirs(file_path(), exist_ok=True)

In [5]:
# Prompt for the dataset-generation request: asks for a JSON-only input dataset
# for the expression, with `purpose` spliced in as one more bullet.
dataset_prompt = f"""You are an expert mathematician and data scientist.
- I want to generate a dataset as input for a math expression
- The data need to return results that are not NaN or None
{purpose}
- Please format your response in JSON. You only speak JSON. Do not write text that isn't JSON.
- The output of this will be used as input for another expression. 
- Be concise in your output so that it can be fed into the next expression.
"""

In [6]:
# Request the input dataset for the expression from the model.
# NOTE(review): gpt.chat_create is a project helper not visible here;
# output_json=True presumably makes it return parsed JSON (the recorded
# result is a dict with a single 'dataset' key) - confirm against the helper.
dataset = gpt.chat_create(dataset_prompt, math_text, model, output_json=True)

In [7]:
# Show the generated dataset so it can be inspected before it is used below.
dataset

{'dataset': [{'time': 1, 'x_i(t)': 0.5, 'a_i(t)': 0.2},
  {'time': 2, 'x_i(t)': 0.6, 'a_i(t)': 0.3},
  {'time': 3, 'x_i(t)': 0.7, 'a_i(t)': 0.4},
  {'time': 4, 'x_i(t)': 0.8, 'a_i(t)': 0.5},
  {'time': 5, 'x_i(t)': 0.9, 'a_i(t)': 0.6}]}

In [8]:
# Message body for the code-generation request: the expression followed by the
# dataset, which the f-string renders with the dict's str() form.
math_plus_data = f'math expression: {math_text}\ndataset: {dataset}'

In [9]:
# Primer prompts intended for asking the model to write the code-generation
# prompt itself. NOTE(review): in the visible notebook they are referenced only
# by a commented-out gpt.chat_create call, so they currently have no effect.
system_prompt_primer = """You are an expert mathematician, data scientist and prompt engineer."""

# Embeds the expression and the dataset (rendered with the dict's str() form).
user_prompt_primer = f"""I have this math expression: 

{math_text}

I have this dataset: 

{dataset}

- Make sure that you use the variable name 'dataset'.
- Please help me create a prompt that will allow an LLM to generate python code that can be executed to print the results of applying 
the dataset to the python code.
- Do not include any unnecessary comments. 
- Only include the required output that can be directly used as input to another call.
- Do not add anything prior to providing the prompt. 
- Only include the prompt.
- Do not say Sure
"""

In [10]:
# user_prompt = gpt.chat_create(system_prompt_primer, user_prompt_primer, model, output_json=False)

In [11]:
# The prompt tells the model which top-level key to index, so resolve that key
# once. An empty dataset has no first key; fail with a clear message instead of
# the bare StopIteration that next(iter(...)) would raise inside the f-string.
if not dataset:
    raise ValueError(
        "The generated dataset is empty; re-run the dataset cell before building the code prompt."
    )
dataset_key = next(iter(dataset))

# Prompt for the code-generation request: Python-only output that reads the
# dataset from the variable named by `dataset_name` and prints its results.
code_prompt = f"""You are an expert mathematician and data scientist.
- Please format your response in the python coding language. You only speak python. Do not write text that isn't python.
- Do not include any comments in your code.
- Do not include the dataset in your response. 
- The dataset variable name will be {dataset_name}.
{purpose}
- Make sure that the generated code prints the results.
- Check your work and make sure that the dataset works with the python code that you provide.
- Check the first key in the dataset. 
- The first key is '{dataset_key}'. 
- It is very important that you make sure that you access the dataset dict with the provided key -> '{dataset_key}'.
- Make sure that your code prints results that were fully executed and return numbers or lists of numbers
"""

# user_prompt = f"""Generate Python code that takes as input a list of dictionaries, where each dictionary represents a data point in a dataset. 
# Each dictionary has three key-value pairs: 'time', 'x_i', and 'a_i'. 
# The code should implement the mathematical expression {math_text}. 
# Use the following dataset as an example for testing:
# {dataset}
# """

In [12]:
# Request the Python code for the expression; the result is written to a file
# and executed further down.
# NOTE(review): output_json=False presumably returns the raw text of the reply
# (it is later passed to Markdown() and f.write()) - confirm against gpt.chat_create.
python_code = gpt.chat_create(code_prompt, math_plus_data, model, output_json=False)

In [13]:
# Render the generated code as Markdown so it can be reviewed before it is run.
display(Markdown(python_code))

import pandas as pd

df = pd.DataFrame(dataset['dataset'])

print(df)

In [14]:
# Render the dataset as a Python literal rather than JSON: json.dumps emits
# null/true/false, which are not valid Python names, so a dataset containing
# them would make the generated script fail as soon as it is run.
# sort_dicts=False keeps the dataset's original key order.
pretty_dataset = pprint.pformat(dataset, sort_dicts=False)

# Write the script as "<dataset_name> = <literal>" followed by the generated
# code. Explicit UTF-8 matches Python's default source encoding, so non-ASCII
# characters in the generated code survive on any platform.
with open(file_path(output_file), 'w', encoding='utf-8') as f:
    f.write(f'{dataset_name} = {pretty_dataset}\n\n')
    f.write(python_code)

In [15]:
# Run the generated script with the interpreter that runs this kernel, so it
# sees the same installed packages; a bare 'python' may resolve to a different
# interpreter on PATH. Fall back to 'python' if sys.executable is unavailable.
# NOTE(review): this executes model-generated code without sandboxing or a
# timeout - review the displayed code before running this cell.
command = [sys.executable or 'python', file_path(output_file)]
result = subprocess.run(command, text=True, capture_output=True)

output = result.stdout
error_output = result.stderr

# Report stderr on failure and stdout on success.
if result.returncode != 0:
    print("Error output:", error_output)
else:
    print("Output:", output)

Output:    time  x_i(t)  a_i(t)
0     1     0.5     0.2
1     2     0.6     0.3
2     3     0.7     0.4
3     4     0.8     0.5
4     5     0.9     0.6

