In [1]:
# imports
import re
import os
import sys
import io
import json
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import gradio as gr
from pathlib import Path
from datetime import datetime
import requests
import subprocess
from IPython.display import Markdown, display, update_display
import ollama

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Looked up here for convenience; the client below is constructed without
# arguments, so it is presumably picking the key up from the environment itself.
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Model identifiers used by the two streaming back-ends.
MODEL_LLAMA = "llama3.2"
OPENAI_MODEL = "gpt-4o-mini"

# Shared OpenAI client used by stream_gpt().
openai = OpenAI()

In [3]:
# System message sent with every request to either model.
system_prompt = (
    "You are a helpful assistant who can generate dataset if given business problem."
)

In [4]:
def get_user_prompt_tabular(business_problem, dataset_format, file_format, num_samples):
    """Build the user prompt asking for dataset-generating Python code.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Dataset modality name inserted into the prompt.
        file_format: Output file format name inserted into the prompt.
        num_samples: Number of samples the generated dataset should contain.

    Returns:
        The prompt text, with every line indented by four spaces.
    """
    pad = " " * 4
    prompt_lines = [
        "",
        # The explicit "\n" plus the join separator yields a blank line here.
        f"{pad}The business problem is: {business_problem}. \n",
        f"{pad}The dataset is expected to be in {dataset_format}.",
        f"{pad}For the dataset types such as tabular or time series implement python code for creating the dataset.",
        f"{pad}If the generated dataset contains several entities, i.e. products, users write the output for these entities into separate files.",
        f"{pad}The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in-libraries.",
        f"{pad}The output dataset is stored as a {file_format} file and contains {num_samples} samples.\n",
        pad,
    ]
    return "\n".join(prompt_lines)


def get_user_prompt_text(business_problem, dataset_format, file_format):
    """Build the user prompt asking for a text dataset plus code to save it.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Dataset modality name inserted into the prompt.
        file_format: Output file format name inserted into the prompt.

    Returns:
        The prompt text, with every line indented by four spaces.
    """
    template = (
        "\n"
        "    The business problem is: {problem}. \n"
        "\n"
        "    The dataset is expected to be in {modality}.\n"
        "    For the text type return the generated dataset and the python code to write the output to the files.\n"
        "    If the generated dataset contains several entities, i.e. products, users write the output for these entities into separate files.\n"
        "    The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in-libraries.\n"
        "    The output dataset is stored as a {target}\n"
        "    "
    )
    return template.format(
        problem=business_problem,
        modality=dataset_format,
        target=file_format,
    )

def select_user_prompt(business_problem, dataset_format, file_format, num_samples):
    """Pick the prompt builder that matches the requested dataset modality.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Modality name; "Text", "Tabular" and "Time-series"
            are recognised.
        file_format: Output file format name forwarded to the builder.
        num_samples: Sample count; only used for tabular / time-series prompts.

    Returns:
        The built prompt, or an empty string for an unrecognised modality.
    """
    if dataset_format == "Text":
        return get_user_prompt_text(business_problem, dataset_format, file_format)

    if dataset_format in ("Tabular", "Time-series"):
        return get_user_prompt_tabular(
            business_problem, dataset_format, file_format, num_samples
        )

    # Anything else falls through to an empty prompt.
    return ""

In [5]:
def stream_gpt(business_problem, dataset_format, file_format, num_samples):
    """Stream a completion from the OpenAI model for the given request.

    Yields the text accumulated so far after every streamed chunk, so each
    yielded value is the full response up to that point.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Dataset modality name.
        file_format: Output file format name.
        num_samples: Requested number of samples.

    Returns:
        The complete response text (as the generator's return value).
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": select_user_prompt(
                business_problem, dataset_format, file_format, num_samples
            ),
        },
    ]
    completion_stream = openai.chat.completions.create(
        model=OPENAI_MODEL, messages=conversation, stream=True
    )

    accumulated = ""
    for part in completion_stream:
        piece = part.choices[0].delta.content
        # A chunk may carry no text; treat that as an empty delta.
        if piece:
            accumulated = accumulated + piece
        yield accumulated
    return accumulated


def stream_llama(business_problem, dataset_format, file_format, num_samples):
    """Stream a completion from the local Ollama model for the given request.

    Yields the text accumulated so far after every streamed chunk, so each
    yielded value is the full response up to that point.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Dataset modality name.
        file_format: Output file format name.
        num_samples: Requested number of samples.

    Returns:
        The complete response text (as the generator's return value).
    """
    prompt_text = select_user_prompt(
        business_problem, dataset_format, file_format, num_samples
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_text},
    ]

    accumulated = ""
    for part in ollama.chat(model=MODEL_LLAMA, messages=conversation, stream=True):
        accumulated = accumulated + part['message']['content']
        yield accumulated
    return accumulated

def generate_dataset(business_problem, dataset_format, file_format, num_samples, model):
    """Stream a generated dataset from the selected model.

    Args:
        business_problem: Free-text description of the business problem.
        dataset_format: Dataset modality name.
        file_format: Output file format name.
        num_samples: Requested number of samples.
        model: "GPT" or "Llama".

    Yields:
        The response text accumulated so far, as produced by the chosen
        streaming function.

    Raises:
        ValueError: If ``model`` is neither "GPT" nor "Llama". Because this is
            a generator, the error surfaces on the first iteration.
    """
    if model not in ("GPT", "Llama"):
        raise ValueError("Unknown model")

    streamer = stream_gpt if model == "GPT" else stream_llama
    result = streamer(business_problem, dataset_format, file_format, num_samples)

    # Relay every partial response from the underlying stream.
    yield from result
    return result

In [6]:
def extract_code(text):
    """Extract Python code from the first markdown-style fenced code block.

    The fence tag is matched case-insensitively and may be ``python``,
    ``python3`` or ``py``.

    Args:
        text: Text that may contain a fenced Python block. ``None`` and the
            empty string are accepted.

    Returns:
        The stripped contents of the first matching block, or an empty string
        when there is no input or no matching block.
    """
    if not text:
        return ""
    # \b keeps the tag from swallowing the start of a longer word, and makes
    # sure a trailing "3" in "python3" is part of the tag, not of the code.
    match = re.search(
        r"```(?:python3?|py)\b(.*?)```", text, re.DOTALL | re.IGNORECASE
    )
    return match.group(1).strip() if match else ""

def get_conda_python():
    """Get the Python interpreter path in the active Conda environment.

    The location is derived from the ``CONDA_PREFIX`` environment variable.
    On Windows the interpreter is ``python.exe`` directly under the
    environment root; elsewhere it is ``bin/python``.

    Returns:
        The expected interpreter path (its existence is not checked here).

    Raises:
        EnvironmentError: If ``CONDA_PREFIX`` is unset or empty.
    """
    conda_prefix = os.environ.get("CONDA_PREFIX")
    if not conda_prefix:
        raise EnvironmentError("No active Conda environment found.")
    if os.name == "nt":
        # Windows Conda environments have no bin/ directory for python.
        return os.path.join(conda_prefix, "python.exe")
    return os.path.join(conda_prefix, "bin", "python")

def execute_code_in_conda_env(text):
    """Execute extracted Python code and report what it printed.

    The first Python code block in ``text`` is run with ``python -c`` in a
    subprocess. The interpreter is the active Conda environment's Python when
    it exists on disk, otherwise the interpreter running this process.

    Args:
        text: Text (typically a model response) containing a fenced Python
            code block.

    Returns:
        A human-readable report: the captured stdout/stderr on success, an
        error description including stderr on failure, or a notice when no
        code block was found. The same report is also printed.

    Raises:
        EnvironmentError: If no usable Python interpreter can be located.
    """
    candidates = []
    try:
        candidates.append(get_conda_python())
    except EnvironmentError:
        # No active Conda environment; fall back to the current interpreter.
        pass
    if sys.executable:
        candidates.append(sys.executable)

    python_interpreter = next(
        (path for path in candidates if os.path.exists(path)), None
    )
    if python_interpreter is None:
        raise EnvironmentError("Conda Python interpreter not found.")

    code_str = extract_code(text)
    if not code_str:
        message = "No valid Python code found in the input text."
        print(message)
        return message

    # NOTE(security): this runs model-generated code with the user's full
    # privileges and no sandbox; review the code before executing it.
    command = [python_interpreter, "-c", code_str]

    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # stderr carries the traceback of the failed script.
        message = f"Error while executing code: {e}\n{e.stderr or ''}"
    else:
        message = f"Output: {result.stdout}\nErrors: {result.stderr}"

    print(message)
    return message

# Smoke test: run a one-line script wrapped in a markdown code fence.
code_string = "```python\nprint('Hello from Conda virtual environment!')\n```"

execute_code_in_conda_env(code_string)

Output: Hello from Conda virtual environment!

Errors: 


In [7]:
# Gradio front-end: collect the request, stream the generated dataset/code,
# and optionally execute the generated code.
with gr.Blocks() as ui:
    gr.Markdown("## Create a dataset for a business problem")
    with gr.Column():
        business_problem = gr.Textbox(label="Business problem", lines=2)
        # Explicit initial values so that generate_dataset never receives None
        # for the modality or output format, whatever the Gradio default is.
        dataset_type = gr.Dropdown(
            ["Tabular", "Time-series", "Text"], label="Dataset modality", value="Tabular"
        )
        dataset_format = gr.Dropdown(
            ["JSON", "csv", "parquet", "Markdown"], label="Output format", value="JSON"
        )
        num_samples = gr.Number(label="Number of samples (for tabular and time-series data)", value=10, precision=0)
        model = gr.Dropdown(["GPT", "Llama"], label="Select model", value="GPT")
    with gr.Row():
        dataset_run = gr.Button("Create a dataset")
        code_run = gr.Button("Execute code for a dataset")
    with gr.Row():
        dataset_out = gr.Textbox(label="Generated Dataset")
        code_out = gr.Textbox(label="Executed code")
    # Inputs are passed positionally: the modality dropdown feeds
    # generate_dataset's `dataset_format` parameter and the output-format
    # dropdown feeds its `file_format` parameter.
    dataset_run.click(
        generate_dataset,
        inputs=[business_problem, dataset_type, dataset_format, num_samples, model],
        outputs=[dataset_out]
    )
    # The generated text is handed to the executor; its return value is shown
    # in the "Executed code" box.
    code_run.click(execute_code_in_conda_env, inputs=[dataset_out], outputs=[code_out])

In [8]:
# Start the Gradio app defined above; inbrowser=True asks Gradio to open it
# in a browser tab.
ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


