In [10]:
import itertools
import os
import re
from typing import Dict, List, Tuple

import guidance
from dotenv import load_dotenv

from model_suggestor import ModelSuggestor

# Read the OpenAI credentials from the environment (after loading the dotenv file).
load_dotenv()
api_key, organization = os.getenv("api_key"), os.getenv("organization")

# One guidance OpenAI handle per model; later cells pick one through their `llm=` argument.
_openai_kwargs = {"api_key": api_key, "organization": organization}
gpt4 = guidance.llms.OpenAI(model="gpt-4", **_openai_kwargs)
gpt3 = guidance.llms.OpenAI(model="gpt-3.5-turbo-16k", **_openai_kwargs)
davinci = guidance.llms.OpenAI(model="text-davinci-003", **_openai_kwargs)
davinci2 = guidance.llms.OpenAI(model="davinci", **_openai_kwargs)

In [11]:
# Shared ModelSuggestor instance; the cells below call its suggest_* methods.
modeler = ModelSuggestor()

In [12]:
import pandas as pd
import numpy as np

# Names of the variables considered in this notebook.
variable_names = [
    "sleep_quality",
    "heart_rate_variability",
    "age",
    "income_level",
    "residential_area",
    "medication",
    "physical_activity_level",
    "mental_stress_level",
    "occupational_stress_level",
    "ambient_temperature",
    "sleep_hygiene_practices",
    "chronic_health_conditions",
    "menstrual_cycle_phase",
    "health_status",
    "lifestyle_factors",
    "caffeine_consumption",
    "nutritional_status",
    "gender",
    "education_level",
]

# Only the schema is needed here, so build an empty frame (no rows) with one
# float column per variable.
df = pd.DataFrame(columns=variable_names).astype(float)

# Map each column name to the *name* of its dtype (e.g. "float64"), so the
# values really are strings as the annotation states.
variables_and_datatypes: Dict[str, str] = {
    column: str(dtype) for column, dtype in df.dtypes.items()
}

for key, value in variables_and_datatypes.items():
    print(f"{key}: {value}\n")

sleep_quality: float64

heart_rate_variability: float64

age: float64

income_level: float64

residential_area: float64

medication: float64

physical_activity_level: float64

mental_stress_level: float64

occupational_stress_level: float64

ambient_temperature: float64

sleep_hygiene_practices: float64

chronic_health_conditions: float64

menstrual_cycle_phase: float64

health_status: float64

lifestyle_factors: float64

caffeine_consumption: float64

nutritional_status: float64

gender: float64

education_level: float64



In [13]:
import dodiscover as dd

In [14]:
# Ask the modeler (backed by gpt4) to suggest a description for each variable name.
variables_and_descriptions = modeler.suggest_descriptions(
    llm=gpt4,
    variable_names=variable_names,
)

In [15]:
# One "name: description" line per variable.
for variable in variables_and_descriptions:
    description = variables_and_descriptions[variable]
    print(f"{variable}: {description}")

sleep_quality: The subjective rating of the individual's quality of sleep, typically measured on a scale (for example 1-5, with 1 being poor quality and 5 being excellent).
heart_rate_variability: The variability in the time interval between consecutive heartbeats in milliseconds. This can be used as a measure of the body's autonomic nervous system function and stress level.
age: The age of the individual or subject in the dataset, typically measured in years.
income_level: The income level of the individual, typically categorized into ranges like low, middle, and high, based on their total annual income.
residential_area: The residential area where the individual or subject of the study lives, typically categorized by urban, suburban, or rural settings.
medication: The type or name of the medication that the patient is currently taking or has been prescribed.
physical_activity_level: The level of physical activity performed by the individual, often categorized into levels such as low,

In [16]:
# Treatment and outcome variables used by the confounder suggestions below.
treatment, outcome = "sleep_quality", "heart_rate_variability"

In [17]:
# Ask the modeler for latent confounders of the treatment/outcome pair; the
# result is iterated as a name -> explanation mapping in the next cell.
latent_variables_explanation = modeler.suggest_latent_confounders(
    llm=gpt4,
    treatment=treatment,
    outcome=outcome,
    variables_and_descriptions=variables_and_descriptions,
)

In [22]:
# One "confounder: explanation" line per suggested latent confounder.
for confounder in latent_variables_explanation:
    explanation = latent_variables_explanation[confounder]
    print(f"{confounder}: {explanation}")

In [20]:
# Ask the modeler for confounders of the treatment/outcome pair. The pair is
# taken from the `treatment` / `outcome` variables (rather than repeating the
# literals) so this call stays in sync with suggest_latent_confounders above.
confounders_and_descriptions = modeler.suggest_confounders(
    variables_and_descriptions=variables_and_descriptions,
    llm=gpt4,
    treatment=treatment,
    outcome=outcome,
)

In [21]:
# One "confounder: description" line per suggested confounder.
for confounder in confounders_and_descriptions:
    description = confounders_and_descriptions[confounder]
    print(f"{confounder}: {description}")

In [None]:
# Ask the modeler for the relationships among the variables.
# NOTE(review): the output recorded for this cell is `KeyError: 'relationships'`, so
# `relevant_relationships_and_descriptions` is never assigned and cells that read it fail.
# NOTE(review): `latent_confounders_descriptions` receives `confounders_and_descriptions`
# (from suggest_confounders), not `latent_variables_explanation` (from
# suggest_latent_confounders) — confirm which one was intended.
relevant_relationships_and_descriptions = modeler.suggest_variable_relationships(variables_and_descriptions=variables_and_descriptions, latent_confounders_descriptions=confounders_and_descriptions, llm=gpt4)

KeyError: 'relationships'

In [23]:
# One "relationship: description" line per suggested relationship.
for relationship in relevant_relationships_and_descriptions:
    description = relevant_relationships_and_descriptions[relationship]
    print(f"{relationship}: {description}")

NameError: name 'relevant_relationships_and_descriptions' is not defined

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Directed graph with one node per observed variable and one edge per suggested
# relationship; the dict keys are used directly as edges.
# NOTE(review): presumably each key is a (parent, child) tuple — confirm against ModelSuggestor.
g = nx.DiGraph()
g.add_nodes_from(variable_names)
g.add_edges_from(relevant_relationships_and_descriptions.keys())

# Show the graph on an explicit axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
nx.draw(g, with_labels=True, ax=ax)
plt.show()


In [None]:
# Single name -> text mapping covering the observed variables and the latent confounders.
# NOTE(review): merging a dict with itself is a no-op, so the second operand is taken to be
# `latent_variables_explanation` — confirm this is the intended mapping.
# NOTE: the name `vars` shadows the builtin; it is kept so any later reference still resolves.
vars = {**variables_and_descriptions, **latent_variables_explanation}

print(vars)

In [24]:
# Guidance chat program that, for one pair of variables, asks the model which is
# the parent and which is the child, together with an explanation.
# Template inputs: `variables_and_descriptions`, `variable_a`, `variable_b`.
# The reply is generated at temperature 0.7 and stored under 'relationships';
# per the system prompt it is either <parent>/<child>/<explanation> tags or <null>.
generate_relationships = guidance('''
{{#system~}}
You are a helpful assistant with expertise in causal inference. Given two variables along with their description, your task is to identify which variable is the parent and which is the child. and explain the reasoning behind that causal connection. During your analysis, consider various causal factors to guide your assessment.
------------------------------------------
Input:

    Dataset with descriptions
        name_of_first_variable: Description of first variable.
        ...
        name_of_nth_confounder: Description of nth variable.     

    Selected variables
        first_variable
        second_variable    

Output (if there is a causal relationship):
    <parent>variable_a</parent>
    <child>variable_b</child>
    <explanation>Explanation for why and how variable_a is the parent of variable_b.</explanation>

Output (if there is no causal relationship): 
    <null>
{{~/system}}

{{#user~}}
Dataset schema with descriptions
{{variables_and_descriptions}}

Selected variables
{{variable_a}}
{{variable_b}}
{{~/user}}

{{#assistant~}}
{{gen 'relationships' temperature=0.7}}
{{~/assistant}} 

''')


In [31]:
# Observed variables plus latent confounders, each with its description, so the
# prompt's "Dataset schema with descriptions" section really carries descriptions
# and not just a list of names.
relevant_variables_and_descriptions: Dict[str, str] = {
    **variables_and_descriptions,
    **latent_variables_explanation,
}

# Rendered once, in the "name: description" layout the system prompt announces.
schema_text = "\n".join(
    f"{name}: {description}"
    for name, description in relevant_variables_and_descriptions.items()
)

# One pattern per answer tag; DOTALL lets a tag's content span several lines.
RELATIONSHIP_TAGS = ("parent", "child", "explanation")
tag_patterns = {
    tag: re.compile(rf"<{tag}>(.*?)</{tag}>", re.DOTALL) for tag in RELATIONSHIP_TAGS
}

# (parent, child) -> explanation for every pair the model judged causal.
relationships_and_descriptions: Dict[Tuple[str, str], str] = {}

# The model chooses the direction itself, so each unordered pair is asked once
# instead of once per ordering.
for variable_a, variable_b in itertools.combinations(relevant_variables_and_descriptions, 2):

    # Call the program with its template variables directly; it is deliberately
    # not rebound in this cell, so the cell can be re-run.
    output = generate_relationships(
        variables_and_descriptions=schema_text,
        variable_a=variable_a,
        variable_b=variable_b,
        llm=gpt4,
    )
    answer = output['relationships']

    matches = {tag: pattern.search(answer) for tag, pattern in tag_patterns.items()}

    # An answer without all three tags (e.g. "<null>") is skipped.
    if all(matches.values()):
        parent, child, explanation = (
            matches[tag].group(1).strip() for tag in RELATIONSHIP_TAGS
        )
        relationships_and_descriptions[(parent, child)] = explanation

relationships_and_descriptions

