In [1]:
from model_suggestor import ModelSuggestor
from typing import Dict, List, Tuple
from dotenv import load_dotenv
import guidance
import os
import re

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# OpenAI credentials, read from the environment; either may be None if unset.
api_key = os.getenv("api_key")
organization = os.getenv("organization")

# One guidance OpenAI client per model, all sharing the same credentials.
# Note: this only constructs the clients; each call site picks one via `llm=`.
_openai_credentials = {"api_key": api_key, "organization": organization}
gpt4 = guidance.llms.OpenAI(model="gpt-4", **_openai_credentials)
gpt3 = guidance.llms.OpenAI(model="gpt-3.5-turbo-16k", **_openai_credentials)
davinci = guidance.llms.OpenAI(model="text-davinci-003", **_openai_credentials)
davinci2 = guidance.llms.OpenAI(model="davinci", **_openai_credentials)

In [2]:
# Shared ModelSuggestor instance; later cells call its suggest_* methods with an LLM client.
modeler = ModelSuggestor()

In [4]:
import pandas as pd
import numpy as np

# Names of the observed variables used throughout the notebook.
variable_names = [
    "sleep_quality",
    "heart_rate_variability",
    "age",
    "income_level",
    "residential_area",
    "medication",
    "physical_activity_level",
    "mental_stress_level",
    "occupational_stress_level",
    "ambient_temperature",
    "sleep_hygiene_practices",
    "chronic_health_conditions",
    "menstrual_cycle_phase",
    "health_status",
    "lifestyle_factors",
    "caffeine_consumption",
    "nutritional_status",
    "gender",
    "education_level",
]

# Empty frame with one float column per variable. No rows are needed here:
# the frame only serves to derive a dtype for every column name.
df = pd.DataFrame(columns=variable_names).astype(float)

# Map each column name to its dtype. The values are NumPy dtype objects
# (they print as "float64"), not strings, so the annotation says so.
variables_and_datatypes: Dict[str, np.dtype] = {
    column: series.dtype for column, series in df.items()
}

# One "name: dtype" line per variable, each followed by a blank line.
for key, value in variables_and_datatypes.items():
    print(f"{key}: {value}\n")

sleep_quality: float64

heart_rate_variability: float64

age: float64

health_status: float64

lifestyle_factors: float64

caffeine_consumption: float64

nutritional_status: float64

gender: float64

education_level: float64

income_level: float64

residential_area: float64

medication: float64

physical_activity_level: float64

mental_stress_level: float64

occupational_stress_level: float64

ambient_temperature: float64

sleep_hygiene_practices: float64

chronic_health_conditions: float64

menstrual_cycle_phase: float64



In [5]:
import dodiscover as dd

# Create a dodiscover context builder and register the observed variables
# together with their dtypes. The last expression is shown as the cell output.
context_builder = dd.make_context()
context_builder.observed_variables(variables_and_datatypes)

<dodiscover.context_builder.ContextBuilder at 0x1e8a2ff3400>

In [6]:
# Ask the gpt4 client for a description of every variable name.
# The result is iterated further down as a name -> description mapping.
variables_and_descriptions = modeler.suggest_descriptions(variable_names=variable_names, llm=gpt4)

In [7]:
# Attach the generated descriptions to the context builder.
# The displayed result is a ContextBuilder, so the call appears to return the builder itself.
context_builder.observed_variables_descriptions(variables_and_descriptions)

<dodiscover.context_builder.ContextBuilder at 0x1e8a2ff3400>

In [8]:
# Show each variable with its generated description, one per line.
for name, text in variables_and_descriptions.items():
    line = f"{name}: {text}"
    print(line)

sleep_quality: The subjective rating of the individual's quality of sleep, typically measured on a scale (for example 1-5, with 1 being poor quality and 5 being excellent).
heart_rate_variability: The variability in the time interval between consecutive heartbeats in milliseconds. This can be used as a measure of the body's autonomic nervous system function and stress level.
age: The age of the individual or subject in the dataset, typically measured in years.
health_status: The overall health condition of the individual, often categorized as 'excellent', 'good', 'fair', or 'poor'.
lifestyle_factors: The lifestyle_factors column represents various lifestyle behaviors or conditions of the individual that may impact their health. These can include factors like smoking, alcohol consumption, physical activity level, diet, sleep patterns, etc.
caffeine_consumption: The amount of caffeine, typically measured in milligrams, consumed by the individual on a daily basis. This could include caffe

In [9]:
# Treatment and outcome variable names for the confounder and relationship queries.
treatment, outcome = "sleep_quality", "heart_rate_variability"

In [10]:
# Ask the gpt4 client for unobserved confounders of treatment -> outcome.
# The call yields a pair; both members are iterated below as mappings keyed
# by confounder name (descriptions first, explanations second).
latent_variables_descriptions, latent_variables_explanation = modeler.suggest_latent_confounders(
    variables_and_descriptions=variables_and_descriptions,
    treatment=treatment,
    outcome=outcome,
    llm=gpt4,
)

In [11]:
# Show each suggested latent confounder with its description, one per line.
for latent_name, latent_text in latent_variables_descriptions.items():
    print("{}: {}".format(latent_name, latent_text))

genetic_factors: Individual genetic factors such as predisposition to certain health conditions, metabolism rate, or inherent sleep patterns can influence both sleep quality and heart rate variability. These factors are not directly observed in the dataset but can significantly influence both variables.
stressors_not_captured_in_dataset: There may be sources of stress not captured in the dataset, such as personal relationship stress, financial stress, or other life events, which can affect both sleep quality and heart rate variability.
circadian_rhythm_disorders: Individuals may have undiagnosed circadian rhythm disorders, which are disruptions in a person's "internal body clock" that regulate sleep-wake cycles and can influence sleep quality and heart rate variability.
undiagnosed_medical_conditions: Individuals may have undiagnosed medical conditions, such as sleep apnea or heart disease, that can influence both sleep quality and heart rate variability.


In [12]:
# Show why each latent confounder was suggested, one per line.
for latent_name, reasoning in latent_variables_explanation.items():
    print("{}: {}".format(latent_name, reasoning))

genetic_factors: Genetic factors could affect sleep quality by predisposing an individual to certain sleep disorders or influencing natural sleep patterns. These same genetic factors could also affect heart rate variability by influencing metabolic rate, inherent stress levels, or susceptibility to certain heart conditions. If not accounted for, these unobserved genetic factors could confound the relationship between sleep quality and heart rate variability.
stressors_not_captured_in_dataset: These unobserved stressors could reduce sleep quality by causing worry or anxiety that prevents restful sleep. Simultaneously, these stressors could increase heart rate variability by causing the body to be in a state of heightened stress or alertness. Without accounting for these latent stressors, the relationship between sleep quality and heart rate variability may be confounded.
circadian_rhythm_disorders: Circadian rhythm disorders could lead to poor sleep quality due to disturbed sleep-wake c

In [13]:
# Ask the gpt4 client which of the observed variables confound treatment -> outcome.
# Uses the `treatment` / `outcome` variables defined earlier instead of repeating
# the string literals, so changing the pair in one place updates every query.
confounders_and_descriptions = modeler.suggest_confounders(
    variables_and_descriptions=variables_and_descriptions,
    llm=gpt4,
    treatment=treatment,
    outcome=outcome,
)

In [14]:
# NOTE(review): this repeats the observed_variables(...) call made right after
# make_context(), with the same argument. Presumably redundant, or it was meant
# to register something else (e.g. the confounders) — confirm.
context_builder.observed_variables(variables_and_datatypes)

<dodiscover.context_builder.ContextBuilder at 0x1e8a2ff3400>

In [15]:
# Show each suggested observed confounder with its rationale, one per line.
for confounder_name, rationale in confounders_and_descriptions.items():
    entry = f"{confounder_name}: {rationale}"
    print(entry)

age: Age could be a confounder as it can influence both sleep quality and heart rate variability. As people age, sleep patterns can change and sleep disorders become more common. Additionally, heart rate variability also tends to decrease with age.
health_status: Health status can influence both sleep quality and heart rate variability. Poor health can lead to poorer sleep quality and also affect heart rate variability.
lifestyle_factors: Lifestyle factors such as physical activity, diet, and smoking can influence both sleep quality and heart rate variability. For example, a sedentary lifestyle or poor diet can contribute to poor sleep and lower heart rate variability.
caffeine_consumption: Caffeine consumption can influence both sleep quality and heart rate variability. High caffeine intake can lead to poor sleep and also increase heart rate variability.


In [16]:
# Ask the gpt4 client for the relationships between variables that matter for
# the treatment -> outcome question. Uses the `treatment` / `outcome` variables
# defined earlier instead of repeating the string literals.
#
# NOTE(review): the recorded output of this cell is
#   "TypeError: Tuple[t0, t1, ...]: each t must be a type. Got ['sleep_quality']."
# which presumably originates inside suggest_variable_relationships (a list is
# used where typing.Tuple expects types) — confirm and fix in model_suggestor.
# NOTE(review): `latent_confounders_descriptions` receives the observed
# confounders from suggest_confounders, not `latent_variables_descriptions`
# from suggest_latent_confounders — confirm this is intended.
relevant_relationships_and_descriptions = modeler.suggest_variable_relationships(
    variables_and_descriptions=variables_and_descriptions,
    latent_confounders_descriptions=confounders_and_descriptions,
    treatment=treatment,
    outcome=outcome,
    llm=gpt4,
)

TypeError: Tuple[t0, t1, ...]: each t must be a type. Got ['sleep_quality'].

In [17]:
# Show each suggested relationship with its description, one per line.
for edge, edge_text in relevant_relationships_and_descriptions.items():
    print("{}: {}".format(edge, edge_text))

NameError: name 'relevant_relationships_and_descriptions' is not defined

In [17]:
import matplotlib.pyplot as plt
import networkx as nx

# Directed graph with one node per observed variable.
g = nx.DiGraph()
g.add_nodes_from(variable_names)
# Every key of the relationships mapping becomes one edge; presumably each key
# is a (source, target) pair of variable names — TODO confirm.
g.add_edges_from(relevant_relationships_and_descriptions.keys())

# Show the graph.
nx.draw(g, with_labels=True)
plt.show()


NameError: name 'relevant_relationships_and_descriptions' is not defined