In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from openai import OpenAI
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from dotenv import load_dotenv
load_dotenv()

# Load the health dataset, preview it, and train the risk classifier that the
# prediction helpers further down rely on.
dataframe = pd.read_csv('health_data_v1.csv')
print(dataframe.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
dataframe.info()

# Features for the model
features = ['heart_rate', 'activity_level', 'age', 'cholesterol', 'diabetes']

# Extract the features from the data frame
X = dataframe[features]
print(X.head())

# Identify the label
label = 'risk_level'

# Extract the label from the data frame
y = dataframe[label]
print(y.head())

# Split the data into training and testing sets
# Set the split ratio (fraction of rows held out for testing)
test_size = 0.2

# random_state for reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=22)

# Sanity-check the sizes of the resulting partitions
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Create the model (random_state fixed so repeated runs build the same forest)
model = RandomForestClassifier(n_estimators=100, random_state=22)

# Train the model on the training partition only
model.fit(X_train, y_train)

# Function to predict health risks

def predict_health_risk(heart_rate, activity_level, age, cholesterol, diabetes):
    """Classify a single patient as 'high' or 'low' risk.

    The five measurements are packed into a one-row DataFrame and passed to
    the module-level ``model``.

    Returns:
        'high' when the predicted label equals 1, otherwise 'low'.
    """
    column_names = ('heart_rate', 'activity_level', 'age', 'cholesterol', 'diabetes')
    measurements = (heart_rate, activity_level, age, cholesterol, diabetes)

    # Each column holds a one-element list, giving a frame with exactly one row.
    sample = pd.DataFrame(
        {name: [value] for name, value in zip(column_names, measurements)}
    )

    predicted_label = model.predict(sample)[0]
    if predicted_label == 1:
        return 'high'
    return 'low'

# Initialize OpenAI client to make api calls.
# No credentials are passed explicitly here; presumably the client picks up its
# API key from the environment populated by load_dotenv() above — TODO confirm.
client = OpenAI()

# Function to generate health advice using OpenAI (gpt-4o)
def generate_health_advice(heart_rate, activity_level, age, cholesterol, diabetes):
    """Ask the chat model for health advice tailored to one patient.

    The patient's risk level is first obtained from ``predict_health_risk``;
    the measurements and that risk level are then sent as a single user
    message through the module-level ``client`` using the "gpt-4o" model.

    Returns:
        The message content of the first choice in the completion response.
    """
    risk = predict_health_risk(heart_rate, activity_level, age, cholesterol, diabetes)

    prompt = f"Provide health advice for a person with heart rate {heart_rate}, activity level {activity_level}, age {age}, cholesterol {cholesterol}, diabetes {diabetes} and risk level {risk}."
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Model evaluation
# Score the trained classifier on the held-out test partition.
y_pred = model.predict(X_test)

# Overall accuracy on the test partition
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Text report of classification metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# True labels vs. predicted labels
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# Example usage
# Measurements for one sample patient
test_heart_rate, test_activity_level, test_age = 105, 2, 40
test_cholesterol, test_diabetes = 260, 1

# Ask the classifier for this patient's risk level
predicted_risk = predict_health_risk(
    heart_rate=test_heart_rate,
    activity_level=test_activity_level,
    age=test_age,
    cholesterol=test_cholesterol,
    diabetes=test_diabetes,
)

print("Predicted risk level:", predicted_risk)

# Request AI-generated advice for the same patient
advice = generate_health_advice(
    heart_rate=test_heart_rate,
    activity_level=test_activity_level,
    age=test_age,
    cholesterol=test_cholesterol,
    diabetes=test_diabetes,
)

print("\nHealth advice:")
print(advice)


   heart_rate  activity_level  age  cholesterol  diabetes  risk_level
0          57               2   30          253         0           1
1          46               1   40          181         0           1
2          50               1   89          247         0           1
3          78               4   46          157         0           0
4          58               7   28          239         0           0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   heart_rate      500 non-null    int64
 1   activity_level  500 non-null    int64
 2   age             500 non-null    int64
 3   cholesterol     500 non-null    int64
 4   diabetes        500 non-null    int64
 5   risk_level      500 non-null    int64
dtypes: int64(6)
memory usage: 23.6 KB
None
   heart_rate  activity_level  age  cholesterol  diabetes
0          57               2 