In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Based on the Figma design, here are the key features we need:

def generate_health_dataset(num_samples=10000, seed=42):
    """Generate a synthetic per-user health dataset with a rule-based target.

    Every feature is drawn independently from a simple random distribution;
    the target column ``recommended_tip_category`` is then derived from the
    features by a fixed priority list of rules (first matching rule wins):

    1. fatigue reported and fewer than 6 hours of sleep -> ``'sleep'``
    2. fewer than 4 glasses of water                    -> ``'hydration'``
    3. fewer than 30 exercise minutes                   -> ``'exercise'``
    4. stress or anxiety reported                       -> ``'mindfulness'``
    5. BMI above 25 and goal is ``'Weight Loss'``       -> ``'nutrition'``
    6. otherwise                                        -> the user's ``preferred_category``

    Parameters
    ----------
    num_samples : int, default 10000
        Number of synthetic users (rows) to generate.
    seed : int or None, default 42
        Value passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* legacy RNG, so later ``np.random`` calls in the notebook are
        affected as well. ``None`` reseeds from OS entropy.

    Returns
    -------
    pandas.DataFrame
        One row per user with 29 columns: ``user_id``, demographics,
        lifestyle habits, engagement metrics, ten binary symptom flags and
        ``recommended_tip_category``.
    """
    np.random.seed(seed)

    # NOTE: the order of the np.random calls below determines the generated
    # values for a given seed, so it must not be rearranged.

    # User Demographics (from user profile)
    ages = np.random.randint(18, 70, num_samples)
    genders = np.random.choice(['Male', 'Female', 'Other'], num_samples, p=[0.48, 0.48, 0.04])
    weights = np.random.randint(45, 120, num_samples)
    heights = np.random.randint(150, 200, num_samples)
    # Height is divided by 100 before squaring (presumably cm -> m, weight in kg).
    bmi = weights / ((heights / 100) ** 2)

    # Health Goals (from user profile/onboarding)
    health_goals = np.random.choice([
        'Weight Loss', 'Muscle Gain', 'Stress Reduction',
        'Better Sleep', 'Improved Fitness', 'General Wellness'
    ], num_samples)

    # Symptoms (from daily check-in/symptom tracker): independent 0/1 flags,
    # each present with probability 0.3.
    symptoms_list = [
        'headache', 'fatigue', 'stress', 'muscle_pain', 'digestive_issues',
        'poor_sleep', 'low_energy', 'anxiety', 'back_pain', 'neck_pain'
    ]
    symptoms_data = {
        symptom: np.random.choice([0, 1], num_samples, p=[0.7, 0.3])
        for symptom in symptoms_list
    }

    # Lifestyle Habits (from habit tracker)
    sleep_hours = np.clip(np.random.normal(7, 1.5, num_samples), 4, 10)
    water_intake = np.random.randint(1, 10, num_samples)  # glasses per day
    exercise_minutes = np.random.randint(0, 120, num_samples)
    steps = np.random.randint(2000, 15000, num_samples)
    screen_time = np.random.randint(2, 12, num_samples)  # hours

    # Previous Tip Engagement (from analytics)
    engagement_metrics = {
        'clicks_last_7_days': np.random.randint(0, 15, num_samples),
        'completion_rate': np.random.uniform(0.1, 1.0, num_samples),
        'avg_rating': np.random.uniform(1, 5, num_samples),
        'preferred_categories': np.random.choice([
            'nutrition', 'exercise', 'hydration', 'sleep', 'mindfulness', 'general'
        ], num_samples)
    }

    # Time-based features
    days_since_registration = np.random.randint(1, 365, num_samples)
    last_engagement_days = np.random.randint(0, 30, num_samples)

    # Create the main dataset
    data = {
        'user_id': range(1, num_samples + 1),
        'age': ages,
        'gender': genders,
        'weight': weights,
        'height': heights,
        'bmi': bmi,
        'health_goal': health_goals,
        'sleep_hours': sleep_hours,
        'water_intake': water_intake,
        'exercise_minutes': exercise_minutes,
        'daily_steps': steps,
        'screen_time_hours': screen_time,
        'clicks_last_7_days': engagement_metrics['clicks_last_7_days'],
        'completion_rate': engagement_metrics['completion_rate'],
        'avg_tip_rating': engagement_metrics['avg_rating'],
        'preferred_category': engagement_metrics['preferred_categories'],
        'days_since_registration': days_since_registration,
        'days_since_last_engagement': last_engagement_days,
    }

    # Add symptoms
    data.update(symptoms_data)

    # Target variable: vectorized equivalent of an if/elif chain. np.select
    # uses the first condition that is True for each row, so the list order
    # below is the rule priority; rows matching no rule fall back to the
    # user's preferred category.
    rule_conditions = [
        (symptoms_data['fatigue'] == 1) & (sleep_hours < 6),
        water_intake < 4,
        exercise_minutes < 30,
        (symptoms_data['stress'] == 1) | (symptoms_data['anxiety'] == 1),
        (bmi > 25) & (health_goals == 'Weight Loss'),
    ]
    rule_categories = ['sleep', 'hydration', 'exercise', 'mindfulness', 'nutrition']
    data['recommended_tip_category'] = np.select(
        rule_conditions,
        rule_categories,
        default=engagement_metrics['preferred_categories'],
    )

    return pd.DataFrame(data)

# Build the synthetic user dataset used by the rest of the notebook.
health_dataset = generate_health_dataset(10000)

# Print a quick overview: shape, a preview, the column dtypes and how the
# target classes are distributed.
print("Dataset Overview:")
print(f"Shape: {health_dataset.shape}")
overview_sections = (
    ("\nFirst few rows:", health_dataset.head()),
    ("\nColumn types:", health_dataset.dtypes),
    ("\nTarget distribution:", health_dataset['recommended_tip_category'].value_counts()),
)
for section_heading, section_content in overview_sections:
    print(section_heading)
    print(section_content)

Dataset Overview:
Shape: (10000, 29)

First few rows:
   user_id  age  gender  weight  height        bmi       health_goal  \
0        1   56  Female      48     194  12.753746       Muscle Gain   
1        2   69    Male      46     181  14.041085       Muscle Gain   
2        3   46    Male      45     155  18.730489  Stress Reduction   
3        4   32  Female     119     196  30.976676  General Wellness   
4        5   60    Male     109     197  28.086269       Muscle Gain   

   sleep_hours  water_intake  exercise_minutes  ...  fatigue  stress  \
0     7.116644             6                49  ...        0       1   
1     7.614317             3                89  ...        0       0   
2     5.915332             9                64  ...        0       0   
3     4.706525             4                84  ...        0       0   
4     4.914463             8                90  ...        1       1   

   muscle_pain  digestive_issues  poor_sleep low_energy  anxiety  back_pain  \
0

In [2]:
# Report the observed minimum and maximum of every numeric column.
print("\nFeature Ranges (Min–Max):")
numeric_frame = health_dataset.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns

for col, column_values in numeric_frame.items():
    lowest, highest = column_values.min(), column_values.max()
    print(f"{col:25} →  Min: {lowest:.2f}   |   Max: {highest:.2f}")



Feature Ranges (Min–Max):
user_id                   →  Min: 1.00   |   Max: 10000.00
age                       →  Min: 18.00   |   Max: 69.00
weight                    →  Min: 45.00   |   Max: 119.00
height                    →  Min: 150.00   |   Max: 199.00
bmi                       →  Min: 11.36   |   Max: 52.89
sleep_hours               →  Min: 4.00   |   Max: 10.00
water_intake              →  Min: 1.00   |   Max: 9.00
exercise_minutes          →  Min: 0.00   |   Max: 119.00
daily_steps               →  Min: 2001.00   |   Max: 14998.00
screen_time_hours         →  Min: 2.00   |   Max: 11.00
clicks_last_7_days        →  Min: 0.00   |   Max: 14.00
completion_rate           →  Min: 0.10   |   Max: 1.00
avg_tip_rating            →  Min: 1.00   |   Max: 5.00
days_since_registration   →  Min: 1.00   |   Max: 364.00
days_since_last_engagement →  Min: 0.00   |   Max: 29.00
headache                  →  Min: 0.00   |   Max: 1.00
fatigue                   →  Min: 0.00   |   Max: 1.00
stress 

In [3]:
# Create a companion dataset with actual health tips
def create_health_tips_dataset():
    """Build the catalogue of health tips as a DataFrame.

    Each of the six categories contributes eight hand-written tips. Every tip
    gets a sequential ``tip_id`` (starting at 1, in catalogue order) plus three
    randomly drawn attributes taken from NumPy's global RNG:

    * ``difficulty_level`` - ``'Easy'`` (p=0.7) or ``'Medium'`` (p=0.3)
    * ``time_required``    - integer minutes from ``np.random.randint(5, 30)``
    * ``engagement_score`` - float from ``np.random.uniform(3.5, 5.0)``

    The function does not seed the RNG itself, so the random attributes depend
    on the global ``np.random`` state at call time.

    Returns
    -------
    pandas.DataFrame
        Columns: ``tip_id``, ``category``, ``tip_text``, ``difficulty_level``,
        ``time_required``, ``engagement_score``.
    """
    tips_by_category = {
        'nutrition': [
            "Include more leafy greens in your meals today",
            "Try to have your last meal 3 hours before bedtime",
            "Add a source of protein to every meal",
            "Reduce processed sugar intake today",
            "Include healthy fats like avocado or nuts",
            "Drink a glass of water before each meal",
            "Try a new vegetable this week",
            "Plan your meals for the next 3 days"
        ],
        'exercise': [
            "Take a 15-minute walk after lunch",
            "Try 10 minutes of stretching this morning",
            "Do 3 sets of bodyweight squats today",
            "Take the stairs whenever possible",
            "Try a 5-minute desk workout every hour",
            "Go for a brisk 20-minute walk",
            "Do 10 minutes of yoga before bed",
            "Park farther away to get more steps"
        ],
        'hydration': [
            "Drink a glass of water as soon as you wake up",
            "Keep a water bottle at your desk today",
            "Add lemon or cucumber to your water for flavor",
            "Set hourly reminders to drink water",
            "Drink one glass before each meal",
            "Track your water intake today",
            "Carry a reusable water bottle with you",
            "Drink herbal tea instead of coffee this afternoon"
        ],
        'sleep': [
            "Create a relaxing bedtime routine tonight",
            "Avoid screens 1 hour before bed",
            "Keep your bedroom cool and dark",
            "Try reading a book before sleep",
            "Establish a consistent sleep schedule",
            "Practice deep breathing for 5 minutes before bed",
            "Avoid caffeine after 2 PM",
            "Write down worries before bedtime to clear your mind"
        ],
        'mindfulness': [
            "Practice 5 minutes of deep breathing today",
            "Take a mindful walk during your break",
            "Try a 10-minute guided meditation",
            "Practice gratitude by listing 3 things you're thankful for",
            "Do one thing at a time with full attention",
            "Take 3 conscious breaths before responding to stress",
            "Notice tension in your body and consciously relax it",
            "Spend 10 minutes in nature without distractions"
        ],
        'general': [
            "Stand up and stretch every 30 minutes",
            "Spend time outdoors today",
            "Connect with a friend or loved one",
            "Learn something new today",
            "Practice good posture while working",
            "Take regular breaks from screens",
            "Express gratitude to someone today",
            "Do one thing that brings you joy"
        ]
    }

    # Flatten the mapping into (category, text) pairs in catalogue order.
    catalogue = (
        (category, text)
        for category, texts in tips_by_category.items()
        for text in texts
    )

    # Dict values are evaluated top to bottom, so each tip consumes the RNG
    # in the order: difficulty, time, engagement.
    records = [
        {
            'tip_id': tip_id,
            'category': category,
            'tip_text': text,
            'difficulty_level': np.random.choice(['Easy', 'Medium'], p=[0.7, 0.3]),
            'time_required': np.random.randint(5, 30),  # minutes
            'engagement_score': np.random.uniform(3.5, 5.0)
        }
        for tip_id, (category, text) in enumerate(catalogue, start=1)
    ]

    return pd.DataFrame(records)

# Build the tip catalogue and preview its first ten rows.
tips_dataset = create_health_tips_dataset()
tips_preview = tips_dataset.head(10)
print("Health Tips Dataset:")
print(tips_preview)

Health Tips Dataset:
   tip_id   category                                           tip_text  \
0       1  nutrition      Include more leafy greens in your meals today   
1       2  nutrition  Try to have your last meal 3 hours before bedtime   
2       3  nutrition              Add a source of protein to every meal   
3       4  nutrition                Reduce processed sugar intake today   
4       5  nutrition          Include healthy fats like avocado or nuts   
5       6  nutrition            Drink a glass of water before each meal   
6       7  nutrition                      Try a new vegetable this week   
7       8  nutrition                Plan your meals for the next 3 days   
8       9   exercise                  Take a 15-minute walk after lunch   
9      10   exercise          Try 10 minutes of stretching this morning   

  difficulty_level  time_required  engagement_score  
0             Easy             20          4.629582  
1             Easy             20          4.

In [13]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

def preprocess_dataset(df):
    """Encode, split and scale the health dataset for model training.

    Steps, in order:

    1. Work on a copy of ``df`` (the input frame is not modified).
    2. Integer-encode ``gender``, ``health_goal`` and ``preferred_category``
       with one ``LabelEncoder`` each.
    3. Integer-encode ``recommended_tip_category`` into a new
       ``tip_category_encoded`` column (the training target).
    4. Split the 27 feature columns and the target 80/20 with
       ``random_state=42``, stratified on the target.
    5. Fit a ``StandardScaler`` on the training features only and apply it to
       both splits.

    Returns
    -------
    dict
        ``X_train`` / ``X_test`` (scaled feature arrays), ``y_train`` /
        ``y_test`` (encoded targets), ``feature_names`` (feature column order),
        ``label_encoders`` (dict of fitted encoders keyed by column),
        ``target_encoder`` and ``scaler`` (fitted objects).
    """
    encoded = df.copy()

    # Categorical inputs: one fitted encoder per column, kept for reuse.
    label_encoders = {}
    for column in ('gender', 'health_goal', 'preferred_category'):
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        label_encoders[column] = encoder

    # Target labels.
    target_encoder = LabelEncoder()
    encoded['tip_category_encoded'] = target_encoder.fit_transform(
        encoded['recommended_tip_category']
    )

    # Feature order fed to the model (groups are concatenated in this order).
    profile_features = ['age', 'gender', 'weight', 'height', 'bmi', 'health_goal']
    lifestyle_features = [
        'sleep_hours', 'water_intake', 'exercise_minutes', 'daily_steps',
        'screen_time_hours'
    ]
    engagement_features = [
        'clicks_last_7_days', 'completion_rate', 'avg_tip_rating',
        'preferred_category', 'days_since_registration',
        'days_since_last_engagement'
    ]
    symptom_features = [
        'headache', 'fatigue', 'stress', 'muscle_pain', 'digestive_issues',
        'poor_sleep', 'low_energy', 'anxiety', 'back_pain', 'neck_pain'
    ]
    feature_columns = (
        profile_features + lifestyle_features + engagement_features + symptom_features
    )

    # Splitting
    features = encoded[feature_columns]
    target = encoded['tip_category_encoded']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Scaling: statistics come from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return dict(
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test,
        feature_names=feature_columns,
        label_encoders=label_encoders,
        target_encoder=target_encoder,
        scaler=scaler,
    )

In [12]:
# Run the preprocessing pipeline and report the size of each split.
processed_data = preprocess_dataset(health_dataset)

shape_report = [
    "Preprocessed dataset shapes:",
    f"X_train: {processed_data['X_train'].shape}",
    f"X_test: {processed_data['X_test'].shape}",
    f"y_train: {processed_data['y_train'].shape}",
    f"y_test: {processed_data['y_test'].shape}",
]
print("\n".join(shape_report))

Preprocessed dataset shapes:
X_train: (8000, 27)
X_test: (2000, 27)
y_train: (8000,)
y_test: (2000,)


In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix

def create_mlp_model(input_dim, num_classes):
    """Build and compile the MLP classifier.

    Architecture (in order): input of ``input_dim`` features ->
    Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) -> Dropout(0.3) ->
    Dense(32, relu) -> Dense(``num_classes``, softmax).

    The model is compiled with Adam (learning rate 0.001),
    ``sparse_categorical_crossentropy`` loss (i.e. integer class labels, not
    one-hot vectors) and the ``accuracy`` metric.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    num_classes : int
        Number of output classes (width of the softmax layer).

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input with an explicit Input object instead of passing
        # `input_shape=` to the first Dense layer; the latter is discouraged
        # for Sequential models in Keras 3 and triggers a warning there.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [16]:
X_train = processed_data['X_train']
X_test = processed_data['X_test']
y_train = processed_data['y_train']
y_test = processed_data['y_test']

# Make training reproducible: weight initialisation, dropout masks and batch
# shuffling are all random. set_random_seed seeds Python's `random`, NumPy's
# global RNG and TensorFlow in one call, so it must run before the model is
# built. (Side effect: later np.random calls in the notebook are reseeded too.)
tf.keras.utils.set_random_seed(42)

input_dim = X_train.shape[1]
# Size the output layer from the fitted target encoder, i.e. from the same
# object that produced the integer labels in y_train / y_test.
num_classes = len(processed_data['target_encoder'].classes_)

model = create_mlp_model(input_dim, num_classes)

model.summary()
# The last 20% of the training data is held out by Keras as a validation set.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.4027 - loss: 1.4994 - val_accuracy: 0.6900 - val_loss: 0.8817
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.6460 - loss: 0.9565 - val_accuracy: 0.7275 - val_loss: 0.7157
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.6995 - loss: 0.7980 - val_accuracy: 0.7606 - val_loss: 0.6042
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.7196 - loss: 0.7093 - val_accuracy: 0.7869 - val_loss: 0.5180
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7497 - loss: 0.6239 - val_accuracy: 0.8150 - val_loss: 0.4476
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7784 - loss: 0.5548 - val_accuracy: 0.8481 - val_loss: 0.3940
Epoch 7/50
[1m200/200

In [None]:
# Evaluate the trained model on the held-out test split.
test_metrics = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = test_metrics
print(f"\nTest Accuracy: {test_accuracy:.4f}")

In [None]:

# Predict class probabilities, then take the most probable class per row.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class metrics, labelled with the original category names.
class_names = processed_data['target_encoder'].classes_
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=class_names))

In [None]:
def recommend_health_tip(user_data, model, tips_df, preprocessor, top_k=3, num_tips=3):
    """Recommend health tips for a single user.

    Parameters
    ----------
    user_data : sequence of numbers
        One user's feature values, **unscaled**, in the order given by
        ``preprocessor['feature_names']`` (categorical features already
        integer-encoded). The scaler is applied inside this function, so do
        not pass an already standardized row.
    model : object with ``predict``
        Trained classifier; ``model.predict(X)[0]`` must yield one probability
        per class.
    tips_df : pandas.DataFrame
        Tip catalogue with at least the columns ``category``, ``tip_text``,
        ``difficulty_level`` and ``time_required``.
    preprocessor : dict
        Must contain ``'scaler'`` and ``'target_encoder'``; ``'feature_names'``
        is used when present.
    top_k : int, default 3
        Number of highest-probability categories to report (capped at the
        number of classes).
    num_tips : int, default 3
        Number of tips to sample from the top category (capped at the number
        of tips available in that category).

    Returns
    -------
    dict
        ``'top_categories'``: list of ``(category, probability)`` pairs, most
        probable first. ``'recommended_tips'``: list of dicts with keys
        ``tip_text``, ``difficulty_level`` and ``time_required``.

    Raises
    ------
    ValueError
        If ``top_k`` is less than 1 or ``num_tips`` is negative.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if num_tips < 0:
        raise ValueError("num_tips must not be negative")

    # Preprocessing: hand the scaler a one-row frame labelled with the
    # training feature names when they are available. This checks the feature
    # count up front and matches a scaler that was fitted on a DataFrame.
    feature_names = preprocessor.get('feature_names')
    if feature_names is not None:
        user_frame = pd.DataFrame([user_data], columns=feature_names)
    else:
        user_frame = [user_data]
    user_processed = preprocessor['scaler'].transform(user_frame)

    # Prediction: probabilities for the single input row.
    probabilities = np.asarray(model.predict(user_processed)[0])

    # Indices of the top-k classes, highest probability first.
    k = min(top_k, len(probabilities))
    top_categories_idx = np.argsort(probabilities)[::-1][:k]
    top_categories = preprocessor['target_encoder'].inverse_transform(top_categories_idx)
    top_probabilities = probabilities[top_categories_idx]

    # Sample tips from the best category without asking for more rows than
    # exist (DataFrame.sample raises if n exceeds the population).
    category_tips = tips_df[tips_df['category'] == top_categories[0]]
    recommended_tips = category_tips.sample(min(num_tips, len(category_tips)))

    return {
        'top_categories': list(zip(top_categories, top_probabilities)),
        'recommended_tips': recommended_tips[['tip_text', 'difficulty_level', 'time_required']].to_dict('records')
    }

# Example usage with sample user data.
# processed_data['X_test'] holds rows that are ALREADY standardized, while
# recommend_health_tip applies the scaler itself. Passing a scaled row would
# scale it twice and feed the model out-of-distribution input, so map the
# first test row back to its original feature values first.
sample_user = processed_data['scaler'].inverse_transform(processed_data['X_test'][:1])[0]
recommendation = recommend_health_tip(sample_user, model, tips_dataset, processed_data)

print("Health Tip Recommendation:")
print(f"Top categories: {recommendation['top_categories']}")
print("Recommended tips:")
for i, tip in enumerate(recommendation['recommended_tips'], 1):
    print(f"{i}. {tip['tip_text']} ({tip['difficulty_level']}, {tip['time_required']}min)")

In [None]:
# export_artifacts.py  (or notebook cell)
#
# NOTE: this export script is kept DISABLED. It used to be wrapped in a
# triple-quoted string that was never closed, which made the cell a
# SyntaxError; it is now plain comments. Two mistakes in the disabled code are
# corrected below so it works if re-enabled:
#   * the feature order is stored under 'feature_names' (the key returned by
#     preprocess_dataset), not 'feature_columns';
#   * a ".keras" path is the native Keras format, not the SavedModel format.
#
# import joblib, json
# from pathlib import Path
#
# # Replace variable names below with your notebook variable names if different
# # model  -> your trained Keras model variable
# # processed_data -> dict you built that contains label_encoders, scaler, etc.
#
# # Create the main directory if it doesn't exist
# Path("healthsnap_api").mkdir(parents=True, exist_ok=True)
#
# # 1) Save Keras model (native .keras format)
# model.save("healthsnap_api/model_keras.keras")
#
# # 2) Save preprocessing
# Path("healthsnap_api/artifacts").mkdir(parents=True, exist_ok=True)
# joblib.dump(processed_data.get('label_encoders', {}), "healthsnap_api/artifacts/label_encoders.joblib")
# if processed_data.get('scaler', None) is not None:
#     joblib.dump(processed_data['scaler'], "healthsnap_api/artifacts/scaler.joblib")
#
# # 3) Save the feature order the model expects (very important)
# feature_cols = processed_data.get('feature_names', None)
# if feature_cols is None:
#     # If you don't have it, build it manually here from your dataset column order:
#     feature_cols = ["age", "gender", "weight", "height", "bmi", "health_goal",
#         "sleep_hours", "water_intake", "exercise_minutes", "daily_steps",
#         "screen_time_hours", "clicks_last_7_days", "completion_rate",
#         "avg_tip_rating", "preferred_category", "days_since_registration",
#         "days_since_last_engagement", "headache", "fatigue", "stress",
#         "muscle_pain", "digestive_issues", "poor_sleep", "low_energy",
#         "anxiety", "back_pain", "neck_pain"]
# with open("healthsnap_api/artifacts/feature_columns.json","w") as f:
#     json.dump(feature_cols, f)
#
# print("Saved model and artifacts to ./healthsnap_api/")

In [None]:
# Write the trained model to "model.keras" (path is relative to the current working directory).
model.save("model.keras")



import pickle

In [None]:
import pickle
# Pickle the fitted target encoder (note: despite the file name, this is the
# encoder for the target labels, not the per-feature label encoders).
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(processed_data['target_encoder'], encoder_file)


In [None]:
import joblib
import pickle
# Persist the fitted preprocessing objects with joblib.
for artifact_key, artifact_path in (
    ('scaler', "scaler.joblib"),
    ('label_encoders', "label_encoders.joblib"),
):
    joblib.dump(processed_data[artifact_key], artifact_path)

# The target encoder is pickled separately.
with open("target_encoder.pkl", "wb") as encoder_file:
    pickle.dump(processed_data['target_encoder'], encoder_file)
import json
# Record the feature column order as a JSON list.
with open("feature_columns.json", "w") as columns_file:
    columns_file.write(json.dumps(processed_data['feature_names']))


In [None]:
import pickle
try:
    # Serialize in memory first: if pickling the model fails, no file has been
    # opened yet, so a failed attempt cannot leave an empty or truncated
    # model.pkl behind.
    model_bytes = pickle.dumps(model)
    with open("model.pkl", "wb") as f:
        f.write(model_bytes)
    print("Model saved as model.pkl (with potential limitations)")
except Exception as e:
    # Best-effort export: report the failure instead of aborting the notebook.
    print(f"Error saving model as pickle: {e}")
    print("Saving Keras models as .keras or .h5 is the recommended approach.")

In [None]:

# Save the trained model, then pickle each fitted preprocessing object.
model.save("model.keras")

import pickle
pickle_targets = (
    ("scaler.pkl", 'scaler'),
    ("target_encoder.pkl", 'target_encoder'),
    ("label_encoders.pkl", 'label_encoders'),
)
for pickle_path, artifact_key in pickle_targets:
    with open(pickle_path, "wb") as artifact_file:
        pickle.dump(processed_data[artifact_key], artifact_file)
