<a href="https://colab.research.google.com/github/snehanshastri/SEHAT_ML_APP/blob/main/SEHAT_ML_APP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# ---------------------------------------------------------------------------
# Load the survey CSV and normalise the column names: trim surrounding
# whitespace, then replace inner spaces with underscores.
# ---------------------------------------------------------------------------
df = pd.read_csv("/content/enhanced_women_nutrition_dataset_fixed.csv")
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(" ", "_")

# Ordinal encodings. Series.map turns any label absent from the mapping into
# NaN; such rows are removed further down by dropna().
activity_codes = {"No Activity": 0, "Light": 1, "Moderate": 2, "Heavy": 3}
age_group_codes = {"18-30": 0, "31-45": 1, "46-60": 2, "60+": 3}
df["Activity_Level"] = df["Physical_Activity"].map(activity_codes)
df["Age_Group"] = df["Age_Group"].map(age_group_codes)

# BMI = weight / height^2, with Height_cm converted from centimetres to metres.
height_m = df["Height_cm"] / 100
df["BMI"] = df["Weight_kg"] / (height_m ** 2)

# Nominal columns are integer-coded with one LabelEncoder per column; the
# fitted encoders are kept so later input can be encoded the same way.
categorical_features = [
    "Occupation",
    "Residence_Type",
    "Diet_Type",
    "Menstrual_Health",
    "Health_Issues",
]
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    # Missing entries are labelled "Unknown" before encoding.
    as_text = df[col].fillna("Unknown").astype(str)
    df[col] = le.fit_transform(as_text)

    # Guarantee that "Unknown" is an accepted label even when the column had
    # no missing values. It is appended at the end, so the codes already
    # written to df[col] are not renumbered.
    if "Unknown" not in le.classes_:
        le.classes_ = np.append(le.classes_, "Unknown")
    label_encoders[col] = le

# Columns fed to the clustering models, in the order the models see them.
features = [
    "Age_Group",
    "BMI",
    "Meals_Per_Day",
    "Activity_Level",
    "Occupation",
    "Residence_Type",
    "Diet_Type",
    "Menstrual_Health",
    "Health_Issues",
]
# Rows with any missing feature are excluded; X keeps df's index labels.
X = df.loc[:, features].dropna()


# Standardise every feature column of X (zero mean, unit variance) so that no
# single feature dominates the distance / likelihood computations below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Print first 5 rows of scaled data for cross verification
print("First 5 rows of scaled data:\n", X_scaled[:5])


# Hierarchical Clustering (Ward linkage, cut into at most 4 flat clusters).
# X was built with dropna(), so it may hold fewer rows than df. Labels are
# therefore wrapped in a Series carrying X's index and aligned on assignment;
# rows of df that were excluded from X receive NaN instead of raising a
# length-mismatch error.
linkage_matrix = linkage(X_scaled, method='ward')
df["Cluster"] = pd.Series(
    fcluster(linkage_matrix, t=4, criterion='maxclust'),
    index=X.index,
    name="Cluster",
)

# Gaussian Mixture Model
# n_init=10 runs ten initialisations and keeps the best fit; random_state
# makes the result repeatable.
gmm = GaussianMixture(n_components=4, random_state=42, n_init=10)
gmm_labels = pd.Series(gmm.fit_predict(X_scaled), index=X.index, name="GMM_Cluster")
df["GMM_Cluster"] = gmm_labels


# Profiles, sizes and quality metrics are computed from the rows that were
# actually clustered (X / X_scaled with their labels), so they stay valid even
# when df contains rows without a cluster assignment.
cluster_profiles = X.groupby(gmm_labels).mean()
print("\nCluster sizes:\n", gmm_labels.value_counts())
silhouette_avg = silhouette_score(X_scaled, gmm_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, gmm_labels)
davies_bouldin = davies_bouldin_score(X_scaled, gmm_labels)


print(f"Silhouette Score: {silhouette_avg:.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")

# Advice text shown to the user, keyed by GMM cluster label (0-3).
# NOTE(review): the wording of each entry assumes a particular cluster profile
# (e.g. "higher BMI"); confirm it still matches cluster_profiles after any
# refit, since the texts are fixed while the fitted clusters are not.
cluster_recommendations = {
    0: (
        "Cluster 0: Focus on balanced meals with adequate protein, complex carbohydrates, and healthy fats. \n"
        " Given your likely younger age and low activity, prioritize nutrient-dense foods and light exercise."
    ),
    1: (
        "Cluster 1: You are likely moderately active and of average BMI, so emphasize whole foods, lean proteins, and a variety of fruits and vegetables.\n"
        " Balance your calorie intake with your activity level to maintain a healthy weight.\n"
        " Incorporate regular moderate intensity exercise"
    ),
    2: (
        "Cluster 2: Given your higher BMI and lower activity levels, concentrate on portion control, fiber-rich foods, and reduced intake of processed sugars and unhealthy fats.\n"
        " Increase your activity levels gradually to boost metabolism.\n"
        " Focus on whole grains, lean proteins, and a variety of low-glycemic fruits and vegetables."
    ),
    3: (
        "Cluster 3: Being active, you should maintain a diet rich in protein to support muscle recovery and energy.\n"
        " Ensure you get enough complex carbs for fuel and healthy fats for overall well-being.\n"
        " Hydration and electrolytes are essential for peak performance."
    ),
}


print("\nCluster Profiles:\n", cluster_profiles)


def _encode_category(column, answer):
    """Return the integer code of `answer` for a label-encoded column.

    Answers that the column's fitted encoder does not know are encoded as
    "Unknown", mirroring how missing values were labelled during training.
    """
    encoder = label_encoders[column]
    label = answer if answer in encoder.classes_ else "Unknown"
    return encoder.transform([label])[0]


# Collect the user's details. Free-text answers are stripped so that stray
# leading/trailing spaces do not turn a valid answer into "Unknown".
age_group = input("Enter Age Group (18-30/31-45/46-60/60+): ").strip()
height = float(input("Enter Height in cm: "))
weight = float(input("Enter Weight in kg: "))
meals = int(input("Meals Per Day: "))
activity = int(input("Physical Activity (0-No, 1-Light, 2-Moderate, 3-Heavy): "))
occupation = input("Enter Occupation (Student/Office Worker/Athlete/Homemaker): ").strip()
residence = input("Enter Residence Type (Urban/Rural/Coastal/Mountain): ").strip()
diet_type = input("Enter Diet Type (Vegetarian/Non-Vegetarian/Vegan): ").strip()
menstrual_health = input("Enter Menstrual Health Issue (None/PCOS/Irregular/Heavy Flow): ").strip()
health_issues = input("Enter Any Health Issues (None/Diabetes/Anemia/Hypertension): ").strip()

# Fail with a clear message instead of a ZeroDivisionError (height 0) or a
# meaningless non-positive BMI.
if height <= 0 or weight <= 0:
    raise ValueError("Height and weight must be positive numbers.")
bmi = weight / ((height / 100) ** 2)

# Same ordinal coding as the training data. An unrecognised answer still
# falls back to code 0 (18-30), but the user is now told about it.
age_group_codes = {"18-30": 0, "31-45": 1, "46-60": 2, "60+": 3}
if age_group not in age_group_codes:
    print(f"Unrecognised age group {age_group!r}; defaulting to 18-30.")
age_numeric = age_group_codes.get(age_group, 0)


# Values are listed in the same order as `features`, which is the column
# order the scaler and the mixture model were fitted on.
categorical_answers = {
    "Occupation": occupation,
    "Residence_Type": residence,
    "Diet_Type": diet_type,
    "Menstrual_Health": menstrual_health,
    "Health_Issues": health_issues,
}
user_inputs = [age_numeric, bmi, meals, activity] + [
    _encode_category(column, answer) for column, answer in categorical_answers.items()
]

# The scaler was fitted on a DataFrame, so hand it a DataFrame with the same
# column names; this avoids scikit-learn's feature-name mismatch warning and
# makes the column correspondence explicit.
user_frame = pd.DataFrame([user_inputs], columns=features)
user_data = scaler.transform(user_frame)
user_cluster = gmm.predict(user_data)[0]

recommendation = f"{cluster_recommendations[user_cluster]} \n Your cluster has the following average characteristics: {cluster_profiles.loc[user_cluster].to_dict()}"

print(f"\nYour Calculated BMI: {bmi:.2f}")
print(f"You belong to Cluster {user_cluster}.")
print("Recommended Actions:")
print(recommendation)

First 5 rows of scaled data:
 [[ 0.35491198  0.95046986 -0.05190556  0.40192985  1.36674228  0.4216487
  -0.05423987  0.4275089  -0.40308116]
 [ 1.2247943   1.14584977  0.64952092 -1.36868622 -1.34504796 -1.40367034
   1.17848437 -0.47440861 -1.30281588]
 [-1.38485264 -0.25887716 -0.75333204  0.40192985 -0.6671004   1.33430822
   1.17848437 -1.37632612  1.39638829]
 [ 0.35491198  0.39334951  1.35094741 -0.48337819  0.68879472 -1.40367034
  -1.2869641   1.32942641 -1.30281588]
 [ 0.35491198 -1.16823794 -1.45475852  0.40192985  0.01084716 -1.40367034
   1.17848437  1.32942641 -1.30281588]]

Cluster sizes:
 GMM_Cluster
2    148
3    122
0    116
1    114
Name: count, dtype: int64
Silhouette Score: 0.0427
Calinski-Harabasz Index: 28.6806
Davies-Bouldin Index: 4.5491

Cluster Profiles:
              Age_Group        BMI  Meals_Per_Day  Activity_Level  Occupation  \
GMM_Cluster                                                                    
0             0.491379  24.704999       3.00862

