In [41]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(42)

# Sample size
n_samples = 100

# Generate synthetic gut microbiome profiles as abundance percentages for various bacteria types
gut_microbes = {
    'Bacteroides': np.random.normal(30, 5, n_samples),
    'Firmicutes': np.random.normal(40, 10, n_samples),
    'Lactobacillus': np.random.normal(10, 2, n_samples),
    'Proteobacteria': np.random.normal(5, 1, n_samples),
    'Actinobacteria': np.random.normal(5, 1, n_samples),
    'Verrucomicrobia': np.random.normal(3, 0.5, n_samples),
    'Others': 100 - (np.random.normal(30, 5, n_samples) +
                     np.random.normal(40, 10, n_samples) +
                     np.random.normal(10, 2, n_samples) +
                     np.random.normal(5, 1, n_samples) +
                     np.random.normal(5, 1, n_samples) +
                     np.random.normal(3, 0.5, n_samples))
}
# Demographics and Health Data
demographics = {
    'Age': np.random.randint(20, 70, n_samples),
    'Gender': np.random.choice(['Male', 'Female'], n_samples),
    'BMI': np.round(np.random.normal(25, 3, n_samples), 1)
}

# Dietary Intake Preferences (e.g., high-protein, high-carb, etc.)
dietary_prefs = {
    'Protein_Intake': np.random.choice(['Low', 'Moderate', 'High'], n_samples, p=[0.3, 0.5, 0.2]),
    'Fiber_Intake': np.random.choice(['Low', 'Moderate', 'High'], n_samples, p=[0.2, 0.6, 0.2]),
    'Carbohydrate_Intake': np.random.choice(['Low', 'Moderate', 'High'], n_samples, p=[0.4, 0.4, 0.2]),
    'Fat_Intake': np.random.choice(['Low', 'Moderate', 'High'], n_samples, p=[0.3, 0.5, 0.2])
}

# Allergies Information
allergies = {
    'Dairy_Allergy': np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
    'Gluten_Allergy': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
    'Nut_Allergy': np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
}

# Combine all data into a DataFrame
df = pd.DataFrame({**gut_microbes, **demographics, **dietary_prefs, **allergies})

# Define the logic for the Suggested_Meal_Plan based on the generated data
def suggest_meal_plan(row):
    if row['Carbohydrate_Intake'] == 'Low' or row['BMI'] > 30:
        return 'Keto-Friendly Low-Sugar Plan'
    elif row['Fiber_Intake'] == 'High' or row['Age'] > 50:
        return 'Gut-Health Fiber Boost'
    elif row['Bacteroides'] > df['Bacteroides'].median() and row['Firmicutes'] > df['Firmicutes'].median():
        return 'Mediterranean Gut Balance'
    elif 18.5 <= row['BMI'] <= 24.9:
        return 'Metabolic Maintenance Diet'
    elif row['Actinobacteria'] > df['Actinobacteria'].median() and row['Lactobacillus'] > df['Lactobacillus'].median() and row['Dairy_Allergy'] == 1:
        return 'Dairy-Free Anti-Inflammation Plan'
    elif row['Gluten_Allergy'] == 1 and row['Fiber_Intake'] == 'Low':
        return 'Digestive Relief (Low-FODMAP)'
    elif row['Age'] > 45 and row['Firmicutes'] > df['Firmicutes'].median() and row['Fat_Intake'] in ['Moderate', 'Low']:
        return 'Heart Health Plus'
    else:
        return 'Metabolic Maintenance Diet'

# Apply the suggestion logic to each row
df['Suggested_Meal_Plan'] = df.apply(suggest_meal_plan, axis=1)

# Display first few rows of the dataset
df.head()

Unnamed: 0,Bacteroides,Firmicutes,Lactobacillus,Proteobacteria,Actinobacteria,Verrucomicrobia,Others,Age,Gender,BMI,Protein_Intake,Fiber_Intake,Carbohydrate_Intake,Fat_Intake,Dairy_Allergy,Gluten_Allergy,Nut_Allergy,Suggested_Meal_Plan
0,32.483571,25.846293,10.715575,4.171005,3.405572,3.463089,4.298686,64,Male,21.7,Low,Moderate,Moderate,High,0,0,0,Gut-Health Fiber Boost
1,29.308678,35.793547,11.121569,4.439819,4.400625,3.954708,3.069657,61,Female,18.5,Moderate,High,Moderate,Moderate,0,0,0,Gut-Health Fiber Boost
2,33.238443,36.572855,12.166102,5.747294,5.005244,2.300716,8.370603,66,Male,26.2,Low,Low,Moderate,Moderate,0,0,0,Gut-Health Fiber Boost
3,37.615149,31.977227,12.107604,5.61037,5.046981,3.281485,14.669254,32,Female,32.5,Moderate,Moderate,Low,High,1,0,0,Keto-Friendly Low-Sugar Plan
4,28.829233,38.387143,7.244661,4.979098,4.549935,2.674679,19.924696,41,Female,25.0,Moderate,High,Moderate,Low,0,0,0,Gut-Health Fiber Boost


In [43]:
# Save the DataFrame to a CSV file
df.to_csv('microbe_gut_health_data.csv', index=False)

print("Gut Microbe dataset generated and saved as 'microbe_gut_health_data.csv'.")

Gut Microbe dataset generated and saved as 'microbe_gut_health_data.csv'.


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectFromModel

# Seed for reproducibility
np.random.seed(42)

# Encode categorical columns using LabelEncoder
label_columns = ['Gender', 'Protein_Intake', 'Fiber_Intake', 'Carbohydrate_Intake', 
                 'Fat_Intake', 'Suggested_Meal_Plan']

# Initialize LabelEncoder
encoder = LabelEncoder()

# Apply encoding to all categorical columns
for col in label_columns:
    df[col] = encoder.fit_transform(df[col])

# Separate features (X) and target variable (y)
X = df.drop(columns=['Suggested_Meal_Plan'])  # All columns except target
y = df['Suggested_Meal_Plan']  # Target column (Suggested_Meal_Plan)

In [47]:
# Train RandomForest to get feature importances
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Get feature importances
importances = rf_model.feature_importances_

# Sort features by importance
indices = np.argsort(importances)[::-1]

# Print feature ranking
print("Feature ranking (by importance):")
for i in range(X.shape[1]):
    print(f"{i+1}. {X.columns[indices[i]]} ({importances[indices[i]]})")

# Step 3: Select the top N important features (for example, top 10)
top_n_features = 7  # You can change this to select more or fewer features
selected_features = X.columns[indices[:top_n_features]]
print(f"\nSelected Top {top_n_features} Features: {selected_features.tolist()}")


Feature ranking (by importance):
1. Carbohydrate_Intake (0.21878884316020422)
2. Age (0.1389793765266559)
3. Bacteroides (0.08957288065253959)
4. Firmicutes (0.08257067985811098)
5. Proteobacteria (0.07568012183332457)
6. Lactobacillus (0.06579460807835502)
7. Verrucomicrobia (0.05941567663044929)
8. Others (0.05592719940614672)
9. BMI (0.05187953214950239)
10. Actinobacteria (0.04528907832140786)
11. Protein_Intake (0.030900799628147586)
12. Fiber_Intake (0.026908213794099998)
13. Fat_Intake (0.024904009540350377)
14. Dairy_Allergy (0.010570862871416177)
15. Gender (0.010182297593628861)
16. Gluten_Allergy (0.00649461808778917)
17. Nut_Allergy (0.006141201867871363)

Selected Top 7 Features: ['Carbohydrate_Intake', 'Age', 'Bacteroides', 'Firmicutes', 'Proteobacteria', 'Lactobacillus', 'Verrucomicrobia']


In [49]:
# Step 4: Split the data into training and testing sets (80% train, 20% test)
X_selected = X[selected_features]  # Use only selected important features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Step 5: Standardize the features (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
# # Step 6: Train the model on selected features and scaled data
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train_scaled, y_train)

# # Step 7: Make predictions on the test set
# y_pred = rf_model.predict(X_test_scaled)

# # Step 8: Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.4f}")
# print(classification_report(y_test, y_pred))

Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.75      0.86      0.80         7
           1       1.00      0.86      0.92         7
           2       0.67      1.00      0.80         2
           3       0.67      0.50      0.57         4

    accuracy                           0.80        20
   macro avg       0.77      0.80      0.77        20
weighted avg       0.81      0.80      0.80        20



In [53]:
# # Example: Predict the meal plan for the first 5 rows in the test set
# new_data = X_test_scaled[:5]
# predictions = rf_model.predict(new_data)

# # Decode the predicted meal plans back to their original labels
# predicted_meal_plans = encoder.inverse_transform(predictions)
# print(predicted_meal_plans)

In [55]:
# # Step 1: Get the predicted values for the test set
# y_pred = rf_model.predict(X_test_scaled)

# # Step 2: Decode the predicted meal plans back to their original labels
# predicted_meal_plans = encoder.inverse_transform(y_pred)

# # Step 3: Decode the actual meal plans from the test set
# actual_meal_plans = encoder.inverse_transform(y_test)

# # Step 4: Create a DataFrame to compare actual vs. predicted values
# comparison_df = pd.DataFrame({
#     'Actual Meal Plan': actual_meal_plans,
#     'Predicted Meal Plan': predicted_meal_plans
# })

# # Step 5: Print out the comparison of the first few rows
# print(comparison_df.head())


In [57]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Train the model on the scaled features
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.8500
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       1.00      0.86      0.92         7
           2       0.67      1.00      0.80         2
           3       0.75      0.75      0.75         4

    accuracy                           0.85        20
   macro avg       0.82      0.87      0.83        20
weighted avg       0.87      0.85      0.85        20



In [59]:
# Predict with the XGBoost model on the test data
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Decode the predicted labels (back to original labels)
predicted_meal_plans = encoder.inverse_transform(y_pred_xgb)

# Decode the actual meal plans (test set labels)
actual_meal_plans = encoder.inverse_transform(y_test)

# Create a DataFrame to compare actual vs predicted values
comparison_df = pd.DataFrame({
    'Actual Meal Plan': actual_meal_plans,
    'Predicted Meal Plan': predicted_meal_plans
})

# Display the comparison for the first few rows
print(comparison_df.head())


               Actual Meal Plan         Predicted Meal Plan
0  Keto-Friendly Low-Sugar Plan      Gut-Health Fiber Boost
1     Mediterranean Gut Balance   Mediterranean Gut Balance
2        Gut-Health Fiber Boost      Gut-Health Fiber Boost
3    Metabolic Maintenance Diet  Metabolic Maintenance Diet
4    Metabolic Maintenance Diet  Metabolic Maintenance Diet
