In [35]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Step 1: Read the raw food-composition table
data = pd.read_csv('tkpi.csv')

# Preview the raw rows before any cleaning
print("Original Dataset:\n", data.head())

# Steps 2-4: Build the cleaned table in a single chain:
#   - drop index 0 (in the raw file this row carries unit labels, not a food item)
#   - drop rows without a 'KODE' so the string filters below do not see NaN
#   - keep codes whose first letter is A-H (either case)
#   - exclude names containing 'anak', 'babi' or 'darah' (case-insensitive)
df_cleaned = (
    data.drop([0])
    .dropna(subset=['KODE'])
    .loc[lambda frame: frame['KODE'].str.match(r'^[A-Ha-h]')]
    .loc[lambda frame: ~frame['NAMA BAHAN'].str.contains(r'anak|babi|darah', case=False, na=False)]
)

# Step 5: Keep the identifying columns plus the five nutrient columns
desired_columns = ['KODE', 'NAMA BAHAN', 'SUMBER', 'ENERGI', 'PROTEIN', 'LEMAK', 'KH', 'SERAT']
df_final = df_cleaned[desired_columns]

# Step 6: Persist the filtered table
df_final.to_csv('tkpi_filtered.csv', index=False)

# Step 7: Read it back so pandas re-infers the column dtypes from the clean file
filtered_data = pd.read_csv('tkpi_filtered.csv')

# Preview the filtered rows
print("Filtered Dataset:\n", filtered_data.head())

# Step 8: Nutrient columns used as clustering features
numeric_columns = ['ENERGI', 'PROTEIN', 'LEMAK', 'KH', 'SERAT']

# Step 9: Coerce the features to numeric; values that cannot be parsed become NaN
df_numeric = filtered_data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Step 10: Impute missing values with each column's mean
df_numeric = df_numeric.fillna(df_numeric.mean())

# Step 11: Standardise the features so no single nutrient dominates the distance metric
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# Step 12: Fit K-Means with 3 clusters and attach the label to every row
kmeans = KMeans(n_clusters=3, random_state=0)
filtered_data['Cluster'] = kmeans.fit_predict(df_scaled)

# Step 13: Overwrite the filtered CSV with the clustered version.
# The file name must stay 'tkpi_filtered.csv': the next cell reads that file
# and expects the 'Cluster' column to be present. The message below now
# reports the file that is actually written.
filtered_data.to_csv('tkpi_filtered.csv', index=False)
print("Clustered data saved to 'tkpi_filtered.csv'")

# Step 14: Preview the clustered rows
print("Clustered Dataset:\n", filtered_data.head())

# Step 15: Show the cluster centroids (in standardised feature space)
print("New Cluster Centroids:\n", kmeans.cluster_centers_)

print(f"Number of rows in the original dataset (tkpi.csv): {len(data)}")
print(f"Number of rows in the filtered dataset (tkpi_filtered.csv): {len(df_final)}")



Original Dataset:
     NO   KODE                         NAMA BAHAN       SUMBER   AIR ENERGI  \
0  NaN    NaN                                NaN          NaN   (g)  (Kal)   
1    1  AR001               Beras giling, mentah   KZGMI-2001    12    357   
2    2  AR002    Beras giling var pelita, mentah  KZGPI- 1990  11.4    369   
3    3  AR003  Beras giling var rojolele, mentah  KZGPI- 1990    12    357   
4    4  AR004                Beras hitam, mentah   KZGMI-2001  12.9    351   

  PROTEIN LEMAK    KH SERAT  ... TEMBAGA  SENG RETINOL  B-KAR KAR-TOTAL  \
0     (g)   (g)   (g)   (g)  ...    (mg)  (mg)   (mcg)  (mcg)     (mcg)   
1     8.4   1.7  77.1   0.2  ...     0.1   0.5       0      0         0   
2     9.5   1.4  77.1   0.4  ...       0     0       0      0         0   
3     8.4   1.7  77.1   0.2  ...    0.14   0.1     NaN      0        80   
4       8   1.3  76.9  20.1  ...     0.1   1.6       0      0         0   

  THIAMIN RIBOFLAVIN NIASIN VIT_C Unnamed: 25  
0    (mg)    

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.cluster import KMeans

# Estimate daily calorie needs: basal metabolic rate scaled by activity level
def calculate_bmr(sex, age, weight, height, activity_level):
    """Return the basal metabolic rate multiplied by an activity factor.

    Despite the name, the returned value already includes the activity
    multiplier, so it is a daily energy estimate rather than a bare BMR.

    Args:
        sex: 'male' selects the male formula; any other value uses the
            female formula. Matching ignores case and surrounding whitespace.
        age: Age (presumably in years -- confirm against the formula source).
        weight: Body weight in kg.
        height: Body height in cm.
        activity_level: One of 'sedentary', 'light', 'moderate', 'active',
            'very active' (case-insensitive). Unknown values fall back to
            the sedentary multiplier 1.2.

    Returns:
        float: Estimated daily calorie requirement.
    """
    # Normalise text inputs so that e.g. 'Male' is not silently routed to the
    # female formula and 'Light' does not silently fall back to sedentary.
    if isinstance(sex, str):
        sex = sex.strip().lower()
    if isinstance(activity_level, str):
        activity_level = activity_level.strip().lower()

    # NOTE(review): coefficients look like the revised Harris-Benedict equation.
    if sex == 'male':
        bmr = 88.362 + (13.397 * weight) + (4.799 * height) - (5.677 * age)
    else:
        bmr = 447.593 + (9.247 * weight) + (3.098 * height) - (4.330 * age)

    # Scale the basal rate by the activity level
    activity_multiplier = {
        'sedentary': 1.2,
        'light': 1.375,
        'moderate': 1.55,
        'active': 1.725,
        'very active': 1.9
    }
    bmr *= activity_multiplier.get(activity_level, 1.2)

    return bmr

# Adjust the calorie requirement for the diet goal
def adjust_bmr_for_goal(bmr, goal):
    """Scale a calorie requirement according to the diet goal.

    Args:
        bmr: Calorie requirement to adjust.
        goal: 'lose' cuts the value by 20%, 'gain' raises it by 20%;
            any other value (e.g. 'maintain') leaves it unchanged.

    Returns:
        The adjusted calorie requirement.
    """
    losing = goal == 'lose'
    gaining = goal == 'gain'
    if not (losing or gaining):
        # Maintain weight: hand the input back untouched
        return bmr
    return bmr * (0.8 if losing else 1.2)

# Load the filtered food table; the code below reads its 'KODE', nutrient
# and 'Cluster' columns, so the file must already contain them.
data = pd.read_csv('tkpi_filtered.csv')  

# Data preprocessing: numeric clean-up followed by standardisation
def preprocess_data_with_kmeans(data):
    """Convert the nutrient columns to float and standardise them with 'Cluster'.

    No clustering is performed here: the 'Cluster' column must already be
    present in ``data`` and is simply scaled alongside the nutrients.

    Side effect: the five nutrient columns of the caller's ``data`` frame are
    converted to float in place (decimal commas in string columns are turned
    into dots first).

    Args:
        data: DataFrame with columns 'ENERGI', 'PROTEIN', 'LEMAK', 'KH',
            'SERAT' and 'Cluster'.

    Returns:
        tuple: ``(scaled_df, scaler)`` where ``scaled_df`` holds the
        standardised six columns for the rows that have no missing value in
        any column (with a fresh 0..n-1 index), and ``scaler`` is the fitted
        StandardScaler.
    """
    nutrient_cols = ['ENERGI', 'PROTEIN', 'LEMAK', 'KH', 'SERAT']

    for name in nutrient_cols:
        values = data[name]
        if values.dtype == 'object':
            # String column: swap decimal commas for dots before casting
            values = values.str.replace(',', '.')
        data[name] = values.astype(float)

    # Rows with a missing value in any column are left out of the scaled output
    complete_rows = data.dropna()

    feature_cols = nutrient_cols + ['Cluster']
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(
        scaler.fit_transform(complete_rows[feature_cols]),
        columns=feature_cols,
    )

    return scaled_df, scaler

# Standardise the nutrient and 'Cluster' columns. This call also converts the
# nutrient columns of `data` to float in place, which later cells rely on.
processed_data, scaler = preprocess_data_with_kmeans(data)

# Split into features (X) and target (y). Both come from the standardised
# frame, so the target -- and any error metric computed on it -- is in
# standardised units rather than raw ENERGI values.
X = processed_data[['PROTEIN', 'LEMAK', 'KH', 'SERAT','Cluster']] 
y = processed_data['ENERGI']  

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model: random forest regressor
def train_random_forest(X_train, y_train):
    """Fit and return a 100-tree RandomForestRegressor (seed 42).

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted RandomForestRegressor.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    return forest.fit(X_train, y_train)

rf_model = train_random_forest(X_train, y_train)

# Model evaluation on held-out data
def evaluate_model(model, X_test, y_test):
    """Print and return the mean squared error of ``model`` on the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        The mean squared error between ``y_test`` and the predictions.
    """
    mse = mean_squared_error(y_test, model.predict(X_test))
    print(f"Mean Squared Error (MSE): {mse}")
    return mse

evaluate_model(rf_model, X_test, y_test)

# Hyper-parameter tuning with an exhaustive grid search
def optimize_model(X_train, y_train):
    """Grid-search random forest hyper-parameters with 5-fold cross-validation.

    Searches 3 x 3 x 3 combinations of ``n_estimators``, ``max_depth`` and
    ``min_samples_split``, prints the best combination and returns the
    estimator refitted with it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The best estimator found by the search.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        cv=5,
        n_jobs=-1,  # use every available core
        verbose=2,
    )
    searcher.fit(X_train, y_train)

    print("Best Parameters:", searcher.best_params_)
    return searcher.best_estimator_

optimized_rf_model = optimize_model(X_train, y_train)

# Score the tuned model on the same held-out split as the baseline
evaluate_model(optimized_rf_model, X_test, y_test)

# Pick random foods from the given categories until a calorie target is reached
def find_foods_by_category(target_calories, data, category_prefix, max_items=5, random_state=None):
    """Randomly select distinct foods from a category up to a calorie target.

    Foods whose 'KODE' starts with one of ``category_prefix`` are shuffled
    once and taken in that order until the running 'ENERGI' total reaches
    ``target_calories`` or ``max_items`` foods have been taken. The food that
    crosses the target is included. Shuffling once, rather than drawing one
    row per loop iteration, guarantees that no food is selected twice; as a
    consequence, a category with few rows may yield fewer than ``max_items``
    foods without reaching the target.

    Args:
        target_calories: Calorie total to reach. Non-positive (or NaN)
            targets select nothing.
        data: DataFrame with at least 'KODE' and 'ENERGI' columns.
        category_prefix: A prefix string or an iterable of prefix strings.
        max_items: Maximum number of foods to return.
        random_state: Optional seed / RandomState forwarded to
            ``DataFrame.sample`` for reproducible selections.

    Returns:
        DataFrame containing the selected rows (original index labels kept).
        When nothing is selected the frame is empty but keeps ``data``'s
        columns.
    """
    # A bare string would otherwise be split into single characters by tuple()
    if isinstance(category_prefix, str):
        prefixes = (category_prefix,)
    else:
        prefixes = tuple(category_prefix)

    # na=False: rows with a missing code never match instead of breaking the mask
    category_data = data[data['KODE'].str.startswith(prefixes, na=False)]

    if category_data.empty:
        print(f"No data available for categories {category_prefix}")
        return category_data  # empty, but with the expected columns

    if not target_calories > 0:
        return category_data.iloc[0:0]

    # One shuffle gives sampling without replacement across the whole selection
    shuffled = category_data.sample(frac=1, replace=False, random_state=random_state)

    # Missing energy values count as 0, matching how Series.sum() treats NaN
    energy = shuffled['ENERGI'].fillna(0).to_numpy(dtype=float)
    total_before = np.concatenate(([0.0], np.cumsum(energy)[:-1]))

    # A row is taken while the total accumulated *before* it is still below
    # the target and the item limit has not been reached.
    keep = (total_before < target_calories) & (np.arange(len(energy)) < max_items)
    # Enforce "stop at the first failure" even if an energy value is negative
    keep = np.logical_and.accumulate(keep)

    return shuffled[keep]

# Recommend a day's meals with balanced portions
def _energy_total(foods):
    """Sum the 'ENERGI' column; a frame without that column counts as 0."""
    return foods['ENERGI'].sum() if 'ENERGI' in foods.columns else 0


def _select_meal_foods(meal_calories, data, protein_pool, karbo_pool):
    """Pick the foods of one main meal, splitting its calories 30/20/30/20.

    The shares go, in order, to: codes F/G/H from ``protein_pool``, code C
    from ``protein_pool``, code D from ``data`` (at most 2 items) and codes
    AR/BR from ``karbo_pool``.
    """
    portions = [
        (0.3, protein_pool, ['F', 'G', 'H'], {}),   # "protein" share
        (0.2, protein_pool, ['C'], {}),             # "nabati" share
        (0.3, data, ['D'], {'max_items': 2}),       # "sayur" share
        (0.2, karbo_pool, ['AR', 'BR'], {}),        # "karbo" share
    ]
    picks = [
        find_foods_by_category(meal_calories * share, pool, category_prefix=prefixes, **extra)
        for share, pool, prefixes, extra in portions
    ]
    return pd.concat(picks)


def _trim_to_budget(meals, calorie_budget):
    """Keep foods, in serving order, only while the running total fits the budget."""
    trimmed = []
    consumed = 0.0
    for foods in meals:
        if 'ENERGI' not in foods.columns:
            # Column-less empty selection: nothing to trim
            trimmed.append(foods)
            continue
        running = consumed + foods['ENERGI'].fillna(0).cumsum().to_numpy()
        trimmed.append(foods[running <= calorie_budget])
        if len(running):
            consumed = running[-1]
    return trimmed


def recommend_balanced_meals(bmr, data, protein_threshold=15, carb_threshold=20, fat_threshold=20, fiber_threshold=20):
    """Build breakfast, lunch, dinner and snack selections for a calorie budget.

    The budget is split 25% / 35% / 30% / 10% over breakfast, lunch, dinner
    and snacks. Foods are drawn at random via ``find_foods_by_category``.
    If the combined selection exceeds ``bmr``, foods are dropped in serving
    order (breakfast first, snacks last) once the running total would pass
    the budget, so the returned meals always match the printed total.

    Args:
        bmr: Daily calorie budget.
        data: Food table with 'KODE', 'ENERGI', 'PROTEIN', 'KH', 'LEMAK' and
            'SERAT' columns (numeric nutrient values).
        protein_threshold: Minimum 'PROTEIN' for the protein pool.
        carb_threshold: Maximum 'KH' for the protein pool.
        fat_threshold: Maximum 'LEMAK' for the protein pool.
        fiber_threshold: Maximum 'SERAT' for the protein pool.

    Returns:
        tuple: ``(breakfast_foods, lunch_foods, dinner_foods, fruit_snacks)``
        DataFrames.
    """
    breakfast_calories = bmr * 0.25
    lunch_calories = bmr * 0.35
    dinner_calories = bmr * 0.3
    snack_calories = bmr * 0.1

    # Pool for the protein selections: high protein, limited carbs/fat/fibre
    high_protein_low_carb = data[
        (data['PROTEIN'] >= protein_threshold) &
        (data['KH'] <= carb_threshold) &
        (data['LEMAK'] <= fat_threshold) &
        (data['SERAT'] <= fiber_threshold)
    ]

    # Carbohydrate staples are deliberately not subject to the filter above
    karbo_data = data[data['KODE'].str.startswith(('A', 'B'))]

    # Selection order (breakfast, lunch, dinner, snacks) is kept so that a
    # seeded random stream yields the same menu as before.
    breakfast_foods = _select_meal_foods(breakfast_calories, data, high_protein_low_carb, karbo_data)
    lunch_foods = _select_meal_foods(lunch_calories, data, high_protein_low_carb, karbo_data)
    dinner_foods = _select_meal_foods(dinner_calories, data, high_protein_low_carb, karbo_data)
    fruit_snacks = find_foods_by_category(snack_calories, data, category_prefix=['ER'], max_items=3)

    meals = [breakfast_foods, lunch_foods, dinner_foods, fruit_snacks]
    total_calories = sum(_energy_total(foods) for foods in meals)

    # Keep the total within the budget: trim the meals that are actually
    # returned, not just a temporary copy used for the printed total.
    if total_calories > bmr:
        meals = _trim_to_budget(meals, bmr)
        total_calories = sum(_energy_total(foods) for foods in meals)

    print(f"Total Calories from Meals: {total_calories:.2f} vs Adjusted BMR: {bmr:.2f}")

    breakfast_foods, lunch_foods, dinner_foods, fruit_snacks = meals
    return breakfast_foods, lunch_foods, dinner_foods, fruit_snacks

# User profile driving the recommendation
user_input = dict(
    sex='female',
    age=24,
    weight=72,               # in kg
    height=163,              # in cm
    activity_level='light',  # sedentary, light, moderate, active, very active
    goal='lose',             # 'lose', 'gain', or 'maintain'
)

# Daily calorie need before the goal adjustment
bmr = calculate_bmr(
    user_input['sex'],
    user_input['age'],
    user_input['weight'],
    user_input['height'],
    user_input['activity_level'],
)

# Apply the diet goal
bmr_adjusted = adjust_bmr_for_goal(bmr, user_input['goal'])

# Build the meal recommendation for the adjusted budget
breakfast, lunch, dinner, snacks = recommend_balanced_meals(bmr_adjusted, data)

# Report: user summary first, then one table per meal
print("=== Informasi Pengguna ===")
print(f"BMR (sebelum penyesuaian): {bmr:.2f} kalori")
print(f"Tujuan: {user_input['goal'].capitalize()} berat badan")
print(f"BMR yang disesuaikan (kebutuhan kalori per hari): {bmr_adjusted:.2f} kalori")
print("\nRekomendasi Makanan")

display_columns = ['NAMA BAHAN','ENERGI', 'PROTEIN', 'LEMAK', 'KH']
meal_sections = (
    ("\nBreakfast:", breakfast),
    ("\nLunch:", lunch),
    ("\nDinner:", dinner),
    ("\nSnacks (Buah-buahan):", snacks),
)
for heading, foods in meal_sections:
    print(heading)
    print(foods[display_columns])



Mean Squared Error (MSE): 0.04943127850606681
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error (MSE): 0.04943127850606681
Total Calories from Meals: 1420.00 vs Adjusted BMR: 1665.87
=== Informasi Pengguna ===
BMR (sebelum penyesuaian): 2082.34 kalori
Tujuan: Lose berat badan
BMR yang disesuaikan (kebutuhan kalori per hari): 1665.87 kalori

Rekomendasi Makanan

Breakfast:
                           NAMA BAHAN  ENERGI  PROTEIN  LEMAK    KH
854  Ikan kamera (kakap merah), segar    86.0     19.7    0.8   0.0
885                 Ikan telan, segar    95.0     15.8    0.7   6.5
369       Tempe kedelai murni, mentah   201.0     20.8    8.8  13.5
458             Daun ubi putih, segar    46.0      2.8    0.2   8.2
593                        Sayur garu   178.0      2.6   13.9  10.6
145             Hofa/Ubi hutan, segar   127.0      1.2    0.5  29.5

Lunch:
                      NAMA BAH