# GIM Mess Food-Waste Forecasting — Decision Tree Baseline

Run this notebook in **Google Colab** to train a simple, explainable model and export plots + model artifacts.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib, json
# Let wide frames display up to 200 columns when previewed.
pd.set_option('display.max_columns', 200)

# Two candidate datasets: the first is preferred, the second is the fallback
# (it is also selected when neither file exists).
DATA_1 = Path('../data/mess_waste_GIM_500.csv')
DATA_2 = Path('../data/mess_waste_GIM_daily_exams.csv')
if DATA_1.exists():
    DATA = DATA_1
else:
    DATA = DATA_2
print('Using data:', DATA)


In [None]:
# Read the selected CSV with the 'date' column parsed as datetimes, then put
# the rows in chronological order with a fresh 0..n-1 index.
df = pd.read_csv(DATA, parse_dates=['date'])
df = df.sort_values('date').reset_index(drop=True)
df.head()

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df2 = df.copy()

# 0/1 indicator columns: new column name -> (source column, label that sets it to 1).
indicator_spec = {
    'is_exam':     ('event_type', 'Exam'),
    'is_holiday':  ('event_type', 'Holiday'),
    'is_fest':     ('event_type', 'Fest'),
    'sweet_milk':  ('sweet_type', 'Milk-based'),
    'sweet_fried': ('sweet_type', 'Fried'),
    'sweet_halwa': ('sweet_type', 'Halwa-type'),
}
for flag_name, (source_col, label) in indicator_spec.items():
    df2[flag_name] = df2[source_col].eq(label).astype(int)

# Lag feature: the waste value of the preceding row. Rows without a lag value
# (at least the very first one) are dropped and the index is rebuilt.
df2['yesterday_waste'] = df2['food_waste_kg'].shift(1)
df2 = df2.dropna(subset=['yesterday_waste']).reset_index(drop=True)

# Model inputs, in the column order the tree is trained on.
FEATURES = [
    'cooked_kg', 'temp_c', 'rain_mm', 'humidity', 'is_weekend',
    'is_exam', 'is_holiday', 'is_fest',
    'sweet_milk', 'sweet_fried', 'sweet_halwa',
    'yesterday_waste',
]
X = df2[FEATURES]
y = df2['food_waste_kg']
len(X), X.head(2)

In [None]:
# Chronological split by position (no shuffling): first 70% of rows for
# training, the next 15% for validation, the remainder for testing.
n = len(df2)
i_tr = int(0.70*n)
i_va = int(0.85*n)

train_rows = slice(None, i_tr)
val_rows = slice(i_tr, i_va)
test_rows = slice(i_va, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val,   y_val   = X.iloc[val_rows],   y.iloc[val_rows]
X_test,  y_test  = X.iloc[test_rows],  y.iloc[test_rows]

# Dates aligned with the test rows, kept for plotting.
dates_test = df2['date'].iloc[test_rows]
len(X_train), len(X_val), len(X_test)

In [None]:
def eval_block(y_true, y_hat, name):
    """Print and return MAE, RMSE and R² for a set of predictions.

    Args:
        y_true: Observed target values.
        y_hat: Predicted values aligned with ``y_true``.
        name: Label printed in front of the metrics line.

    Returns:
        dict with keys ``'MAE'``, ``'RMSE'`` and ``'R2'``, each a plain
        Python ``float``.
    """
    mae = mean_absolute_error(y_true, y_hat)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises TypeError. Taking the square root of the MSE gives the
    # same RMSE for a single target and works on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    print(f"{name}: MAE={mae:.2f} kg | RMSE={rmse:.2f} kg | R²={r2:.3f}")
    # Cast to built-in floats so the result can be dumped to JSON as-is.
    return {'MAE': float(mae), 'RMSE': float(rmse), 'R2': float(r2)}

# Small grid search: fit one tree per (max_depth, min_samples_leaf) pair on the
# training rows and record its MAE on the validation rows.
candidates = []
for depth in (3, 4, 5, 6, 7):
    for leaf_size in (1, 2, 3, 5):
        trial = DecisionTreeRegressor(
            max_depth=depth, min_samples_leaf=leaf_size, random_state=42
        )
        trial.fit(X_train, y_train)
        val_mae = mean_absolute_error(y_val, trial.predict(X_val))
        candidates.append((val_mae, depth, leaf_size))

# Smallest tuple wins: lowest validation MAE, ties broken by the smaller
# depth and then the smaller leaf size.
best_mae, best_md, best_msl = min(candidates)

# Refit the winning configuration on train + validation before the test run.
X_fit = pd.concat([X_train, X_val])
y_fit = pd.concat([y_train, y_val])
best_tree = DecisionTreeRegressor(
    max_depth=best_md, min_samples_leaf=best_msl, random_state=42
)
best_tree.fit(X_fit, y_fit)

test_pred = best_tree.predict(X_test)
metrics = eval_block(y_test, test_pred, f"Test (best tree md={best_md}, msl={best_msl})")
metrics

In [None]:
import matplotlib.pyplot as plt

# Output folder for the exported figures (created if missing).
fig_dir = Path('../reports/figures')
fig_dir.mkdir(parents=True, exist_ok=True)

# Figure 1: actual vs predicted waste over the test period.
fig_avp, ax_avp = plt.subplots(figsize=(12, 4))
ax_avp.plot(dates_test, y_test.values, label='Actual', linewidth=2)
ax_avp.plot(dates_test, test_pred, label='DecisionTree', alpha=0.9)
ax_avp.set_title('Daily Food Waste — Actual vs Predicted')
ax_avp.set_xlabel('Date')
ax_avp.set_ylabel('kg')
ax_avp.grid(True)
ax_avp.legend()
fig_avp.tight_layout()
fig_avp.savefig(fig_dir/'fig_avp.png', dpi=150)
plt.show()

# Figure 2: feature importances of the fitted tree, sorted ascending so the
# largest bar ends up at the top of the horizontal bar chart.
imp = pd.Series(best_tree.feature_importances_, index=FEATURES).sort_values(ascending=True)
fig_imp, ax_imp = plt.subplots(figsize=(7, 6))
imp.plot(kind='barh', ax=ax_imp)
ax_imp.set_title('Decision Tree — Feature Importance')
fig_imp.tight_layout()
fig_imp.savefig(fig_dir/'fig_importance.png', dpi=150)
plt.show()

In [None]:
import json

# Output folder for the model bundle and its metadata (created if missing).
art_dir = Path('../artifacts')
art_dir.mkdir(parents=True, exist_ok=True)

# Store the fitted tree together with the column order it was trained on, so
# a consumer can rebuild the feature row in the right order.
feature_columns = list(X.columns)
joblib.dump({'model': best_tree, 'columns': feature_columns}, art_dir/'gim_tree_model.joblib')

# Write the JSON files through context managers so each handle is flushed and
# closed deterministically instead of being left to garbage collection.
with open(art_dir/'metrics.json', 'w', encoding='utf-8') as fh:
    json.dump(metrics, fh, indent=2)
with open(art_dir/'feature_columns.json', 'w', encoding='utf-8') as fh:
    json.dump(feature_columns, fh, indent=2)
print('Saved artifacts to', art_dir)

In [None]:
def predict_tomorrow_tree(cooked_kg, event_type='None', sweet_type='Milk-based', temp_c=30.0, rain_mm=5.0, humidity=85, df_history=df2):
    """Predict food waste for the day after the last row of ``df_history``.

    The saved bundle is loaded from ``../artifacts/gim_tree_model.joblib`` on
    every call, and a single feature row is assembled in the column order
    stored in that bundle.

    Args:
        cooked_kg: Value for the ``cooked_kg`` feature.
        event_type: Compared against 'Exam', 'Holiday' and 'Fest' to set the
            event indicator columns; any other value leaves all three at 0.
        sweet_type: Compared against 'Milk-based', 'Fried' and 'Halwa-type'
            to set the sweet indicator columns.
        temp_c: Value for the ``temp_c`` feature.
        rain_mm: Value for the ``rain_mm`` feature.
        humidity: Value for the ``humidity`` feature.
        df_history: Frame whose last ``date`` and last ``food_waste_kg`` are
            used for ``is_weekend`` and ``yesterday_waste``. The default is
            the ``df2`` object that existed when this function was defined.

    Returns:
        The model's prediction for the assembled row, as a ``float``.
    """
    saved = joblib.load(Path('../artifacts')/'gim_tree_model.joblib')
    model = saved['model']
    cols = saved['columns']

    # The forecast day is the day after the most recent date in the history.
    tomorrow = df_history['date'].iloc[-1] + pd.Timedelta(days=1)

    features = {
        'cooked_kg': cooked_kg,
        'temp_c': temp_c,
        'rain_mm': rain_mm,
        'humidity': humidity,
        # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
        'is_weekend': int(tomorrow.dayofweek >= 5),
    }
    for column, label in (('is_exam', 'Exam'), ('is_holiday', 'Holiday'), ('is_fest', 'Fest')):
        features[column] = int(event_type == label)
    for column, label in (('sweet_milk', 'Milk-based'), ('sweet_fried', 'Fried'), ('sweet_halwa', 'Halwa-type')):
        features[column] = int(sweet_type == label)
    features['yesterday_waste'] = float(df_history['food_waste_kg'].iloc[-1])

    # Selecting by `cols` puts the row in the training column order.
    X_row = pd.DataFrame([features])[cols]
    prediction = model.predict(X_row)
    return float(prediction[0])

predict_tomorrow_tree(410, event_type='Exam', sweet_type='Fried', temp_c=29.5, rain_mm=8.0, humidity=86)