In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import re
import joblib
import warnings
warnings.filterwarnings('ignore')

# ----- 1. DATA LOADING -----
# Read the raw food table; every later stage derives from this frame.
food_df = pd.read_csv('seasonal_food_database.csv')
print(f"Loaded food dataset with {food_df.shape[0]} items and {food_df.shape[1]} columns")

# ----- 2. FEATURE ENGINEERING -----
def process_features(df):
    """Process and encode food features for machine learning.

    Builds a model-ready frame from the raw food table:

    * one-hot encodes ``cuisine_type``, ``region``, ``country``,
      ``diet_type`` and ``meal_type``;
    * standard-scales the nine numeric nutrition columns;
    * adds one ``contains_<allergen>`` 0/1 column per distinct allergen found
      in the comma-separated ``allergens`` column (if that column exists and
      holds text);
    * carries over ``food_id``, ``food_name`` and the meal/season flags.

    The encoder and scaler are fitted on ``df`` itself and written to
    ``food_encoder.pkl`` / ``food_scaler.pkl`` in the working directory.

    Args:
        df: Raw food DataFrame. It is not modified.

    Returns:
        A new DataFrame, indexed like ``df``, with identifier columns followed
        by encoded, scaled, binary and allergen-flag columns.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    # Work on a copy so the caller's frame is never modified
    processed_df = df.copy()

    # Handle categorical features with one-hot encoding
    categorical_cols = ['cuisine_type', 'region', 'country', 'diet_type', 'meal_type']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_cats = encoder.fit_transform(processed_df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=processed_df.index
    )

    # Scale numerical features
    numerical_cols = ['serving_size_g', 'calories', 'protein_g', 'fat_g', 'carbs_g',
                     'fiber_g', 'sugar_g', 'sodium_mg', 'cholesterol_mg']
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(processed_df[numerical_cols])
    scaled_df = pd.DataFrame(
        scaled_features,
        columns=numerical_cols,
        index=processed_df.index
    )

    # Process allergens: only when the column exists and holds text
    # (object dtype or one of pandas' dedicated string dtypes).
    if 'allergens' in processed_df.columns and (
        pd.api.types.is_object_dtype(processed_df['allergens'])
        or pd.api.types.is_string_dtype(processed_df['allergens'])
    ):
        def _allergen_tokens(value):
            # Missing / non-text cells contribute no allergens
            if not isinstance(value, str):
                return frozenset()
            parts = (part.strip().lower() for part in value.split(','))
            return frozenset(part for part in parts if part)  # skip empty strings

        # Parse each row once into a set of normalised allergen names
        row_tokens = processed_df['allergens'].map(_allergen_tokens)

        # Sorted so the flag columns come out in the same order on every run;
        # iterating a raw set of strings is not stable across interpreter
        # sessions, which would reorder the feature matrix between runs.
        all_allergens = sorted(set().union(*row_tokens))

        # Create binary flags for each allergen. Membership is tested on whole
        # tokens so that e.g. "nuts" is not flagged for a "peanuts" entry.
        for allergen in all_allergens:
            processed_df[f'contains_{allergen}'] = row_tokens.map(
                lambda tokens, name=allergen: name in tokens).astype(int)

    # Combine all processed features
    binary_cols = ['suitable_breakfast', 'suitable_lunch', 'suitable_dinner',
                  'suitable_snack', 'spring', 'summer', 'fall', 'winter']

    allergen_cols = [col for col in processed_df.columns if col.startswith('contains_')]

    result_df = pd.concat([
        processed_df[['food_id', 'food_name']],  # Keep identifiers
        encoded_df,                              # Encoded categoricals
        scaled_df,                               # Scaled numericals
        processed_df[binary_cols],               # Binary features
        processed_df[allergen_cols]              # Allergen flags
    ], axis=1)

    # Save encoders and scalers for later use
    joblib.dump(encoder, 'food_encoder.pkl')
    joblib.dump(scaler, 'food_scaler.pkl')

    return result_df

# Turn the raw table into the encoded/scaled feature frame
processed_food_df = process_features(food_df)
print(f"Processed food data: {len(processed_food_df.columns)} features created")

# ----- 3. MACHINE LEARNING MODEL 1: FOOD CLUSTERING -----
def cluster_foods(df, n_clusters=8):
    """Cluster foods by nutritional profile for similarity-based recommendations.

    Runs K-Means for every k from 2 up to 14 (or ``n_samples - 1`` when the
    data set is smaller), prints the silhouette score of each, keeps the model
    with the highest score and saves it to ``food_clusters.pkl``.

    Args:
        df: Processed food DataFrame. Columns whose names start with
            ``calories``, ``protein_``, ``fat_``, ``carbs_``, ``fiber_`` or
            ``sugar_`` are used as clustering features.
        n_clusters: Accepted for backward compatibility only. The number of
            clusters is always chosen by the silhouette search, so this value
            is not used.

    Returns:
        A copy of ``df`` with an added integer ``cluster`` column.

    Raises:
        ValueError: If ``df`` has fewer than 3 rows, since a silhouette score
            needs at least 2 clusters and at most ``n_samples - 1``.
    """
    # Local import: only this function needs the silhouette metric
    from sklearn.metrics import silhouette_score

    # Select features for clustering
    nutrient_prefixes = ('calories', 'protein_', 'fat_', 'carbs_', 'fiber_', 'sugar_')
    cluster_features = df[[col for col in df.columns if col.startswith(nutrient_prefixes)]].copy()

    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so cap the
    # search range instead of failing part-way through on small data sets.
    max_k = min(14, len(cluster_features) - 1)
    if max_k < 2:
        raise ValueError(
            f"Need at least 3 foods to choose a cluster count, got {len(cluster_features)}")

    # Find optimal number of clusters using silhouette score
    best_score = None
    best_model = None
    best_labels = None

    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(cluster_features)
        score = silhouette_score(cluster_features, labels)
        print(f"K={k}, Silhouette Score: {score:.4f}")

        # Strict '>' keeps the smallest k on ties. The fitted model is kept so
        # the winner does not have to be refitted (the seed is fixed, so a
        # refit would reproduce the same model anyway).
        if best_model is None or score > best_score:
            best_score, best_model, best_labels = score, kmeans, labels

    optimal_k = best_model.n_clusters
    print(f"Optimal number of clusters: {optimal_k}")

    # Add cluster labels to a copy of the input dataframe
    result = df.copy()
    result['cluster'] = best_labels

    # Save the clustering model
    joblib.dump(best_model, 'food_clusters.pkl')

    return result

# Apply clustering to our processed food data; the result is a copy of
# processed_food_df with an extra 'cluster' label column.
clustered_food_df = cluster_foods(processed_food_df)
print(f"Foods clustered into {clustered_food_df['cluster'].nunique()} groups")

# ----- 4. MACHINE LEARNING MODEL 2: MEAL TYPE PREDICTION WITH ACCURACY METRICS -----
def train_meal_type_predictor(df):
    """Fit and evaluate one random-forest suitability classifier per meal type.

    For each of breakfast, lunch, dinner and snack, the ``suitable_<meal>``
    column is the target. Every column whose name does not start with
    ``food_``, ``suitable_`` or ``cluster`` is used as a feature. Each model is
    trained on a stratified 80% split, scored on the held-out 20% and with
    5-fold stratified cross-validation, and its metrics are printed. The
    fitted model is written to ``<meal>_predictor.pkl``.

    Args:
        df: Feature frame containing the ``suitable_*`` target columns.

    Returns:
        Dict mapping meal type name to its fitted RandomForestClassifier.
    """
    excluded_prefixes = ('food_', 'suitable_', 'cluster')
    feature_cols = []
    for name in df.columns:
        if name.startswith(excluded_prefixes):
            continue
        feature_cols.append(name)

    # Same feature matrix for every meal type; only the target changes
    X = df[feature_cols]

    # Forest hyperparameters shared by all four classifiers
    forest_params = {
        'n_estimators': 200,
        'max_depth': None,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'max_features': 'sqrt',
        'bootstrap': True,
        'class_weight': 'balanced',
        'random_state': 42,
    }

    trained = {}
    for meal_type in ('breakfast', 'lunch', 'dinner', 'snack'):
        print(f"\n--- {meal_type.upper()} SUITABILITY CLASSIFIER ---")

        y = df[f'suitable_{meal_type}']

        # Stratified hold-out split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)

        model = RandomForestClassifier(**forest_params)
        model.fit(X_train, y_train)

        # Hold-out accuracy
        predictions = model.predict(X_test)
        print(f"Test Accuracy: {accuracy_score(y_test, predictions):.4f}")

        # Cross-validated accuracy on the full data
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X, y, cv=folds, scoring='accuracy')
        print(f"Cross-validation accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

        # Confusion matrix and per-class report on the hold-out set
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, predictions))
        print("\nClassification Report:")
        print(classification_report(y_test, predictions))

        # Rank features by importance
        importance_table = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        })
        importance_table = importance_table.sort_values('importance', ascending=False)
        print("\nTop 10 Important Features:")
        print(importance_table.head(10))

        # Keep the model in memory and persist it to disk
        trained[meal_type] = model
        joblib.dump(model, f'{meal_type}_predictor.pkl')

    return trained

# Train one suitability classifier per meal type (breakfast, lunch, dinner,
# snack); the call prints the evaluation metrics for each model and returns
# a dict of fitted classifiers keyed by meal type.
meal_predictors = train_meal_type_predictor(clustered_food_df)

# ----- 5. RECOMMENDATION SYSTEM: CREATE SIMILARITY MATRIX -----
def create_similarity_matrix(df):
    """Create food similarity matrix based on nutritional and categorical features.

    Computes the pairwise cosine similarity between all rows of ``df`` after
    dropping the ``food_id``, ``food_name`` and ``cluster`` columns (when
    present), and prints summary statistics of the off-diagonal entries.

    Args:
        df: Numeric feature frame, one row per food.

    Returns:
        Square array of shape ``(len(df), len(df))`` where entry ``[i, j]`` is
        the cosine similarity between row ``i`` and row ``j``.
    """
    # Select features for similarity calculation
    similarity_features = df.drop(['food_id', 'food_name', 'cluster'], axis=1, errors='ignore')

    # Calculate similarity
    similarity = cosine_similarity(similarity_features)

    # Drop self-similarities by position. A value filter such as `< 1.0` is
    # unreliable: rounding can leave the diagonal marginally off 1.0, and it
    # would also discard pairs of distinct foods that really are identical.
    off_diagonal = similarity[~np.eye(similarity.shape[0], dtype=bool)]

    print(f"Similarity Matrix Statistics:")
    print(f"Shape: {similarity.shape}")
    if off_diagonal.size:
        print(f"Mean similarity: {off_diagonal.mean():.4f}")
        print(f"Median similarity: {np.median(off_diagonal):.4f}")
        print(f"Min similarity: {off_diagonal.min():.4f}")
        print(f"Max similarity: {off_diagonal.max():.4f}")
    else:
        # Fewer than two foods: there are no pairs to summarise
        print("Not enough foods to compute pairwise similarity statistics")

    return similarity

# Create the pairwise cosine-similarity matrix over all foods (identifier and
# cluster columns are excluded from the feature vectors).
similarity_matrix = create_similarity_matrix(clustered_food_df)

# ----- 6. CLASS FOR FILTERING AND RECOMMENDATION -----
class FoodRecommendationSystem:
    """Constraint filtering and similarity-based food recommendations.

    The original food table, the cluster-labelled table and the similarity
    matrix are treated as row-aligned: row ``i`` of ``food_df``, row ``i`` of
    ``clustered_df`` and row/column ``i`` of ``similarity_matrix`` are taken to
    describe the same food. All internal lookups use 0-based row positions, so
    the frames may carry any index labels.
    """

    # Meal types for which a ``suitable_<meal>`` column is compared
    MEAL_TYPES = ('breakfast', 'lunch', 'dinner', 'snack')

    def __init__(self, food_df, processed_df, clustered_df, similarity_matrix, meal_predictors):
        """Initialize the food recommendation system with all required components"""
        self.food_df = food_df  # Original food data
        self.processed_df = processed_df  # Processed features
        self.clustered_df = clustered_df  # With cluster assignments
        self.similarity_matrix = similarity_matrix  # Food similarity
        self.meal_predictors = meal_predictors  # Meal type classifiers

    @staticmethod
    def _first_position(df, food_id):
        """Return the 0-based row position of the first row with ``food_id``, or None."""
        matches = np.flatnonzero((df['food_id'] == food_id).to_numpy())
        return int(matches[0]) if matches.size else None

    def filter_foods_by_constraints(self, diet_type, meal_type, season, allergens=None):
        """Filter foods based on user constraints.

        Args:
            diet_type: Exact ``diet_type`` value to keep; falsy to skip.
            meal_type: Meal name (any case); keeps rows whose
                ``suitable_<meal_type>`` column equals 1. Falsy to skip.
            season: Season column name (any case); keeps rows where that
                column equals 1. Falsy to skip.
            allergens: Allergen name or iterable of names. Rows whose
                ``allergens`` text contains any of them (case-insensitive
                substring match) are removed. Falsy to skip.

        Returns:
            A filtered copy of the original food DataFrame.
        """
        # Start with all foods
        filtered_df = self.food_df.copy()

        # Filter by diet type
        if diet_type:
            filtered_df = filtered_df[filtered_df['diet_type'] == diet_type]

        # Filter by meal type
        if meal_type:
            column = f'suitable_{meal_type.lower()}'
            filtered_df = filtered_df[filtered_df[column] == 1]

        # Filter by season (normalised to lower case, like meal_type)
        if season:
            filtered_df = filtered_df[filtered_df[season.lower()] == 1]

        # Filter by allergens
        if allergens:
            # A bare string is one allergen, not a sequence of characters
            if isinstance(allergens, str):
                allergens = [allergens]

            # Only text columns can be searched. Accept pandas' dedicated
            # string dtypes as well as object so the filter is never skipped
            # silently for text data.
            allergen_col = filtered_df['allergens']
            is_text = (pd.api.types.is_object_dtype(allergen_col)
                       or pd.api.types.is_string_dtype(allergen_col))
            if is_text:
                for allergen in allergens:
                    has_allergen = filtered_df['allergens'].str.contains(
                        re.escape(allergen), case=False, na=False)
                    filtered_df = filtered_df[~has_allergen]

        return filtered_df

    def get_similar_foods(self, food_id, top_n=5):
        """Find similar foods based on similarity matrix.

        Args:
            food_id: Identifier of the query food.
            top_n: Maximum number of similar foods to return.

        Returns:
            List of dicts (``food_id``, ``food_name``, ``similarity``,
            ``calories``, ``protein_g``, ``diet_type``) ordered from most to
            least similar. The query food itself is never included. Empty if
            ``food_id`` is unknown.
        """
        # Row position (not index label) of the query food
        query_pos = self._first_position(self.clustered_df, food_id)
        if query_pos is None:
            return []

        scores = np.asarray(self.similarity_matrix[query_pos])

        # Stable descending sort keeps ties in row order. The query row is
        # removed explicitly rather than assumed to be first: another food
        # with an identical feature vector also scores 1.0 and may precede it.
        ranked = np.argsort(-scores, kind='stable')
        ranked = ranked[ranked != query_pos][:max(top_n, 0)]

        # Get food details
        similar_foods = []
        for pos in ranked:
            food = self.food_df.iloc[int(pos)]
            similar_foods.append({
                'food_id': food['food_id'],
                'food_name': food['food_name'],
                'similarity': scores[pos],
                'calories': food['calories'],
                'protein_g': food['protein_g'],
                'diet_type': food['diet_type']
            })

        return similar_foods

    def evaluate_recommendations(self, num_test_foods=10):
        """Evaluate the quality of recommendations.

        Samples foods at random (without replacement, via ``np.random``),
        fetches the top 5 similar foods for each and measures the fraction
        that share its cluster, its diet type and its full set of meal
        suitability flags. Rates are averaged over the foods that actually
        produced recommendations and printed.

        Args:
            num_test_foods: Number of foods to sample; capped at the number of
                foods available.

        Returns:
            Dict with ``cluster_hit_rate``, ``diet_hit_rate`` and
            ``meal_hit_rate`` (each 0.0 when nothing could be evaluated).
        """
        # Never ask for more distinct samples than there are foods
        sample_size = max(0, min(num_test_foods, len(self.food_df)))
        test_positions = np.random.choice(len(self.food_df), size=sample_size, replace=False)

        cluster_hit_rate = 0.0
        diet_hit_rate = 0.0
        meal_hit_rate = 0.0
        evaluated = 0

        for pos in test_positions:
            test_food = self.food_df.iloc[int(pos)]
            food_id = test_food['food_id']

            # Get recommendations for this food
            similar_foods = self.get_similar_foods(food_id, top_n=5)
            if not similar_foods:
                continue

            # Non-empty recommendations imply the id exists in clustered_df
            test_cluster = self.clustered_df['cluster'].iloc[
                self._first_position(self.clustered_df, food_id)]

            cluster_matches = 0
            diet_matches = 0
            meal_matches = 0

            for food in similar_foods:
                rec_pos = self._first_position(self.food_df, food['food_id'])
                rec_food = self.food_df.iloc[rec_pos]
                rec_cluster = self.clustered_df['cluster'].iloc[rec_pos]

                # Cluster match
                if rec_cluster == test_cluster:
                    cluster_matches += 1

                # Diet type match
                if rec_food['diet_type'] == test_food['diet_type']:
                    diet_matches += 1

                # Suitable for exactly the same meals
                if all(rec_food[f'suitable_{meal}'] == test_food[f'suitable_{meal}']
                       for meal in self.MEAL_TYPES):
                    meal_matches += 1

            cluster_hit_rate += cluster_matches / len(similar_foods)
            diet_hit_rate += diet_matches / len(similar_foods)
            meal_hit_rate += meal_matches / len(similar_foods)
            evaluated += 1

        # Average over foods that were actually scored, so skipped foods do
        # not drag the rates down and an empty sample cannot divide by zero.
        if evaluated:
            cluster_hit_rate /= evaluated
            diet_hit_rate /= evaluated
            meal_hit_rate /= evaluated

        print(f"\n--- RECOMMENDATION EVALUATION ---")
        print(f"Cluster Match Rate: {cluster_hit_rate:.4f}")
        print(f"Diet Type Match Rate: {diet_hit_rate:.4f}")
        print(f"Meal Suitability Match Rate: {meal_hit_rate:.4f}")

        return {
            'cluster_hit_rate': cluster_hit_rate,
            'diet_hit_rate': diet_hit_rate,
            'meal_hit_rate': meal_hit_rate
        }

# ----- 7. INITIALIZE SYSTEM -----
def main():
    """Assemble the recommender from the module-level artifacts and evaluate it."""
    # Everything the recommender needs was built earlier at module level
    components = {
        'food_df': food_df,
        'processed_df': processed_food_df,
        'clustered_df': clustered_food_df,
        'similarity_matrix': similarity_matrix,
        'meal_predictors': meal_predictors,
    }
    system = FoodRecommendationSystem(**components)

    # Score recommendation quality on 20 randomly chosen foods
    system.evaluate_recommendations(num_test_foods=20)

# Run the evaluation only when this module is executed directly, not when it
# is imported from elsewhere.
if __name__ == "__main__":
    main()

Loaded food dataset with 1042 items and 25 columns
Processed food data: 89 features created
K=2, Silhouette Score: 0.2265
K=3, Silhouette Score: 0.2825
K=4, Silhouette Score: 0.2467
K=5, Silhouette Score: 0.2251
K=6, Silhouette Score: 0.1946
K=7, Silhouette Score: 0.1894
K=8, Silhouette Score: 0.1912
K=9, Silhouette Score: 0.1853
K=10, Silhouette Score: 0.1749
K=11, Silhouette Score: 0.1697
K=12, Silhouette Score: 0.1774
K=13, Silhouette Score: 0.1800
K=14, Silhouette Score: 0.1808
Optimal number of clusters: 3
Foods clustered into 3 groups

--- BREAKFAST SUITABILITY CLASSIFIER ---
Test Accuracy: 0.7081
Cross-validation accuracy: 0.7140 (±0.0120)
Confusion Matrix:
[[119   7]
 [ 54  29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.94      0.80       126
           1       0.81      0.35      0.49        83

    accuracy                           0.71       209
   macro avg       0.75      0.65      0.64       209
weighted