In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Import custom modules
from data_utils import (
    load_raw_data, clean_interactions, clean_recipes,
    filter_sparse_users_recipes, create_binary_target,
    get_data_summary
)

# Notebook-wide plot defaults: seaborn's white-grid theme and a (10, 6)
# default figure size for every matplotlib figure created below.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Only reached when every import above succeeded.
print("Libraries imported successfully!")

## 1. Load Raw Data

In [None]:
# Read the two raw CSV exports through the project helper, which returns
# (recipes, interactions) in that order.
raw_data_paths = {
    "recipes_path": "../datasets/RAW_recipes.csv",
    "interactions_path": "../datasets/RAW_interactions.csv",
}
recipes_raw, interactions_raw = load_raw_data(**raw_data_paths)

# Report how many rows each table contains.
for table_label, raw_table in (("recipes", recipes_raw), ("interactions", interactions_raw)):
    print(f"Loaded {len(raw_table)} {table_label}")

## 2. Basic Data Inspection

In [None]:
# Inspect recipes: sample rows, schema with non-null counts, summary statistics
print("=" * 50)
print("RECIPES DATA")
print("=" * 50)
print("\nFirst few rows:")
display(recipes_raw.head())

print("\nData types and missing values:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only append a stray "None" to the cell output.
recipes_raw.info()

print("\nBasic statistics:")
display(recipes_raw.describe())

In [None]:
# Inspect interactions: sample rows, schema with non-null counts, summary statistics
print("=" * 50)
print("INTERACTIONS DATA")
print("=" * 50)
print("\nFirst few rows:")
display(interactions_raw.head())

print("\nData types and missing values:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only append a stray "None" to the cell output.
interactions_raw.info()

print("\nBasic statistics:")
display(interactions_raw.describe())

In [None]:
# Cardinality of the key columns, followed by the raw rating histogram.
# Comparing the two recipe counts shows how many recipes have interactions.
unique_counts = {
    "Unique recipes in recipes table": recipes_raw['id'].nunique(),
    "Unique recipes in interactions": interactions_raw['recipe_id'].nunique(),
    "Unique users": interactions_raw['user_id'].nunique(),
    "Unique ratings": interactions_raw['rating'].nunique(),
}

print("Unique counts:")
for count_label, n_unique in unique_counts.items():
    print(f"  {count_label}: {n_unique}")

print("\nRating distribution:")
raw_rating_counts = interactions_raw['rating'].value_counts().sort_index()
print(raw_rating_counts)

## 3. Data Cleaning

In [None]:
# Run the interaction cleaner with zero-rating removal enabled, then report
# how many rows the step discarded.
interactions_clean = clean_interactions(interactions_raw, drop_zero_ratings=True)

n_interactions_before = len(interactions_raw)
n_interactions_after = len(interactions_clean)
n_interactions_dropped = n_interactions_before - n_interactions_after
pct_interactions_dropped = 100 * (1 - n_interactions_after / n_interactions_before)

print(f"Interactions before cleaning: {n_interactions_before}")
print(f"Interactions after cleaning: {n_interactions_after}")
print(f"Dropped {n_interactions_dropped} rows ({pct_interactions_dropped:.1f}%)")

print("\nRating distribution after cleaning:")
clean_rating_counts = interactions_clean['rating'].value_counts().sort_index()
print(clean_rating_counts)

In [None]:
# Clean recipes and extract features
recipes_clean = clean_recipes(recipes_raw)

print(f"Recipes before cleaning: {len(recipes_raw)}")
print(f"Recipes after cleaning: {len(recipes_clean)}")
print("\nNew columns added:")
# List the added columns in DataFrame order. A set difference would print them
# in arbitrary order that can change between kernel sessions, which makes the
# cell output non-reproducible.
new_cols = [col for col in recipes_clean.columns if col not in recipes_raw.columns]
print(new_cols)

In [None]:
# Count and share of recipes on each side of the is_healthy flag
# (assumed to be a 0/1 or boolean column added by clean_recipes — TODO confirm).
print("Healthiness distribution:")
healthy_flag = recipes_clean['is_healthy']
healthy_count = healthy_flag.sum()
healthy_pct = 100 * healthy_flag.mean()
print(f"  Healthy recipes: {healthy_count} ({healthy_pct:.1f}%)")
unhealthy_count = (1 - healthy_flag).sum()
unhealthy_pct = 100 * (1 - healthy_flag.mean())
print(f"  Unhealthy recipes: {unhealthy_count} ({unhealthy_pct:.1f}%)")

# Summary statistics for the nutrition columns and the health score.
print("\nNutrition statistics:")
nutrition_stat_cols = ['calories', 'sugar_pdv', 'saturated_fat_pdv', 'protein_pdv', 'health_score']
display(recipes_clean[nutrition_stat_cols].describe())

## 4. Data Summary

In [None]:
# Build the summary mapping with the project helper and print it entry by
# entry; the nested rating distribution gets its own indented listing.
summary = get_data_summary(recipes_clean, interactions_clean)

print("Dataset Summary:")
print("=" * 50)
for summary_key, summary_value in summary.items():
    if summary_key == 'rating_distribution':
        print(f"\n{summary_key}:")
        for rating_value, rating_count in summary_value.items():
            print(f"  Rating {rating_value}: {rating_count}")
        continue
    print(f"{summary_key}: {summary_value}")

## 5. Exploratory Visualizations

In [None]:
# Bar chart of the number of cleaned interactions per rating value,
# saved to the reports folder and shown inline.
cleaned_ratings = interactions_clean['rating']
rating_counts = cleaned_ratings.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
rating_counts.plot.bar(ax=ax, color='steelblue')
ax.set_title('Distribution of Ratings', fontsize=14, fontweight='bold')
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.grid(alpha=0.3, axis='y')
# Keep the rating tick labels horizontal.
plt.xticks(rotation=0)
fig.tight_layout()
plt.savefig('../reports/figures/rating_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Central tendency of the cleaned ratings.
print(f"Mean rating: {cleaned_ratings.mean():.2f}")
print(f"Median rating: {cleaned_ratings.median():.0f}")

In [None]:
# Side-by-side calorie histograms: every recipe on the left, and a zoom on
# values strictly between 0 and 1000 on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

calories = recipes_clean['calories']
# The comparisons are False for missing values, so NaNs are excluded too.
calories_filtered = calories[(calories > 0) & (calories < 1000)]

histogram_panels = (
    (calories.dropna(), 'Distribution of Calories (All Recipes)'),
    (calories_filtered, 'Distribution of Calories (0-1000 range)'),
)
for panel_ax, (panel_values, panel_title) in zip(axes, histogram_panels):
    panel_ax.hist(panel_values, bins=50, color='coral', edgecolor='black', alpha=0.7)
    panel_ax.set_xlabel('Calories', fontsize=12)
    panel_ax.set_ylabel('Frequency', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../reports/figures/calories_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

calories_mean = calories.mean()
calories_median = calories.median()
print(f"Calories - Mean: {calories_mean:.1f}, Median: {calories_median:.1f}")

In [None]:
# Attach recipe attributes to every interaction. The inner join keeps only
# interactions whose recipe_id matches an id in the cleaned recipes table.
df_joined = pd.merge(
    interactions_clean,
    recipes_clean,
    how='inner',
    left_on='recipe_id',
    right_on='id',
)
n_joined_rows = len(df_joined)
print(f"Joined dataset size: {n_joined_rows} rows")

In [None]:
# Healthy vs Unhealthy Ratings: box plot of ratings for each health category
fig, ax = plt.subplots(figsize=(10, 6))

# Work on a copy so adding the label column does not touch df_joined.
df_plot = df_joined[['is_healthy', 'rating']].copy()
df_plot['health_label'] = df_plot['is_healthy'].map({0: 'Unhealthy', 1: 'Healthy'})

# Pin the category order so the positional palette always pairs salmon with
# 'Unhealthy' and lightgreen with 'Healthy'. Without `order`, seaborn takes the
# order in which the labels first appear in the data, which can swap the colors.
sns.boxplot(data=df_plot, x='health_label', y='rating',
            order=['Unhealthy', 'Healthy'],
            palette=['salmon', 'lightgreen'], ax=ax)
ax.set_xlabel('Recipe Category', fontsize=12)
ax.set_ylabel('Rating', fontsize=12)
ax.set_title('Rating Distribution: Healthy vs Unhealthy Recipes', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../reports/figures/healthy_vs_unhealthy_ratings.png', dpi=300, bbox_inches='tight')
plt.show()

# Statistics: mean, median and number of ratings per health category
print("\nMean ratings by health category:")
print(df_joined.groupby('is_healthy')['rating'].agg(['mean', 'median', 'count']))

In [None]:
# Calories vs Rating scatter plot, colored by the is_healthy flag
fig, ax = plt.subplots(figsize=(12, 6))

# Sample for visualization (too many points otherwise).
# The cap must use the length of the *filtered* frame: capping by
# len(df_joined) lets n exceed the rows that pass the calorie filter, and
# DataFrame.sample then raises ValueError (sampling without replacement).
df_under_1500_cal = df_joined[df_joined['calories'] < 1500]
df_sample = df_under_1500_cal.sample(n=min(10000, len(df_under_1500_cal)), random_state=42)

scatter = ax.scatter(df_sample['calories'], df_sample['rating'],
                    c=df_sample['is_healthy'], cmap='RdYlGn',
                    alpha=0.3, s=10)
ax.set_xlabel('Calories', fontsize=12)
ax.set_ylabel('Rating', fontsize=12)
ax.set_title('Calories vs Rating (colored by health status)', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Healthy', fontsize=11)

plt.tight_layout()
plt.savefig('../reports/figures/rating_vs_calories_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the nutrition
# columns of the cleaned recipes table.
nutrition_cols = [
    'calories', 'total_fat_pdv', 'sugar_pdv', 'sodium_pdv',
    'protein_pdv', 'saturated_fat_pdv', 'carbs_pdv',
]

fig, ax = plt.subplots(figsize=(10, 8))
corr_matrix = recipes_clean[nutrition_cols].corr()

# center=0 puts the middle of the diverging colormap at zero correlation.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'label': 'Correlation'},
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix: Nutrition Features', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/figures/nutrition_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Rating by cooking time categories
fig, ax = plt.subplots(figsize=(12, 6))

# Categorize minutes; recipes of 300 minutes or more are left out of this plot.
df_time = df_joined[df_joined['minutes'] < 300].copy()
# include_lowest=True closes the first bin on the left ([0, 30]). With pd.cut's
# default right-closed bins a 0-minute recipe falls outside (0, 30], receives a
# NaN category and silently disappears from the box plot.
df_time['time_category'] = pd.cut(df_time['minutes'],
                                   bins=[0, 30, 60, 120, 300],
                                   include_lowest=True,
                                   labels=['<30 min', '30-60 min', '60-120 min', '>120 min'])

sns.boxplot(data=df_time, x='time_category', y='rating', palette='Set2', ax=ax)
ax.set_xlabel('Cooking Time', fontsize=12)
ax.set_ylabel('Rating', fontsize=12)
ax.set_title('Rating Distribution by Cooking Time', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../reports/figures/rating_by_cooking_time.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Save Cleaned Data

In [None]:
# Persist both cleaned tables as CSV (without the index column) and then
# report the row count written to each file.
cleaned_outputs = {
    '../datasets/recipes_clean.csv': recipes_clean,
    '../datasets/interactions_clean.csv': interactions_clean,
}
for output_path, cleaned_table in cleaned_outputs.items():
    cleaned_table.to_csv(output_path, index=False)

print("Cleaned data saved!")
for output_path, cleaned_table in cleaned_outputs.items():
    # Show just the file name, i.e. the part after the last '/'.
    output_name = output_path.rsplit('/', 1)[-1]
    print(f"  - {output_name}: {len(cleaned_table)} rows")

## 7. Key Findings Summary

**Data Overview:**
- Recipe and interaction totals are reported in the Data Summary (Section 4)
- Rating distribution is heavily skewed toward high ratings (4-5 stars)
- Interactions with a rating of 0 were removed during cleaning (Section 3)

**Healthiness Analysis:**
- Only a small fraction of recipes meet our "healthy" criteria
- Healthy recipes appear to have slightly different rating distributions
- Calories and the other nutrition features are correlated with one another; see the correlation heatmap in Section 5

**Next Steps:**
- Build predictive models for rating prediction
- Develop recommender system
- Analyze health bias in recommendations