# Exploratory Data Analysis - Nutrition Classification

This notebook explores nutrition label data and prepares it for machine learning classification.


In [13]:
# Import libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import NutritionDataLoader, inspect_data
from src.data.preprocessing import NutritionPreprocessor, NUTRIENT_THRESHOLDS

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

%matplotlib inline


## 1. Load Data

Load nutrition data from Open Food Facts or the USDA database.

**Note**: Update the file path to point to your downloaded dataset.


In [14]:

# Load the Open Food Facts table through the project loader, pointing it at the
# processed-data directory.
loader = NutritionDataLoader(processed_dir='../data/processed')
open_foodfacts_df = loader.load_open_food_facts()

# Fail fast on an empty result. A recorded run of this notebook shows the loader
# reporting a Parquet error and the following cell receiving a (0, 0) frame,
# which then fails with an unrelated "Cannot describe a DataFrame without
# columns" error. Stopping here keeps the real cause next to the failure.
if open_foodfacts_df.empty:
    raise ValueError(
        "Open Food Facts load returned an empty DataFrame; "
        "check the loader messages above before continuing."
    )

print(f"Loaded {len(open_foodfacts_df)} products")


before the call
parquet_path: ../data/processed/openfoodfactsproducts.parquet
parquet_path.exists(): True
Loading Open Food Facts from Parquet...
Error loading Parquet file: A type extension with name pandas.period already defined


ArrowKeyError: A type extension with name pandas.period already defined

In [14]:
# Print the project's dataset overview for the loaded frame
inspect_data(open_foodfacts_df)

# Preview the leading rows (rendered as the cell's rich output)
open_foodfacts_df.head(n=5)



DATASET OVERVIEW

Shape: (0, 0)

Columns: []

Data types:
Series([], dtype: object)

Missing values:
Series([], dtype: float64)


ValueError: Cannot describe a DataFrame without columns

## 2. Data Quality and Visualization


In [None]:
# Missing values visualization
# Uses the frame loaded in section 1; no `df` variable is defined in this notebook.
# The mean of the boolean null mask is null_count / row_count, so multiplying
# by 100 gives the percentage of missing entries per column.
missing_pct = (open_foodfacts_df.isnull().mean() * 100).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
missing_pct.plot(kind='bar')
plt.title('Missing Values by Column (%)')
plt.ylabel('Percentage')
plt.xlabel('Column')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Distribution of key nutrients
nutrients = ['energy_100g', 'sugars_100g', 'fat_100g', 'proteins_100g',
             'fiber_100g', 'sodium_100g']

# One panel per nutrient on a 2x3 grid, flattened so it can be indexed by position.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, nutrient in enumerate(nutrients):
    if nutrient not in open_foodfacts_df.columns:
        # Hide the panel instead of leaving an empty, unlabeled frame in the grid.
        axes[idx].set_visible(False)
        continue
    open_foodfacts_df[nutrient].hist(bins=50, ax=axes[idx], edgecolor='black')
    axes[idx].set_title(f'Distribution of {nutrient}')
    axes[idx].set_xlabel(nutrient)
    axes[idx].set_ylabel('Frequency')

plt.tight_layout()
plt.show()


## 3. Data Preprocessing and Feature Engineering


In [None]:
# Initialize preprocessor
preprocessor = NutritionPreprocessor()

# Clean data
# Uses the frame loaded in section 1; no `df` variable is defined in this notebook.
# NOTE(review): drop_threshold=0.5 presumably drops columns that are more than
# half missing; confirm against NutritionPreprocessor.clean_data.
df_clean = preprocessor.clean_data(open_foodfacts_df, drop_threshold=0.5)

# Engineer features
df_features = preprocessor.engineer_features(df_clean)

print(f"\nData shape after cleaning: {df_clean.shape}")
print(f"Features after engineering: {df_features.shape[1]}")
print("\nNew features created:")
# Columns present after feature engineering but not after cleaning.
new_features = set(df_features.columns) - set(df_clean.columns)
# Sort before printing: set iteration order is not stable between kernel runs.
for feat in sorted(new_features):
    print(f"  - {feat}")


## 4. Create Classification Labels


In [None]:
# Multiclass labels: one class column per nutrient, each built from that
# nutrient's entry in NUTRIENT_THRESHOLDS (None is passed if the key is absent).
for label_column, nutrient_column in (
    ('sugar_class', 'sugars_100g'),
    ('fiber_class', 'fiber_100g'),
    ('protein_class', 'proteins_100g'),
):
    nutrient_thresholds = NUTRIENT_THRESHOLDS.get(nutrient_column)
    df_features[label_column] = preprocessor.create_nutrient_labels(
        df_features, nutrient_column, nutrient_thresholds
    )

# Binary healthy/unhealthy label, computed after the class columns are attached
df_features['healthy'] = preprocessor.create_binary_labels(df_features)

# Report how the rows are spread across the new labels
print("Label distributions:")
print(f"\nSugar classes (0=Low, 1=Medium, 2=High):")
sugar_class_counts = df_features['sugar_class'].value_counts().sort_index()
print(sugar_class_counts)
healthy_counts = df_features['healthy'].value_counts().to_dict()
print(f"\nHealthy/Unhealthy: {healthy_counts}")


## 5. Save Processed Data


In [None]:
# Write the labelled feature table to CSV (row index omitted) and report it
output_path = '../data/processed/nutrition_data_processed.csv'
df_features.to_csv(output_path, index=False)
print(
    f"Processed data saved to {output_path}",
    f"Shape: {df_features.shape}",
    sep="\n",
)
