In [1]:
# -*- coding: utf-8 -*-
"""Fashion_Recommendation_System.ipynb

Automatically generated by Colab.

# üõçÔ∏è **Complete Fashion Recommendation & Outfit Building System**

**System Features:**
1. Personalized recommendations based on purchase history
2. Size/fit recommendations based on body measurements
3. Outfit building and compatibility scoring
4. Cold-start recommendations for new users
5. Style-based filtering
---
""" 

# @title ‚öôÔ∏è **Step 0: Install & Import Required Libraries**

!pip install pandas numpy scikit-learn tensorflow openpyxl sqlalchemy pymysql python-dotenv -q

import os
import json
import pandas as pd
import numpy as np
import pickle
import warnings
import builtins
from pathlib import Path
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# TensorFlow for embeddings
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

ARTIFACTS_DIR = Path.cwd() / "artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

def artifact_path(filename: str) -> Path:
    return ARTIFACTS_DIR / filename

def find_project_file(filename: str, start: Path | None = None) -> Path | None:
    current = (start or Path.cwd()).resolve()
    for parent in [current, *current.parents]:
        candidate = parent / filename
        if candidate.exists():
            return candidate
    return None

def _resolve_artifact_path(path):
    if isinstance(path, (str, Path)):
        path_str = str(path)
        if path_str.startswith("/content/"):
            return artifact_path(path_str.replace("/content/", "", 1))
    return path

if not hasattr(builtins, "_artifact_original_open"):
    builtins._artifact_original_open = builtins.open
    def _artifact_open(file, *args, **kwargs):
        resolved = _resolve_artifact_path(file)
        return builtins._artifact_original_open(resolved, *args, **kwargs)
    builtins.open = _artifact_open

if not hasattr(builtins, "_artifact_original_print"):
    builtins._artifact_original_print = builtins.print
    def _artifact_print(*args, **kwargs):
        prefix = f"{ARTIFACTS_DIR.resolve()}/"
        updated_args = []
        for arg in args:
            if isinstance(arg, str):
                updated_args.append(arg.replace("/content/", prefix))
            else:
                updated_args.append(arg)
        builtins._artifact_original_print(*updated_args, **kwargs)
    builtins.print = _artifact_print

if not hasattr(pd, "_artifact_original_read_pickle"):
    pd._artifact_original_read_pickle = pd.read_pickle
    def _artifact_read_pickle(path, *args, **kwargs):
        resolved = _resolve_artifact_path(path)
        return pd._artifact_original_read_pickle(resolved, *args, **kwargs)
    pd.read_pickle = _artifact_read_pickle

if not hasattr(pd.DataFrame, "_artifact_original_to_pickle"):
    pd.DataFrame._artifact_original_to_pickle = pd.DataFrame.to_pickle
    def _artifact_to_pickle(self, path, *args, **kwargs):
        resolved = _resolve_artifact_path(path)
        return pd.DataFrame._artifact_original_to_pickle(self, resolved, *args, **kwargs)
    pd.DataFrame.to_pickle = _artifact_to_pickle

if not hasattr(np, "_artifact_original_save"):
    np._artifact_original_save = np.save
    def _artifact_np_save(file, arr, *args, **kwargs):
        resolved = _resolve_artifact_path(file)
        return np._artifact_original_save(resolved, arr, *args, **kwargs)
    np.save = _artifact_np_save

ORIGINAL_DATA_PICKLE = artifact_path("original_items.pkl")

print("‚úÖ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


‚úÖ All libraries imported successfully!
TensorFlow version: 2.20.0


In [2]:
# @title üìÅ **Step 1: Upload Your Data File or Connect to Database**

import os
import io
import json
from pathlib import Path
import pandas as pd

# Try to import required database libraries
try:
    import sqlite3
    from sqlalchemy import create_engine
    print("‚úÖ Database libraries imported successfully")
except ImportError as e:
    print(f"‚ö†Ô∏è Some database libraries not available: {e}")
    print("   CSV/Excel upload will still work")

# Configuration - Use local path instead of /content/
ORIGINAL_DATA_PICKLE = Path('./original_items.pkl')

def _coerce_json(value):
    """Convert values to JSON strings if needed"""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return ""
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    if isinstance(value, str):
        return value
    return json.dumps(value)

def _build_items_query():
    """Build SQL query for fetching items"""
    return """\
SELECT
    items.id AS `ID`,
    items.name AS `Name`,
    items.description AS `Description`,
    stores.name AS `Store`,
    items.price AS `Price`,
    categories.name AS `Category`,
    items.garment_type AS `Garment Type`,
    items.stock_quantity AS `Total Stock`,
    items.color_variants AS `Color Variants`,
    items.sizing_data AS `Sizing Data`,
    items.size_stock AS `Size Stock`,
    items.variants AS `Variants`,
    items.created_at AS `Created At`,
    items.updated_at AS `Updated At`
FROM items
LEFT JOIN categories ON items.category_id = categories.id
LEFT JOIN stores ON items.store_id = stores.id
ORDER BY items.id
"""

def _load_from_mysql():
    """Load data from MySQL database using your Laravel .env settings"""
    print("üîç Attempting to connect to MySQL database...")

    # USING YOUR EXACT .ENV VALUES
    db_host = "127.0.0.1"  # Your DB_HOST
    db_port = "3306"       # Your DB_PORT
    db_name = "fitfast"    # Your DB_DATABASE
    db_user = "root"       # Your DB_USERNAME
    db_pass = ""           # Your DB_PASSWORD (empty as shown)

    print(f"üìä Using database connection:")
    print(f"   Host: {db_host}:{db_port}")
    print(f"   Database: {db_name}")
    print(f"   Username: {db_user}")

    try:
        print(f"üóÑÔ∏è Connecting to MySQL...")

        # Create connection URL - using pymysql driver
        connection_url = f"mysql+pymysql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"

        # Create engine and connect with timeout
        engine = create_engine(
            connection_url,
            pool_pre_ping=True,
            connect_args={'connect_timeout': 10}
        )

        # Test connection first
        with engine.connect() as conn:
            print("‚úÖ Database connection successful!")

        # Execute query
        df_loaded = pd.read_sql_query(_build_items_query(), engine)

        print(f"‚úÖ Retrieved {len(df_loaded)} items from MySQL")

        # Show columns retrieved
        print(f"üìã Columns retrieved: {len(df_loaded.columns)}")
        for i, col in enumerate(df_loaded.columns, 1):
            print(f"   {i:2d}. {col}")

        return df_loaded

    except Exception as e:
        print(f"‚ùå MySQL connection failed: {e}")

        # Try alternative approach if pymysql fails
        try:
            print("üîÑ Trying alternative connection method...")
            import pymysql

            conn = pymysql.connect(
                host=db_host,
                port=int(db_port),
                user=db_user,
                password=db_pass,
                database=db_name,
                charset='utf8mb4'
            )

            df_loaded = pd.read_sql_query(_build_items_query(), conn)
            conn.close()

            print(f"‚úÖ Retrieved {len(df_loaded)} items from MySQL (alternative method)")
            return df_loaded

        except Exception as e2:
            print(f"‚ùå Alternative method also failed: {e2}")
            return None

# Main data loading logic
print("üîÑ Connecting to your MySQL database...")
print("="*60)

# Try MySQL (based on your .env)
df = _load_from_mysql()

# If MySQL fails, provide helpful error
if df is None or df.empty:
    print("\n" + "="*60)
    print("‚ùå DATABASE CONNECTION FAILED")
    print("="*60)
    raise ConnectionError("Failed to connect to MySQL database.")

# Process JSON columns if they exist
json_columns = ['Color Variants', 'Sizing Data', 'Size Stock', 'Variants']
for col in json_columns:
    if col in df.columns:
        df[col] = df[col].apply(_coerce_json)
        print(f"üìù Processed JSON column: {col}")

print(f"\n" + "="*60)
print(f"‚úÖ SUCCESS! Data loaded: {len(df)} rows, {len(df.columns)} columns")
print("="*60)

print("\nüìä Sample data (first 3 rows):")
print(df.head(3))

# Display column information
print(f"\nüìã Column Details ({len(df.columns)} total):")
for i, col in enumerate(df.columns, 1):
    non_null = df[col].notna().sum()
    null_count = df[col].isna().sum()
    unique_count = df[col].nunique()

    # Get sample value
    sample_val = "N/A"
    if non_null > 0:
        first_non_null = df[col].dropna().iloc[0]
        sample_val = str(first_non_null)[:50]
        if len(str(first_non_null)) > 50:
            sample_val += "..."

    print(f"{i:2d}. {col:20}")
    print(f"     Non-null: {non_null}/{len(df)} ({non_null/len(df)*100:.1f}%)")
    print(f"     Unique: {unique_count}")
    print(f"     Sample: {sample_val}")

# Save the original data - ensure directory exists
ORIGINAL_DATA_PICKLE.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(ORIGINAL_DATA_PICKLE)
print(f"\nüíæ Original data saved to '{ORIGINAL_DATA_PICKLE}' (absolute path: {ORIGINAL_DATA_PICKLE.absolute()})")

# Display summary statistics
print("\nüìà Data Summary:")
print(f"   - Total items: {len(df)}")
print(f"   - Unique Stores: {df['Store'].nunique() if 'Store' in df.columns else 'N/A'}")
print(f"   - Unique Categories: {df['Category'].nunique() if 'Category' in df.columns else 'N/A'}")
print(f"   - Unique Garment Types: {df['Garment Type'].nunique() if 'Garment Type' in df.columns else 'N/A'}")

if 'Price' in df.columns:
    print(f"   - Price Range: ${df['Price'].min():.2f} - ${df['Price'].max():.2f}")
    print(f"   - Average Price: ${df['Price'].mean():.2f}")

if 'Total Stock' in df.columns:
    print(f"   - Total Stock: {df['Total Stock'].sum():,}")
    print(f"   - Average Stock per Item: {df['Total Stock'].mean():.1f}")

print("\nüîç Data Quality Check:")
for col in ['Name', 'Price', 'Store', 'Category']:
    if col in df.columns:
        missing = df[col].isna().sum()
        total = len(df)
        pct = missing/total*100 if total > 0 else 0
        print(f"   - {col:15}: {missing:3d} missing ({pct:5.1f}%)")

print("\n" + "="*60)
print("‚úÖ Step 1 completed successfully! Ready for data analysis.")
print("="*60)

# Display additional insights about your data
print("\nüî¨ Additional Insights:")

# Store distribution
if 'Store' in df.columns:
    store_counts = df['Store'].value_counts()
    print(f"\nüè™ Store Distribution (Top 5):")
    for store, count in store_counts.head().items():
        pct = count/len(df)*100
        print(f"   {store}: {count} items ({pct:.1f}%)")

# Category distribution
if 'Category' in df.columns:
    category_counts = df['Category'].value_counts()
    print(f"\nüìÅ Category Distribution (Top 5):")
    for category, count in category_counts.head().items():
        pct = count/len(df)*100
        print(f"   {category}: {count} items ({pct:.1f}%)")

# Garment type distribution
if 'Garment Type' in df.columns:
    garment_counts = df['Garment Type'].value_counts()
    print(f"\nüëï Garment Type Distribution (Top 5):")
    for garment, count in garment_counts.head().items():
        pct = count/len(df)*100
        print(f"   {garment}: {count} items ({pct:.1f}%)")

# Stock analysis
if 'Total Stock' in df.columns:
    low_stock = df[df['Total Stock'] < 10].shape[0]
    out_of_stock = df[df['Total Stock'] == 0].shape[0]
    high_stock = df[df['Total Stock'] > 100].shape[0]

    print(f"\nüì¶ Stock Analysis:")
    print(f"   Items with low stock (<10): {low_stock} ({low_stock/len(df)*100:.1f}%)")
    print(f"   Items out of stock: {out_of_stock} ({out_of_stock/len(df)*100:.1f}%)")
    print(f"   Items with high stock (>100): {high_stock} ({high_stock/len(df)*100:.1f}%)")

# Price segments
if 'Price' in df.columns:
    price_bins = pd.cut(df['Price'], bins=5)
    price_dist = price_bins.value_counts().sort_index()
    print(f"\nüí∞ Price Distribution:")
    for price_range, count in price_dist.items():
        pct = count/len(df)*100
        print(f"   {price_range}: {count} items ({pct:.1f}%)")

‚úÖ Database libraries imported successfully
üîÑ Connecting to your MySQL database...
üîç Attempting to connect to MySQL database...
üìä Using database connection:
   Host: 127.0.0.1:3306
   Database: fitfast
   Username: root
üóÑÔ∏è Connecting to MySQL...
‚úÖ Database connection successful!
‚úÖ Retrieved 250 items from MySQL
üìã Columns retrieved: 14
    1. ID
    2. Name
    3. Description
    4. Store
    5. Price
    6. Category
    7. Garment Type
    8. Total Stock
    9. Color Variants
   10. Sizing Data
   11. Size Stock
   12. Variants
   13. Created At
   14. Updated At
üìù Processed JSON column: Color Variants
üìù Processed JSON column: Sizing Data
üìù Processed JSON column: Size Stock
üìù Processed JSON column: Variants

‚úÖ SUCCESS! Data loaded: 250 rows, 14 columns

üìä Sample data (first 3 rows):
   ID               Name                                        Description  \
0   1   Classic Crew Tee  Classic Crew Tee. Made from High-quality linen...   
1   2    

In [3]:
# @title üéØ **STEP 2: Feature Engineering with SIZE-BASED Measurements**
print("üéØ COMPLETE STEP 2: Feature Engineering with SIZE-BASED Measurements")
print("=" * 60)

import pandas as pd
import numpy as np
import json
import pickle
import os
import re
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from collections import Counter

# ========== CONFIGURATION ==========
# Define paths
ARTIFACTS_DIR = Path('./artifacts')
ARTIFACTS_DIR.mkdir(exist_ok=True)

# Define standard size order for consistent processing
STANDARD_SIZES = ['XXS', 'XS', 'S', 'M', 'L', 'XL', 'XXL', 'XXXL']

# Define garment type measurement mappings (from your PHP code)
GARMENT_TYPE_MEASUREMENTS = {
    't_shirt': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'fitted_shirt': ['chest_circumference', 'waist_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'dress_shirt': ['chest_circumference', 'waist_circumference', 'garment_length', 'sleeve_length', 'shoulder_width', 'collar_size'],
    'slim_pants': ['waist_circumference', 'hips_circumference', 'inseam_length', 'thigh_circumference', 'leg_opening'],
    'regular_pants': ['waist_circumference', 'hips_circumference', 'inseam_length', 'thigh_circumference', 'leg_opening'],
    'regular_jeans': ['waist_circumference', 'hips_circumference', 'inseam_length', 'thigh_circumference', 'leg_opening', 'rise'],
    'slim_jeans': ['waist_circumference', 'hips_circumference', 'inseam_length', 'thigh_circumference', 'leg_opening', 'rise'],
    'casual_shorts': ['waist_circumference', 'hips_circumference', 'short_length', 'thigh_circumference', 'leg_opening'],
    'a_line_dress': ['chest_circumference', 'waist_circumference', 'hips_circumference', 'dress_length', 'shoulder_to_hem'],
    'bodycon_dress': ['chest_circumference', 'waist_circumference', 'hips_circumference', 'dress_length'],
    'maxi_dress': ['chest_circumference', 'waist_circumference', 'hips_circumference', 'dress_length', 'shoulder_to_hem'],
    'sun_dress': ['chest_circumference', 'waist_circumference', 'hips_circumference', 'dress_length', 'shoulder_to_hem'],
    'pencil_skirt': ['waist_circumference', 'hips_circumference', 'skirt_length'],
    'a_line_skirt': ['waist_circumference', 'hips_circumference', 'skirt_length'],
    'bomber_jacket': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width', 'bicep_circumference'],
    'denim_jacket': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'trench_coat': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'wool_coat': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'crewneck_sweater': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'v_neck_sweater': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width'],
    'pullover_hoodie': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width', 'hood_height'],
    'zip_hoodie': ['chest_circumference', 'garment_length', 'sleeve_length', 'shoulder_width', 'hood_height'],
    'yoga_pants': ['waist_circumference', 'hips_circumference', 'inseam_length', 'thigh_circumference'],
    'training_shorts': ['waist_circumference', 'hips_circumference', 'short_length', 'thigh_circumference'],
    'bikini_top': ['chest_circumference', 'underbust_circumference', 'cup_size'],
    'swim_trunks': ['waist_circumference', 'hips_circumference', 'short_length', 'thigh_circumference'],
    'briefs': ['waist_circumference', 'hips_circumference'],
    'boxers': ['waist_circumference', 'hips_circumference', 'short_length'],
    'ankle_socks': ['foot_length'],
    'crew_socks': ['foot_length'],
    'sneakers': ['foot_length', 'foot_width'],
    'dress_shoes': ['foot_length', 'foot_width'],
    'backpack': [],
    'tote_bag': [],
    'necklace': ['chain_length'],
    'bracelet': ['bracelet_circumference'],
    'baseball_cap': ['head_circumference'],
    'beanie': ['head_circumference'],
}

# ========== 1. RELOAD ORIGINAL DATA ==========
print("\n1. üîÑ Loading original data...")

# Use the CORRECT path from Step 1
ORIGINAL_DATA_PATH = Path('./original_items.pkl')  # Local path from Step 1
print(f"   Looking for: {ORIGINAL_DATA_PATH.absolute()}")

if not ORIGINAL_DATA_PATH.exists():
    print(f"   ‚ùå File not found! Trying fallback paths...")
    # Try common locations
    fallback_paths = [
        Path('original_items.pkl'),
        Path('./original_items.pkl'),
        Path('c:/Users/pc/Desktop/FYP/FitFast-FYP/fitfast/frontend/src/ai/original_items.pkl')
    ]

    for path in fallback_paths:
        if path.exists():
            ORIGINAL_DATA_PATH = path
            print(f"   ‚úÖ Found at: {path}")
            break

    if not ORIGINAL_DATA_PATH.exists():
        raise FileNotFoundError(f"Could not find original_items.pkl. Check Step 1 saved it correctly.")

original_df = pd.read_pickle(ORIGINAL_DATA_PATH)
print(f"   ‚úÖ Loaded {len(original_df)} items")
print(f"   Columns: {list(original_df.columns)}")

# ========== 2. CORRECT PARSING OF ALL DATA ==========
print("\n2. üìã Correct parsing of all data...")

def parse_sizing_data_final(sizing_str):
    """Properly parse sizing data with nested structure"""
    if pd.isna(sizing_str) or not isinstance(sizing_str, str):
        return {
            'garment_type': 'unknown',
            'measurements_cm': {},
            'fit_characteristics': {},
            'size_system': 'US'
        }

    result = {
        'garment_type': 'unknown',
        'measurements_cm': {},
        'fit_characteristics': {},
        'size_system': 'US'
    }

    try:
        # Step 1 already processed JSON to strings
        if sizing_str.strip().startswith('{'):
            try:
                data = json.loads(sizing_str)
                result['garment_type'] = data.get('garment_type', 'unknown')
                result['measurements_cm'] = data.get('measurements_cm', {})
                result['fit_characteristics'] = data.get('fit_characteristics', {})
                result['size_system'] = data.get('size_system', 'US')
            except json.JSONDecodeError:
                # Try to extract garment_type from string
                if 'garment_type' in sizing_str:
                    match = re.search(r'"garment_type"\s*:\s*"([^"]+)"', sizing_str)
                    if match:
                        result['garment_type'] = match.group(1)
    except Exception as e:
        print(f"   Warning parsing sizing data: {e}")

    return result

def parse_colors_final(color_str):
    """Properly parse color variants"""
    colors = {}
    if pd.isna(color_str) or not isinstance(color_str, str):
        return colors

    try:
        # Step 1 color variants are JSON strings
        if color_str.strip().startswith('{'):
            try:
                color_dict = json.loads(color_str)
                for color_name, color_data in color_dict.items():
                    if isinstance(color_data, dict):
                        colors[color_name] = color_data.get('stock', 1)
                    else:
                        colors[color_name] = 1
            except json.JSONDecodeError:
                # Fallback: try to extract color names
                color_names = re.findall(r'"([A-Za-z\s]+)"\s*:', color_str)
                for name in color_names:
                    colors[name] = 1
    except Exception as e:
        print(f"   Warning parsing colors: {e}")

    return colors

def parse_size_stock(size_stock_str):
    """Parse size stock information"""
    size_stock = {}
    if pd.isna(size_stock_str) or not isinstance(size_stock_str, str):
        return size_stock

    try:
        if size_stock_str.strip().startswith('{'):
            size_stock = json.loads(size_stock_str)
    except:
        pass

    return size_stock

# Parse all data with proper measurement extraction
print("   Parsing sizing data with nested measurements...")
all_items = []
for idx, row in original_df.iterrows():
    item = {
        'item_id': row.get('ID', idx + 1),
        'name': row.get('Name', f'Item {idx+1}'),
        'description': row.get('Description', ''),
        'price': float(row.get('Price', 0)),
        'category': row.get('Category', 'unknown'),
        'store': row.get('Store', 'unknown'),
        'total_stock': int(row.get('Total Stock', 0)) if pd.notna(row.get('Total Stock')) else 0,
        'garment_type_db': row.get('Garment Type', 'unknown')
    }

    # Parse colors
    colors = parse_colors_final(row.get('Color Variants', ''))
    item['colors'] = list(colors.keys())
    item['color_stocks'] = colors

    # Parse sizing data - CORRECTLY for nested structure
    sizing = parse_sizing_data_final(row.get('Sizing Data', ''))

    # Prioritize: 1. parsed garment_type, 2. database garment_type, 3. unknown
    parsed_garment_type = sizing.get('garment_type', 'unknown')
    if parsed_garment_type != 'unknown' and parsed_garment_type != '':
        item['garment_type'] = parsed_garment_type
    else:
        item['garment_type'] = item['garment_type_db']

    # Store all measurement data (nested by size)
    item['measurements_by_size'] = sizing.get('measurements_cm', {})

    # Store fit characteristics
    fit_chars = sizing.get('fit_characteristics', {})
    item['fit_type'] = fit_chars.get('fit_type', 'regular')
    item['ease'] = fit_chars.get('ease', 'standard')
    item['stretch'] = fit_chars.get('stretch', 'medium')
    item['size_system'] = sizing.get('size_system', 'US')

    # Parse size stock
    size_stock = parse_size_stock(row.get('Size Stock', ''))
    item['size_stock'] = size_stock

    all_items.append(item)

features_df = pd.DataFrame(all_items)
print(f"   ‚úÖ Parsed {len(features_df)} items")
print(f"   ‚úÖ Measurements by size extracted for all items")

# ========== 3. EXTRACT MEASUREMENT FEATURES ==========
print("\n3. üìè Extracting measurement features for size-based recommendations...")

def extract_measurement_stats(measurements_by_size, garment_type):
    """Extract statistical features from measurements across all sizes"""
    stats = {}

    # Initialize all possible measurement fields
    all_measurements = set()
    for size_data in measurements_by_size.values():
        if isinstance(size_data, dict):
            all_measurements.update(size_data.keys())

    # Convert to list for consistent ordering
    all_measurements = list(all_measurements)

    # For each measurement type, calculate stats across sizes
    for measurement in all_measurements:
        values = []
        for size, size_data in measurements_by_size.items():
            if isinstance(size_data, dict) and measurement in size_data:
                try:
                    value = float(size_data[measurement])
                    values.append(value)
                except (ValueError, TypeError):
                    continue

        if values:  # Only add if we have values
            stats[f'{measurement}_min'] = min(values)
            stats[f'{measurement}_max'] = max(values)
            stats[f'{measurement}_mean'] = np.mean(values)
            stats[f'{measurement}_std'] = np.std(values) if len(values) > 1 else 0
            stats[f'{measurement}_range'] = max(values) - min(values)

            # Add size progression (difference between consecutive sizes)
            if len(values) >= 2:
                sorted_values = sorted(values)
                diffs = [sorted_values[i+1] - sorted_values[i] for i in range(len(sorted_values)-1)]
                stats[f'{measurement}_avg_step'] = np.mean(diffs)
            else:
                stats[f'{measurement}_avg_step'] = 0

    # Add summary stats
    stats['num_sizes'] = len(measurements_by_size)
    stats['has_measurements'] = 1 if measurements_by_size else 0

    # Get garment-specific measurements
    if garment_type in GARMENT_TYPE_MEASUREMENTS:
        expected_measurements = GARMENT_TYPE_MEASUREMENTS[garment_type]
        for measurement in expected_measurements:
            stats[f'has_{measurement}'] = 1 if measurement in all_measurements else 0

    return stats

print("   Extracting measurement statistics...")
measurement_stats_list = []
for idx, row in features_df.iterrows():
    stats = extract_measurement_stats(row['measurements_by_size'], row['garment_type'])
    measurement_stats_list.append(stats)

# Convert to DataFrame
measurement_stats_df = pd.DataFrame(measurement_stats_list)
print(f"   ‚úÖ Extracted {len(measurement_stats_df.columns)} measurement features")

# Merge with main features DataFrame
features_df = pd.concat([features_df, measurement_stats_df], axis=1)

# ========== 4. CREATE SIZE-RELATED FEATURES ==========
print("\n4. üìê Creating size-related features...")

def create_size_features(size_stock_dict, measurements_by_size):
    """Create features related to size availability and distribution"""
    features = {
        'size_variety': 0,
        'avg_stock_per_size': 0,
        'max_stock_size': '',
        'min_stock_size': '',
        'total_available_sizes': 0
    }

    # Size availability from stock
    if size_stock_dict:
        features['total_available_sizes'] = len(size_stock_dict)
        stock_values = list(size_stock_dict.values())
        features['size_variety'] = len(stock_values)
        features['avg_stock_per_size'] = np.mean(stock_values) if stock_values else 0

        if stock_values:
            max_stock = max(stock_values)
            min_stock = min(stock_values)
            for size, stock in size_stock_dict.items():
                if stock == max_stock:
                    features['max_stock_size'] = size
                if stock == min_stock:
                    features['min_stock_size'] = size

    # Size range from measurements
    if measurements_by_size:
        # Convert size labels to numeric values for analysis
        size_order = {size: i for i, size in enumerate(STANDARD_SIZES)}
        available_sizes = []
        for size in measurements_by_size.keys():
            if size in size_order:
                available_sizes.append(size_order[size])
            elif size.upper() in size_order:
                available_sizes.append(size_order[size.upper()])

        if available_sizes:
            features['size_range_numeric'] = max(available_sizes) - min(available_sizes)
            features['min_size_numeric'] = min(available_sizes)
            features['max_size_numeric'] = max(available_sizes)

    return features

print("   Creating size distribution features...")
size_features_list = []
for idx, row in features_df.iterrows():
    features = create_size_features(row.get('size_stock', {}), row.get('measurements_by_size', {}))
    size_features_list.append(features)

size_features_df = pd.DataFrame(size_features_list)
features_df = pd.concat([features_df, size_features_df], axis=1)
print(f"   ‚úÖ Created {len(size_features_df.columns)} size-related features")

# ========== 5. CORRECT CATEGORIZATION WITH PRECISE RULES ==========
print("\n5. üè∑Ô∏è Correct categorization with precise rules...")

# Define precise categorization rules
garment_type_to_category = {
    # Tops
    't_shirt': ('top', 'casual'),
    'v_neck_tee': ('top', 'casual'),
    'fitted_shirt': ('top', 'business_casual'),
    'dress_shirt': ('top', 'formal'),
    'polo_shirt': ('top', 'business_casual'),
    'henley_shirt': ('top', 'casual'),

    # Sweaters & Hoodies
    'crewneck_sweater': ('top', 'casual'),
    'cardigan': ('top', 'casual'),
    'turtleneck': ('top', 'casual'),
    'pullover_hoodie': ('top', 'casual'),
    'zip_hoodie': ('top', 'casual'),

    # Bottoms
    'slim_pants': ('bottom', 'business_casual'),
    'regular_pants': ('bottom', 'business_casual'),
    'cargo_pants': ('bottom', 'casual'),
    'regular_jeans': ('bottom', 'casual'),
    'slim_jeans': ('bottom', 'casual'),
    'casual_shorts': ('bottom', 'casual'),
    'cargo_shorts': ('bottom', 'casual'),

    # Athletic
    'training_shorts': ('bottom', 'athletic'),
    'yoga_pants': ('bottom', 'athletic'),
    'leggings': ('bottom', 'athletic'),

    # Dresses
    'a_line_dress': ('dress', 'business_casual'),
    'bodycon_dress': ('dress', 'business_casual'),
    'maxi_dress': ('dress', 'casual'),
    'midi_dress': ('dress', 'business_casual'),
    'wrap_dress': ('dress', 'business_casual'),

    # Skirts
    'a_line_skirt': ('bottom', 'business_casual'),
    'pencil_skirt': ('bottom', 'business_casual'),
    'tennis_skirt': ('bottom', 'athletic'),

    # Outerwear
    'bomber_jacket': ('outerwear', 'casual'),
    'denim_jacket': ('outerwear', 'casual'),
    'windbreaker': ('outerwear', 'casual'),
    'puffer_jacket': ('outerwear', 'casual'),
    'trench_coat': ('outerwear', 'formal'),

    # Swimwear
    'bikini_top': ('swimwear', 'athletic'),
    'swim_trunks': ('swimwear', 'athletic'),
    'board_shorts': ('swimwear', 'athletic'),
    'one_piece_swimsuit': ('swimwear', 'athletic'),
    'rash_guard': ('swimwear', 'athletic'),

    # Footwear
    'sneakers': ('footwear', 'casual'),
    'dress_shoes': ('footwear', 'formal'),

    # Underwear
    'briefs': ('underwear', 'casual'),
    'boxer_briefs': ('underwear', 'casual'),

    # Socks
    'crew_socks': ('socks', 'casual'),
    'ankle_socks': ('socks', 'casual'),
}

# Apply categorization
features_df['garment_category'] = 'other'
features_df['garment_formality'] = 'casual'

for idx, row in features_df.iterrows():
    garment_type = row['garment_type']
    if garment_type in garment_type_to_category:
        category, formality = garment_type_to_category[garment_type]
        features_df.at[idx, 'garment_category'] = category
        features_df.at[idx, 'garment_formality'] = formality
    else:
        # Fallback based on name
        name_lower = str(row['name']).lower()
        if any(word in name_lower for word in ['dress', 'gown']):
            features_df.at[idx, 'garment_category'] = 'dress'
            features_df.at[idx, 'garment_formality'] = 'business_casual'
        elif any(word in name_lower for word in ['shirt', 'blouse', 'top', 'tee']):
            features_df.at[idx, 'garment_category'] = 'top'
            features_df.at[idx, 'garment_formality'] = 'business_casual' if 'shirt' in name_lower else 'casual'
        elif any(word in name_lower for word in ['pants', 'jeans', 'shorts', 'skirt']):
            features_df.at[idx, 'garment_category'] = 'bottom'
            features_df.at[idx, 'garment_formality'] = 'business_casual' if 'pants' in name_lower and 'dress' in name_lower else 'casual'
        elif any(word in name_lower for word in ['jacket', 'coat', 'blazer']):
            features_df.at[idx, 'garment_category'] = 'outerwear'
            features_df.at[idx, 'garment_formality'] = 'formal' if 'coat' in name_lower else 'casual'
        elif any(word in name_lower for word in ['shoes', 'sneakers', 'boots']):
            features_df.at[idx, 'garment_category'] = 'footwear'
            features_df.at[idx, 'garment_formality'] = 'formal' if 'dress' in name_lower else 'casual'

# ========== 6. SPECIAL FIXES FOR SPECIFIC ITEMS ==========
print("\n6. üîß Applying special fixes for specific items...")

# Fix 1: Performance Training items should be athletic
print("   Fixing 'Performance Training' items to 'athletic'...")
mask = features_df['name'].str.contains('Performance Training', case=False, na=False)
features_df.loc[mask, 'garment_formality'] = 'athletic'
print(f"   ‚úÖ Fixed {mask.sum()} 'Performance Training' items")

# Fix 2: Training items should be athletic
print("   Fixing 'Training' items to 'athletic'...")
mask = features_df['name'].str.contains('Training', case=False, na=False) & \
       ~features_df['name'].str.contains('Performance Training', case=False, na=False)
features_df.loc[mask, 'garment_formality'] = 'athletic'
print(f"   ‚úÖ Fixed {mask.sum()} 'Training' items")

# Fix 3: Athletic items should be athletic
print("   Fixing 'Athletic' items to 'athletic'...")
mask = features_df['name'].str.contains('Athletic', case=False, na=False)
features_df.loc[mask, 'garment_formality'] = 'athletic'
print(f"   ‚úÖ Fixed {mask.sum()} 'Athletic' items")

print(f"   ‚úÖ Categorized all items with special fixes")

# ========== 7. CREATE COLOR FEATURES ==========
print("\n7. üé® Creating color features...")

# Create color features
all_colors = []
for colors in features_df['colors']:
    all_colors.extend(colors)

top_colors = [color for color, count in Counter(all_colors).most_common(10)]

color_themes = {
    'dark_colors': ['Black', 'Navy', 'Charcoal', 'Dark', 'Brown', 'Dark Blue', 'Dark Gray'],
    'light_colors': ['White', 'Beige', 'Ivory', 'Cream', 'Light', 'Light Gray'],
    'bold_colors': ['Red', 'Blue', 'Green', 'Yellow', 'Pink', 'Orange', 'Purple', 'Royal Blue', 'Burgundy'],
    'neutral_colors': ['Gray', 'Beige', 'White', 'Black', 'Navy', 'Brown', 'Charcoal', 'Dark Gray']
}

for theme_name, colors in color_themes.items():
    features_df[f'has_{theme_name}'] = features_df['colors'].apply(
        lambda x: 1 if any(color in str(color_item) for color in colors for color_item in x) else 0
    )

print(f"   ‚úÖ Created {len(color_themes)} color theme features")

# ========== 8. ENCODE CATEGORICAL FEATURES ==========
print("\n8. üî† Encoding categorical features...")

categorical_columns = ['category', 'store', 'garment_type', 'garment_category', 'garment_formality', 'fit_type', 'ease', 'stretch']
encoders = {}

for col in categorical_columns:
    if col in features_df.columns:
        encoders[col] = LabelEncoder()
        # Fill NaN with 'unknown' before encoding
        features_df[col] = features_df[col].fillna('unknown')
        features_df[f'{col}_encoded'] = encoders[col].fit_transform(features_df[col].astype(str))

print(f"   ‚úÖ Encoded {len(encoders)} categorical features")

# ========== 9. SCALE NUMERICAL FEATURES ==========
print("\n9. üìè Scaling numerical features...")

# Identify numerical columns (excluding encoded ones)
numerical_columns = []
for col in features_df.columns:
    if (col not in ['item_id', 'name', 'description', 'colors', 'color_stocks',
                   'measurements_by_size', 'size_stock', 'garment_type_db',
                   'max_stock_size', 'min_stock_size'] and
        not col.endswith('_encoded') and
        not col.startswith('has_') and
        features_df[col].dtype in [np.float64, np.int64, np.float32, np.int32]):
        numerical_columns.append(col)

# Filter to most important numerical features
important_numerical = ['price', 'total_stock', 'num_sizes', 'size_variety',
                      'avg_stock_per_size', 'total_available_sizes',
                      'size_range_numeric', 'min_size_numeric', 'max_size_numeric']

# Add measurement stats that exist
for col in important_numerical:
    if col in features_df.columns:
        numerical_columns.append(col)

# Also add key measurement stats (min values for common measurements)
common_measurements = ['chest_circumference', 'waist_circumference', 'hips_circumference',
                      'garment_length', 'sleeve_length', 'inseam_length']
for measurement in common_measurements:
    min_col = f'{measurement}_min'
    if min_col in features_df.columns:
        numerical_columns.append(min_col)

# Remove duplicates
numerical_columns = list(set(numerical_columns))

# Fill NaN with 0 for scaling
for col in numerical_columns:
    if col in features_df.columns:
        features_df[col] = features_df[col].fillna(0)

if numerical_columns:
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(features_df[numerical_columns])
    scaled_df = pd.DataFrame(scaled_values, columns=[f'scaled_{col}' for col in numerical_columns])
    features_df = pd.concat([features_df.reset_index(drop=True), scaled_df.reset_index(drop=True)], axis=1)
    print(f"   ‚úÖ Scaled {len(numerical_columns)} numerical features")
    print(f"   Scaled columns: {numerical_columns[:10]}...")  # Show first 10
else:
    scaler = None
    print("   ‚ö†Ô∏è No numerical columns to scale")

# ========== 10. CREATE FINAL FEATURE MATRIX ==========
print("\n10. üéØ Creating final feature matrix for size-based recommendations...")

# Collect all feature types
encoded_cols = [col for col in features_df.columns if col.endswith('_encoded')]
scaled_cols = [col for col in features_df.columns if col.startswith('scaled_')]
binary_cols = [col for col in features_df.columns if col.startswith('has_')]

# Add key measurement features (min values for fitting)
measurement_min_cols = [col for col in features_df.columns if col.endswith('_min')]
measurement_range_cols = [col for col in features_df.columns if col.endswith('_range')]

# Combine all feature columns
all_feature_cols = encoded_cols + scaled_cols + binary_cols + measurement_min_cols[:10] + measurement_range_cols[:10]

# Remove duplicates
all_feature_cols = list(set(all_feature_cols))

# Filter to columns that actually exist
all_feature_cols = [col for col in all_feature_cols if col in features_df.columns]

# Create final feature matrix
feature_matrix = features_df[all_feature_cols].copy()

# Fill any remaining NaN with 0
feature_matrix = feature_matrix.fillna(0)

print(f"   ‚úÖ Feature matrix shape: {feature_matrix.shape}")
print(f"   ‚úÖ Total features: {len(all_feature_cols)}")
print(f"   Feature breakdown:")
print(f"     - Encoded: {len(encoded_cols)}")
print(f"     - Scaled: {len(scaled_cols)}")
print(f"     - Binary: {len(binary_cols)}")
print(f"     - Measurements: {len(measurement_min_cols) + len(measurement_range_cols)}")

# ========== 11. SAVE PROCESSED DATA ==========
print("\n11. üíæ Saving processed data for size-based recommendations...")

# Save raw measurements for later use in size matching
raw_measurements_data = {
    'measurements_by_size': features_df['measurements_by_size'].tolist(),
    'item_ids': features_df['item_id'].tolist(),
    'garment_types': features_df['garment_type'].tolist()
}

with open(ARTIFACTS_DIR / 'raw_measurements.pkl', 'wb') as f:
    pickle.dump(raw_measurements_data, f)

# Save artifacts
with open(ARTIFACTS_DIR / 'feature_encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

if scaler:
    with open(ARTIFACTS_DIR / 'scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

# Save features DataFrame with all information
features_df.to_pickle(ARTIFACTS_DIR / 'features_df.pkl')

# Save feature matrix for ML models
feature_matrix.to_pickle(ARTIFACTS_DIR / 'feature_matrix.pkl')

# Save measurement mapping for size-based recommendations
measurement_mapping = {
    'garment_type_measurements': GARMENT_TYPE_MEASUREMENTS,
    'standard_sizes': STANDARD_SIZES,
    'feature_columns': all_feature_cols
}

with open(ARTIFACTS_DIR / 'measurement_mapping.pkl', 'wb') as f:
    pickle.dump(measurement_mapping, f)

# Save metadata
metadata = {
    'total_items': len(features_df),
    'total_features': len(all_feature_cols),
    'feature_types': {
        'encoded': len(encoded_cols),
        'scaled': len(scaled_cols),
        'binary': len(binary_cols),
        'measurement_min': len([c for c in all_feature_cols if c.endswith('_min')]),
        'measurement_range': len([c for c in all_feature_cols if c.endswith('_range')])
    },
    'garment_categories': features_df['garment_category'].nunique(),
    'formality_levels': features_df['garment_formality'].nunique(),
    'garment_types': features_df['garment_type'].nunique(),
    'has_size_measurements': features_df['has_measurements'].sum() if 'has_measurements' in features_df.columns else 0,
    'size_based_features': True
}

with open(ARTIFACTS_DIR / 'feature_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("   ‚úÖ All data saved for size-based recommendation system")

# ========== 12. DISPLAY SIZE-BASED SUMMARY ==========
print("\n" + "=" * 60)
print("üìä SIZE-BASED FEATURE ENGINEERING SUMMARY")
print("=" * 60)

print(f"\nüìà Data Overview:")
print(f"   Total items: {len(features_df)}")
print(f"   Total features: {len(all_feature_cols)}")
print(f"   Feature matrix shape: {feature_matrix.shape}")

print(f"\nüìè Measurement Coverage:")
if 'has_measurements' in features_df.columns:
    measurement_count = features_df['has_measurements'].sum()
    print(f"   Items with measurements: {measurement_count} ({measurement_count/len(features_df):.1%})")

if 'num_sizes' in features_df.columns:
    avg_sizes = features_df['num_sizes'].mean()
    print(f"   Average sizes per item: {avg_sizes:.1f}")

print(f"\nüè∑Ô∏è Garment Categories:")
category_dist = features_df['garment_category'].value_counts()
for cat, count in category_dist.items():
    print(f"   {cat}: {count} items ({count/len(features_df):.1%})")

print(f"\nüìê Key Measurement Statistics:")
# Show most common measurements
measurement_cols = [col for col in features_df.columns if col.endswith('_min')]
if measurement_cols:
    print(f"   Found {len(measurement_cols)} different measurement types")
    # Show top 5 most common measurements
    for col in measurement_cols[:5]:
        non_zero = (features_df[col] > 0).sum()
        if non_zero > 0:
            measurement_name = col.replace('_min', '')
            print(f"   {measurement_name}: {non_zero} items ({non_zero/len(features_df):.1%})")

print(f"\nüé≠ Formality Levels:")
formality_dist = features_df['garment_formality'].value_counts()
for level, count in formality_dist.items():
    print(f"   {level}: {count} items ({count/len(features_df):.1%})")

# Show sample of items with measurements
print(f"\nüìã SAMPLE ITEMS WITH MEASUREMENTS:")
print("-" * 80)

sample_with_measurements = features_df[features_df['num_sizes'] > 0].head(5) if 'num_sizes' in features_df.columns else features_df.head(5)
for idx, row in sample_with_measurements.iterrows():
    name = row['name'][:35] + "..." if len(row['name']) > 35 else row['name']
    sizes = len(row['measurements_by_size']) if isinstance(row['measurements_by_size'], dict) else 0
    garment_type = row['garment_type']

    # Get some measurement stats
    if 'chest_circumference_min' in features_df.columns:
        chest_min = row.get('chest_circumference_min', 'N/A')
        chest_max = row.get('chest_circumference_max', 'N/A')
        print(f"{name:<40} | {garment_type:<15} | {sizes:>2} sizes | Chest: {chest_min}-{chest_max}cm")
    else:
        print(f"{name:<40} | {garment_type:<15} | {sizes:>2} sizes")

print(f"\n‚ú® Special Features for Size-Based Recommendations:")
print(f"   - Raw measurements preserved for exact size matching")
print(f"   - Statistical features (min, max, mean, range) for each measurement")
print(f"   - Size availability features (variety, stock distribution)")
print(f"   - Garment-type specific measurement validation")
print(f"   - Ready for user measurement comparison")

print("\n" + "=" * 60)
print("üéâ STEP 2 COMPLETED SUCCESSFULLY!")
print("‚úÖ Size-based measurement extraction implemented")
print("‚úÖ Statistical features created for all measurements")
print("‚úÖ Raw measurements preserved for exact matching")
print("‚úÖ All data saved for recommendation system")
print("‚úÖ Ready for Step 3: Create embeddings with size features")
print("=" * 60)

üéØ COMPLETE STEP 2: Feature Engineering with SIZE-BASED Measurements

1. üîÑ Loading original data...
   Looking for: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai\original_items.pkl
   ‚úÖ Loaded 250 items
   Columns: ['ID', 'Name', 'Description', 'Store', 'Price', 'Category', 'Garment Type', 'Total Stock', 'Color Variants', 'Sizing Data', 'Size Stock', 'Variants', 'Created At', 'Updated At']

2. üìã Correct parsing of all data...
   Parsing sizing data with nested measurements...
   ‚úÖ Parsed 250 items
   ‚úÖ Measurements by size extracted for all items

3. üìè Extracting measurement features for size-based recommendations...
   Extracting measurement statistics...
   ‚úÖ Extracted 146 measurement features

4. üìê Creating size-related features...
   Creating size distribution features...
   ‚úÖ Created 8 size-related features

5. üè∑Ô∏è Correct categorization with precise rules...

6. üîß Applying special fixes for specific items...
   Fixing 'Performan

In [4]:
# @title üéØ **STEP 3: Create Item Embeddings (CORRECTED - LOCAL VERSION)**
print("üéØ STEP 3: Create Item Embeddings (CORRECTED - LOCAL VERSION)")
print("=" * 60)

import pandas as pd
import numpy as np
import pickle
import json
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported")

# ========== CONFIGURATION ==========
# Use same directory structure as Steps 1 and 2
ARTIFACTS_DIR = Path('./artifacts')
ORIGINAL_DATA_DIR = Path('./')

print(f"üìÅ Working directory: {Path.cwd()}")
print(f"üìÅ Artifacts directory: {ARTIFACTS_DIR.absolute()}")

# ========== 1. LOAD & PREPARE DATA ==========
print("\n1. üîÑ Loading data from Step 2...")

# Check what files exist from Step 2
print("   Looking for Step 2 artifacts...")
available_files = list(ARTIFACTS_DIR.glob('*')) + list(Path('.').glob('*.pkl'))
print("   Available files:")
for file in available_files:
    print(f"   - {file.name}")

# Load the processed data from Step 2 (should be in artifacts folder)
try:
    # Load features DataFrame from Step 2
    features_df_path = ARTIFACTS_DIR / 'features_df.pkl'
    if features_df_path.exists():
        features_df = pd.read_pickle(features_df_path)
        print(f"   ‚úÖ Loaded features_df: {features_df.shape}")
    else:
        print(f"   ‚ùå features_df.pkl not found in {ARTIFACTS_DIR}")
        raise FileNotFoundError(f"features_df.pkl not found")

    # Load feature matrix from Step 2
    feature_matrix_path = ARTIFACTS_DIR / 'feature_matrix.pkl'
    if feature_matrix_path.exists():
        feature_matrix = pd.read_pickle(feature_matrix_path)
        print(f"   ‚úÖ Loaded feature_matrix: {feature_matrix.shape}")
    else:
        print(f"   ‚ö†Ô∏è feature_matrix.pkl not found, using features_df")
        feature_matrix = None

except Exception as e:
    print(f"   ‚ùå Error loading Step 2 data: {e}")
    raise

print(f"   Columns in features_df: {len(features_df.columns)}")
print(f"   Sample columns: {list(features_df.columns)[:10]}")

# ========== 2. VERIFY DATA FROM STEP 2 ==========
print("\n2. üîç Verifying data from Step 2...")

# Check critical columns exist
required_columns = ['item_id', 'name', 'garment_category', 'garment_formality', 'price']
missing_columns = [col for col in required_columns if col not in features_df.columns]
if missing_columns:
    print(f"   ‚ö†Ô∏è Missing columns: {missing_columns}")
    print("   Checking for alternative column names...")

    # Try to find alternative column names
    for col in missing_columns:
        possible_matches = [c for c in features_df.columns if col in str(c).lower()]
        if possible_matches:
            print(f"   Found possible match for '{col}': {possible_matches[0]}")
else:
    print("   ‚úÖ All required columns found")

# Check garment categories
if 'garment_category' in features_df.columns:
    categories = features_df['garment_category'].unique()
    print(f"   Garment categories found: {len(categories)}")
    print(f"   Categories: {list(categories)}")

    # Show distribution
    category_dist = features_df['garment_category'].value_counts()
    print(f"   Category distribution:")
    for cat, count in category_dist.items():
        print(f"     {cat}: {count} items")

# Check if we have encoded columns from Step 2
encoded_cols = [col for col in features_df.columns if col.endswith('_encoded')]
print(f"   Found {len(encoded_cols)} encoded columns from Step 2")

# Check if we have scaled columns from Step 2
scaled_cols = [col for col in features_df.columns if col.startswith('scaled_')]
print(f"   Found {len(scaled_cols)} scaled columns from Step 2")

# Check measurement features
measurement_cols = [col for col in features_df.columns if any(m in col for m in ['min', 'max', 'mean', 'range'])]
print(f"   Found {len(measurement_cols)} measurement columns")

# ========== 3. PREPARE FOR EMBEDDING CREATION ==========
print("\n3. üõ†Ô∏è Preparing for embedding creation...")

# Start with robust features
robust_features = features_df.copy()

# Ensure critical categorical columns are string type
categorical_cols = ['garment_category', 'garment_formality']
for col in categorical_cols:
    if col in robust_features.columns:
        robust_features[col] = robust_features[col].astype(str)
        print(f"   Converted {col} to string")

# === Create category-aware features ===
print("   Creating category-aware features...")

# Create category strength mapping (same as Step 2 categories)
category_strength_map = {
    'top': 1.0, 'bottom': 2.0, 'dress': 3.0,
    'outerwear': 4.0, 'swimwear': 5.0,
    'footwear': 6.0, 'socks': 7.0, 'underwear': 8.0, 'accessory': 9.0
}

# Add default for 'other' category
robust_features['category_strength'] = robust_features['garment_category'].apply(
    lambda x: category_strength_map.get(str(x), 0.0)
)

# Create clothing vs non-clothing feature
print("   Creating clothing vs non-clothing feature...")
robust_features['is_clothing'] = robust_features['garment_category'].apply(
    lambda x: 0.0 if str(x) in ['accessory', 'footwear', 'socks', 'underwear'] else 1.0
)

# Create one-hot category features
print("   Creating one-hot category features...")
for category in robust_features['garment_category'].unique():
    col_name = f'cat_{category}'
    robust_features[col_name] = (robust_features['garment_category'] == category).astype(float) * 5.0

print(f"   Added {len(robust_features['garment_category'].unique())} category features")

# === Create formality features ===
print("   Creating formality features...")
formality_strength_map = {
    'athletic': 1.0, 'casual': 2.0, 'business_casual': 3.0, 'formal': 4.0
}

robust_features['formality_strength'] = robust_features['garment_formality'].apply(
    lambda x: formality_strength_map.get(str(x), 2.0)
)

# Create one-hot formality features
for formality in robust_features['garment_formality'].unique():
    col_name = f'form_{formality}'
    robust_features[col_name] = (robust_features['garment_formality'] == formality).astype(float) * 3.0

print(f"   Added {len(robust_features['garment_formality'].unique())} formality features")

# === Create interaction features ===
print("   Creating interaction features...")
robust_features['clothing_formality'] = robust_features['is_clothing'] * robust_features['formality_strength']
robust_features['category_formality'] = robust_features['category_strength'] * robust_features['formality_strength']

# === Ensure we have price feature ===
if 'price' not in robust_features.columns and 'Price' in robust_features.columns:
    robust_features['price'] = robust_features['Price']

if 'price' in robust_features.columns:
    # Scale price
    price_scaler = MinMaxScaler(feature_range=(0, 1))
    robust_features['price_scaled'] = price_scaler.fit_transform(robust_features[['price']].fillna(0))
    print("   ‚úÖ Scaled price feature")

# === Ensure we have total_stock feature ===
if 'total_stock' not in robust_features.columns:
    if 'Total Stock' in robust_features.columns:
        robust_features['total_stock'] = robust_features['Total Stock']
    elif 'total_stock_scaled' in scaled_cols:
        print("   Using existing total_stock_scaled")

# ========== 4. SELECT FEATURES FOR EMBEDDINGS ==========
print("\n4. üéØ Selecting features for embeddings...")

# Define feature groups
feature_groups = {
    # High importance (5x weight)
    'category_features': [col for col in robust_features.columns if col.startswith('cat_')],
    'category_strength': ['category_strength', 'is_clothing'],

    # Medium importance (3x weight)
    'formality_features': [col for col in robust_features.columns if col.startswith('form_')],
    'formality_strength': ['formality_strength'],
    'interaction_features': ['clothing_formality', 'category_formality'],

    # Use encoded features from Step 2 (2x weight)
    'encoded_features': [col for col in encoded_cols if col in robust_features.columns],

    # Use scaled features from Step 2 (1x weight)
    'scaled_features': [col for col in scaled_cols if col in robust_features.columns],

    # Color features from Step 2
    'color_features': [col for col in robust_features.columns if col.startswith('has_') and 'color' in col],

    # Price feature
    'price_feature': ['price_scaled'] if 'price_scaled' in robust_features.columns else [],
}

# Collect all features with weights
all_features = []
weights = []

for group_name, feature_list in feature_groups.items():
    available_features = [f for f in feature_list if f in robust_features.columns]
    if available_features:
        # Assign weight based on group
        if 'category' in group_name or 'strength' in group_name:
            weight = 5.0
        elif 'formality' in group_name or 'interaction' in group_name:
            weight = 3.0
        elif 'encoded' in group_name:
            weight = 2.0
        else:
            weight = 1.0

        all_features.extend(available_features)
        weights.extend([weight] * len(available_features))

        print(f"   ‚úÖ {group_name}: {len(available_features)} features (weight: {weight}x)")

print(f"\n   Total selected features: {len(all_features)}")

# Create weighted feature matrix
print("\n   Creating weighted feature matrix...")
X_weighted = np.zeros((len(robust_features), len(all_features)))

for i, (feature, weight) in enumerate(zip(all_features, weights)):
    X_weighted[:, i] = robust_features[feature].fillna(0).values * weight

print(f"   ‚úÖ Weighted feature matrix: {X_weighted.shape}")

# ========== 5. CREATE EMBEDDINGS WITH PCA ==========
print("\n5. ü§ñ Creating embeddings with PCA...")

# Scale the weighted features
scaler_pca = StandardScaler()
X_weighted_scaled = scaler_pca.fit_transform(X_weighted)

# Determine optimal number of components
n_samples = X_weighted_scaled.shape[0]
n_components = min(32, n_samples - 1)  # Don't exceed n_samples - 1
print(f"   Creating {n_components} PCA components")

pca = PCA(n_components=n_components, random_state=42)
embeddings = pca.fit_transform(X_weighted_scaled)

print(f"   ‚úÖ PCA embeddings created: {embeddings.shape}")
print(f"   Explained variance ratio: {pca.explained_variance_ratio_.sum():.2%}")

# Show top components
print(f"\n   Top 5 components explain: {pca.explained_variance_ratio_[:5].sum():.1%} of variance")

# ========== 6. CLUSTER VALIDATION ==========
print("\n6. üè∑Ô∏è Validating embeddings with clustering...")

# Create clusters based on garment categories
n_clusters = min(robust_features['garment_category'].nunique(), 10)
print(f"   Creating {n_clusters} clusters")

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(embeddings)

robust_features['embedding_cluster'] = cluster_labels

# Calculate cluster purity
print("\n   Cluster-Category Alignment:")
print("-" * 50)

purity_scores = []
for cluster_id in range(n_clusters):
    cluster_items = robust_features[robust_features['embedding_cluster'] == cluster_id]
    if len(cluster_items) > 0:
        dominant_category = cluster_items['garment_category'].mode()[0]
        purity = (cluster_items['garment_category'] == dominant_category).mean()
        purity_scores.append(purity)

        print(f"   Cluster {cluster_id}: {dominant_category:<15} purity = {purity:.1%}")

avg_purity = np.mean(purity_scores) if purity_scores else 0
print(f"\n   Average cluster purity: {avg_purity:.1%}")

# ========== 7. SIMILARITY FUNCTION ==========
print("\n7. üîç Creating similarity function...")

def find_similar_items_cosine(item_id, top_k=5, same_category_only=False):
    """Find similar items using cosine similarity"""
    # Find item index
    item_idx = robust_features[robust_features['item_id'] == item_id].index
    if len(item_idx) == 0:
        return []

    item_idx = item_idx[0]
    item_embedding = embeddings[item_idx].reshape(1, -1)

    # Calculate cosine similarity
    similarities = cosine_similarity(item_embedding, embeddings)[0]

    # Get indices sorted by similarity
    sorted_indices = np.argsort(similarities)[::-1]

    # Collect results
    results = []
    for idx in sorted_indices:
        if idx == item_idx:
            continue

        if same_category_only:
            item_category = robust_features.iloc[item_idx]['garment_category']
            other_category = robust_features.iloc[idx]['garment_category']
            if item_category != other_category:
                continue

        results.append({
            'item_id': int(robust_features.iloc[idx]['item_id']),
            'name': robust_features.iloc[idx]['name'],
            'category': robust_features.iloc[idx]['garment_category'],
            'formality': robust_features.iloc[idx]['garment_formality'],
            'similarity': float(similarities[idx]),
            'similarity_percent': float(similarities[idx] * 100)
        })

        if len(results) >= top_k:
            break

    return results

# Test similarity function
print("\n   Testing similarity function...")
print("-" * 50)

# Get a few sample items for testing
sample_items = robust_features.head(3)
for _, item in sample_items.iterrows():
    similar = find_similar_items_cosine(item['item_id'], top_k=2, same_category_only=True)
    if similar:
        print(f"   {item['name'][:20]:<20} ‚Üí {similar[0]['name'][:20]:<20} ({similar[0]['similarity_percent']:.1f}%)")

# ========== 8. CREATE FINAL EMBEDDINGS DATAFRAME ==========
print("\n8. üíæ Creating final embeddings dataframe...")

# Create embeddings dataframe
embeddings_df = pd.DataFrame(embeddings)
embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]

# Add metadata
metadata_cols = ['item_id', 'name', 'garment_category', 'garment_formality',
                 'price', 'total_stock', 'embedding_cluster']

for col in metadata_cols:
    if col in robust_features.columns:
        embeddings_df[col] = robust_features[col].values

# Add embedding quality metrics
embeddings_df['embedding_norm'] = np.linalg.norm(embeddings, axis=1)
embeddings_df['embedding_magnitude'] = embeddings_df['embedding_norm']

print(f"   ‚úÖ Embeddings dataframe: {embeddings_df.shape}")
print(f"   Embedding dimensions: {embeddings.shape[1]}")
print(f"   Items with embeddings: {len(embeddings_df)}")

# ========== 9. SAVE ALL ARTIFACTS ==========
print("\n9. üíæ Saving all artifacts...")

# Create embeddings directory
EMBEDDINGS_DIR = ARTIFACTS_DIR / 'embeddings'
EMBEDDINGS_DIR.mkdir(exist_ok=True)

# Save embeddings dataframe
embeddings_df.to_pickle(EMBEDDINGS_DIR / 'item_embeddings.pkl')
print("   ‚úÖ Saved item_embeddings.pkl")

# Save embeddings array
np.save(EMBEDDINGS_DIR / 'embeddings_array.npy', embeddings)
print("   ‚úÖ Saved embeddings_array.npy")

# Save PCA model
with open(EMBEDDINGS_DIR / 'pca_model.pkl', 'wb') as f:
    pickle.dump(pca, f)
print("   ‚úÖ Saved pca_model.pkl")

# Save KMeans model
with open(EMBEDDINGS_DIR / 'kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
print("   ‚úÖ Saved kmeans_model.pkl")

# Save scaler
with open(EMBEDDINGS_DIR / 'scaler_pca.pkl', 'wb') as f:
    pickle.dump(scaler_pca, f)
print("   ‚úÖ Saved scaler_pca.pkl")

# Save robust features
robust_features.to_pickle(EMBEDDINGS_DIR / 'robust_features.pkl')
print("   ‚úÖ Saved robust_features.pkl")

# Save similarity function test
test_results = {}
for _, item in robust_features.head(5).iterrows():
    similar = find_similar_items_cosine(item['item_id'], top_k=3, same_category_only=True)
    test_results[item['item_id']] = {
        'name': item['name'],
        'similar_items': similar
    }

with open(EMBEDDINGS_DIR / 'similarity_test_results.pkl', 'wb') as f:
    pickle.dump(test_results, f)
print("   ‚úÖ Saved similarity test results")

# Save summary
summary = {
    'step': 3,
    'method': 'PCA with weighted feature engineering',
    'embeddings_info': {
        'total_items': len(embeddings_df),
        'embedding_dimensions': embeddings.shape[1],
        'original_features': len(all_features),
        'explained_variance_ratio': float(pca.explained_variance_ratio_.sum()),
        'average_cluster_purity': float(avg_purity)
    },
    'features_used': {
        'total_features': len(all_features),
        'category_features': len([f for f in all_features if 'cat_' in f]),
        'formality_features': len([f for f in all_features if 'form_' in f]),
        'encoded_features': len(encoded_cols),
        'scaled_features': len(scaled_cols)
    },
    'quality_metrics': {
        'cluster_purity': float(avg_purity),
        'embedding_quality': 'good' if avg_purity > 0.6 else 'acceptable',
        'category_separation': len(embeddings_df['garment_category'].unique()),
        'formality_levels': len(embeddings_df['garment_formality'].unique())
    }
}

with open(EMBEDDINGS_DIR / 'embeddings_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("   ‚úÖ Saved embeddings summary")

# ========== 10. FINAL RESULTS ==========
print("\n10. üìä FINAL RESULTS")
print("=" * 60)

print(f"\nüéØ METHOD: PCA with weighted feature engineering")
print(f"üìà Embedding Dimensions: {embeddings.shape[1]}")
print(f"üìä Explained Variance: {pca.explained_variance_ratio_.sum():.1%}")
print(f"üè∑Ô∏è  Cluster Purity: {avg_purity:.1%}")
print(f"üî¢ Total Items: {len(embeddings_df)}")

print(f"\nüìã Feature Engineering:")
print(f"   - Total features used: {len(all_features)}")
print(f"   - Category features: {len([f for f in all_features if 'cat_' in f])}")
print(f"   - Formality features: {len([f for f in all_features if 'form_' in f])}")
print(f"   - Encoded features: {len(encoded_cols)}")
print(f"   - Scaled features: {len(scaled_cols)}")

print(f"\nüìÅ Output Files (saved to ./artifacts/embeddings/):")
for file in EMBEDDINGS_DIR.glob('*.pkl'):
    print(f"   ‚Ä¢ {file.name}")
print(f"   ‚Ä¢ embeddings_array.npy")

for file in EMBEDDINGS_DIR.glob('*.pkl'):
    print(f"   ‚Ä¢ {file.name}")
print(f"   ‚Ä¢ embeddings_array.npy")

print(f"\nüé≠ Category Distribution:")
category_dist = embeddings_df['garment_category'].value_counts()
for cat, count in category_dist.items():
    print(f"   {cat:<15}: {count:>3} items")

print("\n" + "=" * 60)

# Quality assessment
if avg_purity > 0.7:
    print("‚úÖ EXCELLENT EMBEDDINGS - READY FOR PRODUCTION!")
elif avg_purity > 0.6:
    print("‚úÖ VERY GOOD EMBEDDINGS - READY FOR NEXT STEP!")
elif avg_purity > 0.5:
    print("‚úÖ GOOD EMBEDDINGS - READY FOR USE")
elif avg_purity > 0.4:
    print("‚ö†Ô∏è  ACCEPTABLE EMBEDDINGS - PROCEED WITH CAUTION")
else:
    print("‚ùå POOR EMBEDDINGS - NEEDS REVISION")

print(f"\nüéØ Proceed to Step 4: Size Recommendation Engine")
print("=" * 60)

# Show sample embeddings
print("\nüìã SAMPLE EMBEDDINGS (first 3 items):")
print("-" * 50)
sample = embeddings_df.head(3)
for _, row in sample.iterrows():
    print(f"\n{row['name'][:30]:<30} ({row['garment_category']}/{row['garment_formality']})")
    print(f"  Cluster: {row['embedding_cluster']}")
    print(f"  Embedding norm: {row['embedding_norm']:.2f}")
    emb_values = [row[f'embedding_{i}'] for i in range(3)]  # First 3 dimensions
    print(f"  First 3 dims: {[round(v, 3) for v in emb_values]}")

print("\n" + "=" * 60)
print("üéâ STEP 3 COMPLETE WITH PCA EMBEDDINGS!")
print("‚úÖ Ready for size recommendation engine")
print("=" * 60)

üéØ STEP 3: Create Item Embeddings (CORRECTED - LOCAL VERSION)
‚úÖ All libraries imported
üìÅ Working directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai
üìÅ Artifacts directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai\artifacts

1. üîÑ Loading data from Step 2...
   Looking for Step 2 artifacts...
   Available files:
   - complete_outfit_system.pkl
   - complete_size_system.pkl
   - embeddings
   - features_df.pkl
   - feature_encoders.pkl
   - feature_matrix.pkl
   - feature_metadata.pkl
   - hybrid_recommender.pkl
   - intelligent_outfit_builder.pkl
   - intelligent_outfit_builder_fixed_names.pkl
   - measurement_mapping.pkl
   - raw_measurements.pkl
   - scaler.pkl
   - size_recommender_v2.pkl
   - size_system_summary.pkl
   - user_108.json
   - user_114.json
   - user_116.json
   - user_212.json
   - user_213.json
   - user_214.json
   - user_215.json
   - user_216.json
   - user_217.json
   - user_317.json
   - original_ite

In [5]:
# @title üìè **STEP 4: Size Recommendation Engine with Real Measurements (LOCAL FIXED VERSION)**
print("üìè STEP 4: Size Recommendation Engine with Real Measurements (LOCAL FIXED VERSION)")
print("=" * 60)

import sys
import os

current_dir = os.getcwd()  # Get current working directory
sys.path.insert(0, current_dir)

import ai_module




import pandas as pd
import numpy as np
import json
import pickle
import re
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported")


# ========== CONFIGURATION ==========
# Use same directory structure as Steps 1-3
ARTIFACTS_DIR = Path('./artifacts')
ORIGINAL_DATA_PATH = Path('./original_items.pkl')  # From Step 1

print(f"üìÅ Working directory: {Path.cwd()}")
print(f"üìÅ Artifacts directory: {ARTIFACTS_DIR.absolute()}")
print(f"üìÅ Original data: {ORIGINAL_DATA_PATH.absolute()}")

# ========== 1. LOAD DATA FROM PREVIOUS STEPS ==========
print("\n1. üîÑ Loading data from previous steps...")

# Load original data from Step 1
if not ORIGINAL_DATA_PATH.exists():
    print(f"   ‚ùå Original data not found at {ORIGINAL_DATA_PATH}")
    # Try alternative paths
    fallback_paths = [
        Path('original_items.pkl'),
        Path('./original_items.pkl'),
        Path('./artifacts/original_items.pkl')
    ]
    for path in fallback_paths:
        if path.exists():
            ORIGINAL_DATA_PATH = path
            print(f"   ‚úÖ Found at: {path}")
            break
    
    if not ORIGINAL_DATA_PATH.exists():
        raise FileNotFoundError(f"Could not find original_items.pkl")

original_df = pd.read_pickle(ORIGINAL_DATA_PATH)
print(f"   ‚úÖ Loaded original data: {original_df.shape}")

# Load features from Step 2
features_df_path = ARTIFACTS_DIR / 'features_df.pkl'
if features_df_path.exists():
    features_df = pd.read_pickle(features_df_path)
    print(f"   ‚úÖ Loaded features from Step 2: {features_df.shape}")
else:
    print(f"   ‚ö†Ô∏è features_df.pkl not found")
    features_df = None

# Load embeddings from Step 3
embeddings_path = ARTIFACTS_DIR / 'embeddings' / 'item_embeddings.pkl'
if embeddings_path.exists():
    embeddings_df = pd.read_pickle(embeddings_path)
    print(f"   ‚úÖ Loaded embeddings from Step 3: {embeddings_df.shape}")
else:
    print(f"   ‚ö†Ô∏è embeddings not found")
    embeddings_df = None

# ========== 2. PARSE YOUR REAL DATA FORMAT ==========
print("\n2. üîß Parsing your actual sizing data format...")

def parse_actual_sizing_data(sizing_str):
    """
    Parse the actual format from your database
    Example: {"garment_type":"t_shirt","measurements_cm":{"XS":{"chest_circumference":86,...}}}
    """
    if pd.isna(sizing_str) or not isinstance(sizing_str, str):
        return {'garment_type': 'unknown', 'measurements': {}, 'has_measurements': False}
    
    try:
        # The data is already JSON from Step 1 processing
        if sizing_str.strip().startswith('{'):
            data = json.loads(sizing_str)
            
            result = {
                'garment_type': data.get('garment_type', 'unknown'),
                'measurements': data.get('measurements_cm', {}),
                'fit_characteristics': data.get('fit_characteristics', {}),
                'size_system': data.get('size_system', 'US'),
                'has_measurements': bool(data.get('measurements_cm', {}))
            }
            return result
        else:
            return {'garment_type': 'unknown', 'measurements': {}, 'has_measurements': False}
            
    except json.JSONDecodeError as e:
        print(f"   Warning: JSON decode error: {e}")
        return {'garment_type': 'unknown', 'measurements': {}, 'has_measurements': False}
    except Exception as e:
        print(f"   Warning: Other error: {e}")
        return {'garment_type': 'unknown', 'measurements': {}, 'has_measurements': False}

# Test parsing with actual data
print("   Testing parsing with actual data...")
sample_data = original_df.iloc[0]['Sizing Data'] if 'Sizing Data' in original_df.columns else ''
print(f"   Sample data: {str(sample_data)[:100]}...")
parsed_sample = parse_actual_sizing_data(sample_data)
print(f"   Parsed: garment_type='{parsed_sample['garment_type']}', "
      f"has_measurements={parsed_sample['has_measurements']}, "
      f"sizes={list(parsed_sample.get('measurements', {}).keys())[:3]}...")

# ========== 3. EXTRACT ALL MEASUREMENTS ==========
print("\n3. üìä Extracting all measurements...")

all_measurements = []
items_with_measurements = 0

for idx, row in original_df.iterrows():
    item_id = row.get('ID', idx + 1)
    item_name = row.get('Name', f'Item {item_id}')
    
    # Get sizing data
    sizing_str = row.get('Sizing Data', '')
    parsed_data = parse_actual_sizing_data(sizing_str)
    
    if parsed_data.get('has_measurements', False):
        measurements = parsed_data.get('measurements', {})
        
        for size, size_measurements in measurements.items():
            if isinstance(size_measurements, dict):
                record = {
                    'item_id': item_id,
                    'item_name': item_name,
                    'garment_type': parsed_data['garment_type'],
                    'size': size,
                    'fit_type': parsed_data.get('fit_characteristics', {}).get('fit_type', 'regular'),
                    'ease': parsed_data.get('fit_characteristics', {}).get('ease', 'standard'),
                    'stretch': parsed_data.get('fit_characteristics', {}).get('stretch', 'medium'),
                    'size_system': parsed_data.get('size_system', 'US')
                }
                
                # Add all measurements as separate columns
                for key, value in size_measurements.items():
                    if isinstance(value, (int, float)):
                        record[key] = float(value)
                    elif isinstance(value, str):
                        # Try to convert string numbers
                        try:
                            record[key] = float(value)
                        except:
                            record[key] = value
                    else:
                        record[key] = value
                
                all_measurements.append(record)
        
        items_with_measurements += 1

# Create measurement database
if all_measurements:
    measurement_db = pd.DataFrame(all_measurements)
    print(f"   ‚úÖ Created measurement database: {measurement_db.shape}")
    print(f"   Items with measurements: {items_with_measurements}/{len(original_df)}")
    print(f"   Total size records: {len(measurement_db)}")
    
    # Show what we found
    print(f"\n   üìã Measurement Database Summary:")
    print(f"   - Unique garment types: {measurement_db['garment_type'].nunique()}")
    print(f"   - Unique sizes: {measurement_db['size'].nunique()}")
    print(f"   - Available measurements: {[col for col in measurement_db.columns if col not in ['item_id', 'item_name', 'garment_type', 'size', 'fit_type', 'ease', 'stretch', 'size_system']]}")
    
    # Show sample
    print(f"\n   üìã Sample records:")
    sample = measurement_db.head(3)
    for _, row in sample.iterrows():
        measurements = {k: v for k, v in row.items() if k not in ['item_id', 'item_name', 'garment_type', 'size', 'fit_type', 'ease', 'stretch', 'size_system'] and isinstance(v, (int, float))}
        print(f"   Item {row['item_id']} ({row['garment_type']} - Size {row['size']}): {list(measurements.keys())[:3]}...")
else:
    measurement_db = pd.DataFrame()
    print("   ‚ùå No measurements extracted")

# ========== 4. BUILD SIZE RECOMMENDER ==========
print("\n4. ü§ñ Building Size Recommender...")
SizeRecommenderV2 = ai_module.SizeRecommenderV2



# ========== 5. BUILD AND TRAIN THE RECOMMENDER ==========
print("\n5. üöÄ Building and training Size Recommender...")

if not measurement_db.empty:
    # Initialize recommender
    size_recommender = SizeRecommenderV2()
    size_recommender.load_data(measurement_db, original_df)
    
    print(f"\n   ‚úÖ Recommender built successfully!")
    print(f"   Available garment types: {len(size_recommender.get_garment_types())}")
    
    # Save the recommender
    recommender_path = ARTIFACTS_DIR / 'size_recommender_v2.pkl'
    with open(recommender_path, 'wb') as f:
        pickle.dump(size_recommender, f)
    print(f"   üíæ Saved to: {recommender_path}")
else:
    print("   ‚ùå No measurement data available")
    size_recommender = None

# ========== 6. CREATE HYBRID SYSTEM WITH EMBEDDINGS ==========
print("\n6. üîó Creating Hybrid Recommendation System...")

class HybridRecommender:
    """Combine size recommendations with style embeddings"""
    
    def __init__(self, size_recommender, embeddings_df=None):
        self.size_recommender = size_recommender
        self.embeddings_df = embeddings_df
        self.item_embeddings = None
        self.item_ids = None
        
        # Load embeddings if available
        if embeddings_df is not None:
            self._load_embeddings()
    
    def _load_embeddings(self):
        """Load and prepare embeddings"""
        # Extract embedding columns
        embed_cols = [col for col in self.embeddings_df.columns if col.startswith('embedding_')]
        if embed_cols:
            self.item_embeddings = self.embeddings_df[embed_cols].values
            
            # Get item IDs - ensure they're integers
            if 'item_id' in self.embeddings_df.columns:
                self.item_ids = self.embeddings_df['item_id'].values.astype(int)
            else:
                # Create sequential IDs if not present
                self.item_ids = np.arange(len(self.item_embeddings))
                
            print(f"   ‚úÖ Loaded {len(self.item_embeddings)} embeddings")
            print(f"   Item IDs range: {self.item_ids.min()} to {self.item_ids.max()}")
    
    def hybrid_recommend(self, user_measurements, garment_type, 
                        style_preference=None, top_k=5, size_weight=0.7, style_weight=0.3):
        """
        Hybrid recommendation combining size fit and style similarity
        
        Args:
            user_measurements: User body measurements
            garment_type: Type of garment
            style_preference: Optional preferred item ID (integer)
            top_k: Number of recommendations
            size_weight: Weight for size fit (0-1)
            style_weight: Weight for style similarity (0-1)
        """
        print(f"\n   üîç Hybrid recommendation for {garment_type}...")
        
        # Get size-based recommendations
        size_recommendations = self.size_recommender.find_best_fitting_items(
            user_measurements, garment_type, top_k=top_k * 2
        )
        
        if not size_recommendations:
            print("   ‚ö†Ô∏è No size-based recommendations found")
            return []
        
        # If no style preference or embeddings not available, just return size recommendations
        if style_preference is None or self.item_embeddings is None:
            print("   ‚ÑπÔ∏è  No style preference or embeddings available, returning size-only recommendations")
            return size_recommendations[:top_k]
        
        # Calculate style similarity
        style_scores = self._calculate_style_similarity(style_preference)
        
        # If no style scores, return size-only
        if not style_scores:
            print("   ‚ÑπÔ∏è  Could not calculate style scores, returning size-only recommendations")
            return size_recommendations[:top_k]
        
        # Combine scores
        final_recommendations = []
        for rec in size_recommendations:
            item_id = rec['item_id']
            
            # Get style score if available
            style_score = style_scores.get(item_id, 0.5)  # Default to neutral
            
            # Combine scores
            combined_score = (rec['overall_fit_score'] * size_weight + 
                            style_score * style_weight)
            
            final_rec = rec.copy()
            final_rec['style_similarity'] = style_score
            final_rec['combined_score'] = combined_score
            final_recommendations.append(final_rec)
        
        # Sort by combined score
        final_recommendations.sort(key=lambda x: x['combined_score'], reverse=True)
        
        return final_recommendations[:top_k]
    
    def _calculate_style_similarity(self, style_preference):
        """Calculate style similarity scores for all items"""
        if self.item_embeddings is None:
            return {}
        
        # Ensure style_preference is an integer (item ID)
        try:
            style_item_id = int(style_preference)
        except (ValueError, TypeError):
            print(f"   ‚ö†Ô∏è Style preference must be an item ID (integer), got: {style_preference}")
            return {}
        
        # Find the embedding for this item ID
        idx_mask = self.item_ids == style_item_id
        if not np.any(idx_mask):
            print(f"   ‚ö†Ô∏è Item ID {style_item_id} not found in embeddings")
            return {}
        
        idx = np.where(idx_mask)[0][0]
        target_embedding = self.item_embeddings[idx]
        
        # Reshape to 2D for cosine_similarity
        if len(target_embedding.shape) == 1:
            target_embedding = target_embedding.reshape(1, -1)
        
        # Calculate cosine similarity
        from sklearn.metrics.pairwise import cosine_similarity
        
        similarities = cosine_similarity(target_embedding, self.item_embeddings)[0]
        
        # Create dictionary of item_id -> similarity
        style_scores = {}
        for item_id, similarity in zip(self.item_ids, similarities):
            style_scores[int(item_id)] = float(similarity)
        
        return style_scores

# Create hybrid recommender
print("\n   Creating hybrid system...")
hybrid_recommender = HybridRecommender(size_recommender, embeddings_df)

# Save hybrid system
hybrid_path = ARTIFACTS_DIR / 'hybrid_recommender.pkl'
with open(hybrid_path, 'wb') as f:
    pickle.dump(hybrid_recommender, f)
print(f"   üíæ Saved hybrid recommender to: {hybrid_path}")

# ========== 7. TEST THE SYSTEM ==========
print("\n7. üß™ Testing the system...")
print("=" * 60)

# Test user measurements
test_user = {
    'chest_circumference': 95,      # Medium chest
    'waist_circumference': 82,      # Medium waist
    'garment_length': 75,          # Average length
    'sleeve_length': 62,           # Average sleeve
    'shoulder_width': 45,          # Average shoulder
    'inseam_length': 78,           # Average inseam
    'hips_circumference': 96       # Medium hips
}

print(f"\nüë§ TEST USER:")
for key, value in test_user.items():
    print(f"  {key}: {value}cm")

# Test cases
test_cases = [
    ('t_shirt', 'Expected: Size M for 95cm chest'),
    ('regular_jeans', 'Expected: Size M for 82cm waist'),
    ('slim_pants', 'Expected: Size M or L'),
]

print(f"\nüß™ TESTING SIZE RECOMMENDATIONS:")
print("-" * 50)

if size_recommender:
    for garment_type, expectation in test_cases:
        print(f"\n{garment_type.upper()}:")
        print(f"  Expectation: {expectation}")
        
        recommendations = size_recommender.find_best_fitting_items(
            test_user, garment_type, top_k=2
        )
        
        if recommendations:
            best = recommendations[0]
            print(f"  ‚úÖ RECOMMENDED: Size {best['recommended_size']}")
            print(f"     Item: {best['item_name']}")
            print(f"     Fit Score: {best['overall_fit_score']:.2f} ({best['fit_assessment']})")
            print(f"     Price: ${best['price']:.2f}")
            
            # Show key measurements
            if best.get('key_measurements'):
                print(f"     Key Measurements:")
                for meas, info in list(best['key_measurements'].items())[:3]:
                    diff_word = "smaller" if info['difference'] < 0 else "larger"
                    print(f"       ‚Ä¢ {meas}: {info['assessment']} "
                          f"(User is {abs(info['difference']):.1f}cm {diff_word})")
        else:
            print(f"  ‚ö†Ô∏è  No recommendations found")

# Test hybrid recommendations if embeddings available
if hybrid_recommender and embeddings_df is not None:
    print(f"\nüß™ TESTING HYBRID RECOMMENDATIONS:")
    print("-" * 50)
    
    # Use first item as style preference
    style_item_id = embeddings_df.iloc[0]['item_id']
    style_item_name = embeddings_df.iloc[0]['name']
    
    print(f"\nStyle Preference: {style_item_name}")
    
    hybrid_recs = hybrid_recommender.hybrid_recommend(
        test_user, 't_shirt', 
        style_preference=style_item_id,
        top_k=2
    )
    
    if hybrid_recs:
        print(f"\nHybrid Recommendations:")
        for i, rec in enumerate(hybrid_recs, 1):
            print(f"{i}. {rec['item_name']}")
            print(f"   Size: {rec['recommended_size']}, "
                  f"Fit: {rec['overall_fit_score']:.2f}, "
                  f"Style: {rec['style_similarity']:.2f}, "
                  f"Combined: {rec['combined_score']:.2f}")

# ========== 8. SAVE COMPLETE SYSTEM ==========
print("\n8. üíæ Saving complete system...")
print("=" * 60)

# Create system package
system_package = {
    'size_recommender': size_recommender,
    'hybrid_recommender': hybrid_recommender,
    'measurement_database': measurement_db,
    'test_user': test_user,
    'test_results': {
        't_shirt': size_recommender.find_best_fitting_items(test_user, 't_shirt', top_k=3) if size_recommender else []
    }
}

# Save complete system
complete_path = ARTIFACTS_DIR / 'complete_size_system.pkl'
with open(complete_path, 'wb') as f:
    pickle.dump(system_package, f)

print(f"   ‚úÖ Complete system saved to: {complete_path}")

# Create summary
summary = {
    'system_info': {
        'total_items': len(original_df),
        'items_with_measurements': items_with_measurements,
        'measurement_records': len(measurement_db),
        'garment_types': measurement_db['garment_type'].nunique() if not measurement_db.empty else 0,
        'available_sizes': measurement_db['size'].nunique() if not measurement_db.empty else 0,
        'has_embeddings': embeddings_df is not None,
        'has_hybrid': hybrid_recommender is not None
    },
    'file_locations': {
        'size_recommender': str(ARTIFACTS_DIR / 'size_recommender_v2.pkl'),
        'hybrid_recommender': str(ARTIFACTS_DIR / 'hybrid_recommender.pkl'),
        'complete_system': str(complete_path),
        'original_data': str(ORIGINAL_DATA_PATH)
    }
}

summary_path = ARTIFACTS_DIR / 'size_system_summary.pkl'
with open(summary_path, 'wb') as f:
    pickle.dump(summary, f)

print(f"   ‚úÖ System summary saved to: {summary_path}")

print("\nüìä SYSTEM SUMMARY:")
print("-" * 30)
print(f"‚Ä¢ Total items: {summary['system_info']['total_items']}")
print(f"‚Ä¢ Items with measurements: {summary['system_info']['items_with_measurements']}")
print(f"‚Ä¢ Measurement records: {summary['system_info']['measurement_records']}")
print(f"‚Ä¢ Unique garment types: {summary['system_info']['garment_types']}")
print(f"‚Ä¢ Available sizes: {summary['system_info']['available_sizes']}")
print(f"‚Ä¢ Has embeddings: {summary['system_info']['has_embeddings']}")
print(f"‚Ä¢ Has hybrid system: {summary['system_info']['has_hybrid']}")

print("\nüéØ FEATURES:")
print("-" * 30)
print("1. ‚úÖ Item-first size recommendations")
print("2. ‚úÖ Detailed fit scoring per measurement")
print("3. ‚úÖ Garment-type specific measurements")
print("4. ‚úÖ Human-readable fit assessments")
print("5. ‚úÖ Hybrid style + size recommendations")
print("6. ‚úÖ Integration with Step 3 embeddings")
print("7. ‚úÖ All files saved locally")
# Test that ai_module is imported correctly
print(f"‚úÖ Imported ai_module from: {ai_module.__file__}")
print(f"‚úÖ Available classes: {ai_module.__all__}")

print("\n" + "=" * 60)
print("‚úÖ STEP 4 COMPLETE - SIZE RECOMMENDATION SYSTEM READY!")
print("=" * 60)

# ========== 9. USAGE EXAMPLE ==========
print("\nüìñ QUICK USAGE GUIDE:")
print("=" * 60)

print("""
HOW TO USE THE SYSTEM:

1. Get size-only recommendations:
   recommendations = size_recommender.find_best_fitting_items(
       user_measurements,
       't_shirt',  # garment type
       top_k=5
   )

2. Get hybrid recommendations (style + size):
   hybrid_recs = hybrid_recommender.hybrid_recommend(
       user_measurements,
       't_shirt',
       style_preference=item_id,  # or embedding vector
       top_k=5,
       size_weight=0.7,  # balance between fit and style
       style_weight=0.3
   )

3. Get garment statistics:
   stats = size_recommender.get_garment_stats('t_shirt')

4. Load saved system:
   with open('artifacts/complete_size_system.pkl', 'rb') as f:
       system = pickle.load(f)
   size_recommender = system['size_recommender']

EXAMPLE:
--------
user = {
    'chest_circumference': 95,
    'waist_circumference': 82,
    'sleeve_length': 62
}

# Size-only
recs = size_recommender.find_best_fitting_items(user, 't_shirt', top_k=3)

# Hybrid (prefer items similar to item_id 123)
hybrid = hybrid_recommender.hybrid_recommend(
    user, 't_shirt', 
    style_preference=123,
    top_k=3
)
""")

üìè STEP 4: Size Recommendation Engine with Real Measurements (LOCAL FIXED VERSION)
‚úÖ Libraries imported
üìÅ Working directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai
üìÅ Artifacts directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai\artifacts
üìÅ Original data: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai\original_items.pkl

1. üîÑ Loading data from previous steps...
   ‚úÖ Loaded original data: (250, 14)
   ‚úÖ Loaded features from Step 2: (250, 314)
   ‚úÖ Loaded embeddings from Step 3: (250, 41)

2. üîß Parsing your actual sizing data format...
   Testing parsing with actual data...
   Sample data: {"garment_type":"t_shirt","measurements_cm":{"XS":{"chest_circumference":"90","garment_length":"71",...
   Parsed: garment_type='t_shirt', has_measurements=True, sizes=['XS', 'S', 'M']...

3. üìä Extracting all measurements...
   ‚úÖ Created measurement database: (1500, 28)
   Items with measurements: 250

In [6]:
# @title üëó **STEP 5: Intelligent Outfit Builder (LOCAL COMPATIBLE VERSION)**
print("üëó STEP 5: Intelligent Outfit Builder (LOCAL COMPATIBLE VERSION)")
print("=" * 60)

import sys
import os

current_dir = os.getcwd()  # Get current working directory
sys.path.insert(0, current_dir)

import ai_module

import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported")

# ========== 1. LOAD DATA FROM STEPS 3 & 4 (LOCAL VERSION) ==========
print("\n1. üîÑ Loading data from Steps 3 & 4 (LOCAL)...")

# Define local paths
ARTIFACTS_DIR = Path('./artifacts')
EMBEDDINGS_DIR = ARTIFACTS_DIR / 'embeddings'
ORIGINAL_DATA_PATH = Path('./original_items.pkl')

print(f"üìÅ Working directory: {Path.cwd()}")
print(f"üìÅ Artifacts directory: {ARTIFACTS_DIR.absolute()}")

# Load item embeddings from Step 3 (local version)
print("   Loading item embeddings from Step 3...")
try:
    embeddings_path = EMBEDDINGS_DIR / 'item_embeddings.pkl'
    if embeddings_path.exists():
        item_embeddings_df = pd.read_pickle(embeddings_path)
        print(f"   ‚úÖ Loaded item embeddings: {item_embeddings_df.shape}")
        print(f"   Columns: {list(item_embeddings_df.columns)[:5]}...")
    else:
        print(f"   ‚ùå Item embeddings not found at {embeddings_path}")
        item_embeddings_df = pd.DataFrame()
except Exception as e:
    print(f"   ‚ùå Error loading item embeddings: {e}")
    item_embeddings_df = pd.DataFrame()

# Load robust features from Step 3
print("\n   Loading robust features from Step 3...")
try:
    robust_features_path = EMBEDDINGS_DIR / 'robust_features.pkl'
    if robust_features_path.exists():
        robust_features = pd.read_pickle(robust_features_path)
        print(f"   ‚úÖ Loaded robust features: {robust_features.shape}")
    else:
        print(f"   ‚ö†Ô∏è Robust features not found")
        robust_features = pd.DataFrame()
except Exception as e:
    print(f"   ‚ùå Error loading robust features: {e}")
    robust_features = pd.DataFrame()

# Load original items (for metadata)
print("\n   Loading original items...")
if ORIGINAL_DATA_PATH.exists():
    original_df = pd.read_pickle(ORIGINAL_DATA_PATH)
    print(f"   ‚úÖ Loaded original items: {original_df.shape}")
    
    # Rename ID column for consistency
    if 'ID' in original_df.columns:
        original_df.rename(columns={'ID': 'item_id'}, inplace=True)
else:
    print(f"   ‚ö†Ô∏è Original items not found at {ORIGINAL_DATA_PATH}")
    original_df = pd.DataFrame()

# Load size recommender from Step 4
print("\n   Loading size recommender from Step 4...")
try:
    size_recommender_path = ARTIFACTS_DIR / 'size_recommender_v2.pkl'
    if size_recommender_path.exists():
        with open(size_recommender_path, 'rb') as f:
            size_recommender = pickle.load(f)
        print("   ‚úÖ Loaded size recommender")
    else:
        print(f"   ‚ö†Ô∏è Size recommender not found at {size_recommender_path}")
        size_recommender = None
except Exception as e:
    print(f"   ‚ö†Ô∏è Could not load size recommender: {e}")
    size_recommender = None

# ========== 2. CREATE UNIFIED ITEM DATABASE ==========
print("\n2. üõ†Ô∏è Creating unified item database...")

# Start with embeddings as base (they contain essential metadata)
if not item_embeddings_df.empty:
    unified_df = item_embeddings_df.copy()
    print(f"   Starting with embeddings: {unified_df.shape}")
    
    # Ensure we have critical columns
    if 'item_id' not in unified_df.columns:
        if 'item_id' in robust_features.columns:
            unified_df['item_id'] = robust_features['item_id']
        else:
            unified_df['item_id'] = unified_df.index + 1
    
    # Add missing metadata from original data
    if not original_df.empty:
        # Keep only essential columns from original
        original_metadata = original_df[['item_id', 'Name', 'Description', 'Store', 'Category']].copy()
        original_metadata.rename(columns={
            'Name': 'name',
            'Description': 'description',
            'Store': 'store',
            'Category': 'category'
        }, inplace=True)
        
        # Merge with unified_df
        unified_df = pd.merge(
            unified_df,
            original_metadata,
            on='item_id',
            how='left'
        )
        print(f"   ‚úÖ Added metadata from original data")
    
    # Add garment_type from robust_features
    if not robust_features.empty and 'garment_type' in robust_features.columns:
        garment_info = robust_features[['item_id', 'garment_type', 'garment_category', 'garment_formality']].copy()
        unified_df = pd.merge(
            unified_df,
            garment_info,
            on='item_id',
            how='left'
        )
        print(f"   ‚úÖ Added garment info from robust features")
    
else:
    # Fallback: create from original data
    print("   ‚ö†Ô∏è No embeddings found, creating from original data")
    unified_df = original_df.copy()
    
    # Rename columns for consistency
    rename_map = {
        'Name': 'name',
        'Description': 'description',
        'Store': 'store',
        'Category': 'category',
        'Garment Type': 'garment_type',
        'Price': 'price',
        'Total Stock': 'total_stock'
    }
    
    for old_col, new_col in rename_map.items():
        if old_col in unified_df.columns:
            unified_df[new_col] = unified_df[old_col]
    
    # Create item_id if missing
    if 'item_id' not in unified_df.columns:
        unified_df['item_id'] = unified_df.index + 1

# Ensure essential columns exist
print("\n   Ensuring essential columns...")
required_columns = {
    'name': 'Item {item_id}',
    'garment_type': 'unknown',
    'garment_category': 'other',
    'garment_formality': 'casual',
    'price': 0.0,
    'description': '',
    'store': 'unknown',
    'category': 'unknown'
}

for col, default in required_columns.items():
    if col not in unified_df.columns:
        if col == 'name':
            unified_df[col] = unified_df.apply(lambda x: default.format(item_id=x['item_id']), axis=1)
        elif col == 'price':
            # Try to find price from any source
            if 'Price' in unified_df.columns:
                unified_df[col] = pd.to_numeric(unified_df['Price'], errors='coerce').fillna(0.0)
            else:
                unified_df[col] = default
        else:
            unified_df[col] = default

print(f"   ‚úÖ Unified database: {unified_df.shape}")
print(f"   Columns: {list(unified_df.columns)[:10]}...")

# ========== 3. CREATE EMBEDDINGS FOR SIMILARITY ==========
print("\n3. ü§ñ Creating embeddings for similarity search...")

# Extract embedding columns
embedding_cols = [col for col in unified_df.columns if col.startswith('embedding_')]
print(f"   Found {len(embedding_cols)} embedding columns")

if embedding_cols:
    # Create embeddings dictionary
    item_embeddings_dict = {}
    for idx, row in unified_df.iterrows():
        item_id = str(int(row['item_id']))  # Ensure string ID
        embeddings = row[embedding_cols].values.astype(np.float32)
        item_embeddings_dict[item_id] = embeddings

    print(f"   Created embeddings for {len(item_embeddings_dict)} items")
else:
    print("   ‚ö†Ô∏è No embedding columns found, creating simple embeddings...")
    
    # Create simple embeddings based on categories and price
    item_embeddings_dict = {}
    for idx, row in unified_df.iterrows():
        item_id = str(int(row['item_id']))
        
        # Create 10-dimensional embedding
        embedding = np.zeros(10, dtype=np.float32)
        
        # Encode garment category (simple one-hot)
        categories = ['top', 'bottom', 'dress', 'outerwear', 'footwear', 
                     'swimwear', 'underwear', 'socks', 'other']
        category = row.get('garment_category', 'other')
        if category in categories:
            embedding[categories.index(category)] = 1.0
        
        # Encode price (scaled)
        price = float(row.get('price', 0))
        embedding[9] = np.log1p(price) / 10.0  # Log scale for price
        
        item_embeddings_dict[item_id] = embedding
    
    print(f"   Created generated embeddings for {len(item_embeddings_dict)} items")

# ========== 4. INTELLIGENT OUTFIT BUILDER ==========
print("\n4. üé® Creating Intelligent Outfit Builder...")
IntelligentOutfitBuilder = ai_module.IntelligentOutfitBuilder


# ========== 5. BUILD AND TEST THE SYSTEM ==========
print("\n5. üöÄ Building and testing outfit builder...")

# Create outfit builder
outfit_builder = IntelligentOutfitBuilder(
    items_df=unified_df,
    item_embeddings_dict=item_embeddings_dict,
    size_recommender=size_recommender
)

print(f"\n   ‚úÖ Outfit builder created!")
print(f"   ‚Ä¢ Items loaded: {len(outfit_builder.item_metadata)}")
print(f"   ‚Ä¢ Categories: {len(set(item['garment_category'] for item in outfit_builder.item_metadata.values()))}")
print(f"   ‚Ä¢ Style themes: {len(outfit_builder.style_themes)}")

# Continue from where it left off (replace the broken section from line 763 onward):

# ========== 6. DEMONSTRATION ==========
print("\n6. üß™ DEMONSTRATION")
print("=" * 60)

# Test user measurements (same as Step 4)
test_user = {
    'chest_circumference': 95,
    'waist_circumference': 82,
    'garment_length': 75,
    'sleeve_length': 62,
    'hips_circumference': 96,
    'inseam_length': 78
}

print(f"\nüë§ TEST USER MEASUREMENTS:")
for key, value in test_user.items():
    print(f"  {key}: {value}cm")

# Find a starting item (t-shirt)
print(f"\nüîç Finding t-shirt items...")
tshirt_items = []
for item_id, metadata in outfit_builder.item_metadata.items():
    if 'tee' in metadata['garment_type'] or 't_shirt' in metadata['garment_type']:
        tshirt_items.append((item_id, metadata))

if tshirt_items:
    print(f"Found {len(tshirt_items)} t-shirt items")
    
    # Test with first t-shirt
    starting_item_id, starting_item = tshirt_items[0]
    
    # Test 1: Similar items
    print(f"\nüß™ TEST 1: Find similar items to '{starting_item['name']}'")
    similar_items = outfit_builder.find_similar_items(starting_item_id, n=3, same_category=True)
    if similar_items:
        for i, item in enumerate(similar_items, 1):
            print(f"  {i}. {item['name']} (similarity: {item['similarity']:.2f})")
    
    # Test 2: Build casual outfit
    print(f"\nüß™ TEST 2: Build 'casual_everyday' outfit")
    casual_outfit = outfit_builder.build_outfit(
        starting_item_id=starting_item_id,
        user_measurements=test_user,
        style_theme='casual_everyday',
        max_items=4
    )
    
    if casual_outfit:
        print(f"\nüéØ OUTFIT: {casual_outfit['style_theme'].upper()}")
        print(f"   Description: {casual_outfit['description']}")
        print(f"   Compatibility: {casual_outfit['compatibility_score']:.0f}/100")
        print(f"   Style Coherence: {casual_outfit['style_coherence']:.0f}/100")
        print(f"   Total Price: ${casual_outfit['total_price']:.2f}")
        
        print(f"\nüëï ITEMS ({casual_outfit['item_count']}):")
        for i, item in enumerate(casual_outfit['outfit_items'], 1):
            size_rec = casual_outfit['size_recommendations'].get(item['id'], 'Size info N/A')
            print(f"{i}. {item['name']}")
            print(f"   Type: {item['garment_type']} ({item['garment_category']})")
            print(f"   Formality: {item['formality']}")
            print(f"   Price: ${item['price']:.2f}")
            if size_rec != 'Size info N/A':
                print(f"   Recommended Size: {size_rec}")
    
    # Test 3: Build smart casual outfit
    print(f"\nüß™ TEST 3: Build 'smart_casual' outfit")
    smart_outfit = outfit_builder.build_outfit(
        starting_item_id=starting_item_id,
        user_measurements=test_user,
        style_theme='smart_casual',
        max_items=4
    )
    
    if smart_outfit:
        print(f"\nüéØ OUTFIT: {smart_outfit['style_theme'].upper()}")
        print(f"   Description: {smart_outfit['description']}")
        print(f"   Compatibility: {smart_outfit['compatibility_score']:.0f}/100")
        print(f"   Style Coherence: {smart_outfit['style_coherence']:.0f}/100")
        print(f"   Total Price: ${smart_outfit['total_price']:.2f}")
        
        print(f"\nüëï ITEMS ({smart_outfit['item_count']}):")
        for i, item in enumerate(smart_outfit['outfit_items'], 1):
            size_rec = smart_outfit['size_recommendations'].get(item['id'], 'Size info N/A')
            print(f"{i}. {item['name']}")
            print(f"   Type: {item['garment_type']} ({item['garment_category']})")
            print(f"   Formality: {item['formality']}")
            print(f"   Price: ${item['price']:.2f}")
            if size_rec != 'Size info N/A':
                print(f"   Recommended Size: {size_rec}")
    
    # Test 4: Generate multiple outfit options
    print(f"\nüß™ TEST 4: Generate multiple outfit options")
    all_outfits = outfit_builder.generate_multiple_outfits(
        starting_item_id=starting_item_id,
        user_measurements=test_user,
        n_outfits=2
    )
    
    print(f"\nGenerated {len(all_outfits)} outfit options:")
    for i, outfit in enumerate(all_outfits, 1):
        print(f"\n  Option {i}: {outfit['style_theme']}")
        print(f"     Description: {outfit['description']}")
        print(f"     Score: {outfit['compatibility_score']:.0f}/100")
        print(f"     Price: ${outfit['total_price']:.2f}")
        print(f"     Items: {len(outfit['outfit_items'])}")

else:
    print("‚ö†Ô∏è No t-shirt items found for testing")

# ========== 7. SAVE THE COMPLETE SYSTEM ==========
print("\n7. üíæ SAVING COMPLETE SYSTEM")
print("=" * 60)

# Save outfit builder
outfit_builder_path = ARTIFACTS_DIR / 'intelligent_outfit_builder.pkl'
outfit_builder.save_model(outfit_builder_path)
print(f"   ‚úÖ Outfit builder saved to: {outfit_builder_path}")

# Create complete system package
system_package = {
    'outfit_builder': outfit_builder,
    'items_data': unified_df,
    'embeddings': item_embeddings_dict,
    'test_user': test_user,
    'metadata': {
        'total_items': len(outfit_builder.item_metadata),
        'categories': len(set(item['garment_category'] for item in outfit_builder.item_metadata.values())),
        'style_themes': len(outfit_builder.style_themes),
        'has_size_recommender': size_recommender is not None
    }
}

complete_system_path = ARTIFACTS_DIR / 'complete_outfit_system.pkl'
with open(complete_system_path, 'wb') as f:
    pickle.dump(system_package, f)

print(f"   ‚úÖ Complete system saved to: {complete_system_path}")

print("\nüìä SYSTEM SUMMARY:")
print("-" * 30)
print(f"‚Ä¢ Total items: {system_package['metadata']['total_items']}")
print(f"‚Ä¢ Categories: {system_package['metadata']['categories']}")
print(f"‚Ä¢ Style themes: {system_package['metadata']['style_themes']}")
print(f"‚Ä¢ Has size recommender: {system_package['metadata']['has_size_recommender']}")

print("\nüéØ KEY FEATURES:")
print("-" * 30)
print("1. ‚úÖ Category-based compatibility rules")
print("2. ‚úÖ Formality level matching")
print("3. ‚úÖ Style theme generation (casual, smart, athletic, etc.)")
print("4. ‚úÖ Integration with Step 4 size recommendations")
print("5. ‚úÖ Multiple outfit generation")
print("6. ‚úÖ Price-aware outfit building")
print("7. ‚úÖ Compatibility scoring system")

print("\n" + "=" * 60)
print("‚úÖ STEP 5 COMPLETE - INTELLIGENT OUTFIT BUILDER READY!")
print("=" * 60)

print("\nüìñ QUICK USAGE:")
print("""1. Build an outfit:
   outfit = outfit_builder.build_outfit(
       starting_item_id='1',
       user_measurements=user_data,
       style_theme='casual_everyday',
       max_items=4
   )

2. Find similar items:
   similar = outfit_builder.find_similar_items(
       item_id='1',
       n=5,
       same_category=True
   )

3. Generate multiple outfits:
   outfits = outfit_builder.generate_multiple_outfits(
       starting_item_id='1',
       user_measurements=user_data,
       n_outfits=3
   )

4. Save/load model:
   outfit_builder.save_model('outfit_builder.pkl')
   loaded = IntelligentOutfitBuilder.load_model('outfit_builder.pkl')
""")

print("\nüéØ READY FOR PRODUCTION INTEGRATION!")
print("The system can now:")
print("‚Ä¢ Build complete outfits based on style preferences")
print("‚Ä¢ Recommend sizes for each outfit item")
print("‚Ä¢ Generate multiple outfit options")
print("‚Ä¢ Score outfit compatibility and style coherence")

üëó STEP 5: Intelligent Outfit Builder (LOCAL COMPATIBLE VERSION)
‚úÖ Libraries imported

1. üîÑ Loading data from Steps 3 & 4 (LOCAL)...
üìÅ Working directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai
üìÅ Artifacts directory: c:\Users\Rana\OneDrive\Desktop\FitFast FYP\fitfast\frontend\src\ai\artifacts
   Loading item embeddings from Step 3...
   ‚úÖ Loaded item embeddings: (250, 41)
   Columns: ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4']...

   Loading robust features from Step 3...
   ‚úÖ Loaded robust features: (250, 334)

   Loading original items...
   ‚úÖ Loaded original items: (250, 14)

   Loading size recommender from Step 4...
   ‚úÖ Loaded size recommender

2. üõ†Ô∏è Creating unified item database...
   Starting with embeddings: (250, 41)
   ‚úÖ Added metadata from original data
   ‚úÖ Added garment info from robust features

   Ensuring essential columns...
   ‚úÖ Unified database: (250, 51)
   Columns: ['embedd

In [7]:
import pickle
import ai_module

# Load and test
with open('artifacts/size_recommender_v2.pkl', 'rb') as f:
    model = pickle.load(f)
    
print(f"‚úÖ Success! Model from: {model.__class__.__module__}")
print(f"   Class name: {model.__class__.__name__}")
print(f"   Is SizeRecommenderV2? {isinstance(model, ai_module.SizeRecommenderV2)}") 

‚úÖ Success! Model from: ai_module
   Class name: SizeRecommenderV2
   Is SizeRecommenderV2? True
