# Data Preparation for Fashion Recommendation System 📊

This notebook covers:
1. Data loading and exploration
2. Data cleaning
3. Feature engineering
4. Exploratory data analysis
5. Train-test split and feature scaling
6. Saving the processed data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load and Explore Data

In [None]:
# Read the raw product catalogue into a DataFrame
df = pd.read_csv('../fashion_products.csv')

# Print the frame's dimensions and column names, then preview the first rows
for label, value in (
    ("Dataset Shape:", df.shape),
    ("\nColumns:", df.columns.tolist()),
):
    print(label, value)
print("\nSample Data:")
df.head()

## 2. Data Cleaning

In [None]:
def clean_data(df):
    """Return a cleaned copy of the raw product table.

    The steps run in this order:

    1. Drop rows that are exact duplicates of an earlier row.
    2. Replace missing ``Description`` values with ``''`` and missing
       ``Brand`` values with ``'Unknown'``.
    3. Convert ``Price`` to a numeric dtype.

    Args:
        df: DataFrame containing at least the ``Description``, ``Brand``
            and ``Price`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched. Prices that
        cannot be parsed are coerced to NaN and their rows are kept, so
        callers that need a complete ``Price`` column must handle NaN.
    """
    # Take an explicit copy: the result of drop_duplicates() can be flagged
    # by pandas as derived from `df`, and assigning columns on it may then
    # emit SettingWithCopyWarning on pandas versions without copy-on-write.
    cleaned = df.drop_duplicates().copy()

    # Handle missing values
    cleaned['Description'] = cleaned['Description'].fillna('')
    cleaned['Brand'] = cleaned['Brand'].fillna('Unknown')

    # Convert price to numeric; errors='coerce' turns bad values into NaN
    cleaned['Price'] = pd.to_numeric(cleaned['Price'], errors='coerce')

    return cleaned

# Run the cleaning pass on the raw frame and report the resulting size
df_clean = clean_data(df)
cleaned_rows, cleaned_cols = df_clean.shape
print("Cleaned dataset shape:", (cleaned_rows, cleaned_cols))

## 3. Feature Engineering

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with derived feature columns added.

    Added columns:

    * ``PriceCategory`` - ``Price`` split into five quantile bins labelled
      ``Budget``, ``Low``, ``Medium``, ``High``, ``Premium``.
    * ``CategoryEncoded`` / ``BrandEncoded`` - integer label encodings of
      ``Category`` and ``Brand``.
    * ``Season`` - calendar quarter of ``ReleaseDate`` mapped to a season
      name (Q1 -> Winter, Q2 -> Spring, Q3 -> Summer, Q4 -> Fall).

    Args:
        df: DataFrame containing ``Price``, ``Category``, ``Brand`` and
            ``ReleaseDate`` columns.

    Returns:
        A new DataFrame with the extra columns; the input frame is not
        modified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    featured = df.copy()

    # Create price categories
    price_labels = ['Budget', 'Low', 'Medium', 'High', 'Premium']
    featured['PriceCategory'] = pd.qcut(featured['Price'], q=5, labels=price_labels)

    # Encode categorical variables. Each column gets its own encoder so the
    # fit for one column is never overwritten by the fit for the next.
    featured['CategoryEncoded'] = LabelEncoder().fit_transform(featured['Category'])
    featured['BrandEncoded'] = LabelEncoder().fit_transform(featured['Brand'])

    # Create season feature based on the calendar quarter of the release date
    quarter_to_season = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
    release_quarter = pd.to_datetime(featured['ReleaseDate']).dt.quarter
    featured['Season'] = release_quarter.map(quarter_to_season)

    return featured

# Derive the model features from the cleaned frame and preview the result
df_featured = engineer_features(df_clean)
df_featured.head(n=5)

## 4. Data Analysis

In [None]:
# Histogram of product prices (50 bins)
plt.figure(figsize=(10, 6))
sns.histplot(x='Price', data=df_featured, bins=50)
plt.title('Price Distribution')
plt.show()

# Horizontal bar chart of product counts per category, most frequent first
category_order = df_featured['Category'].value_counts().index
plt.figure(figsize=(12, 6))
sns.countplot(y='Category', data=df_featured, order=category_order)
plt.title('Category Distribution')
plt.show()

## 5. Prepare Train-Test Split

In [None]:
# Select the model inputs and the prediction target
# NOTE(review): PriceCategory is computed from Price in engineer_features,
# and Price is also one of the features here - confirm this is intended.
feature_columns = ['CategoryEncoded', 'BrandEncoded', 'Price']
target_column = 'PriceCategory'
X = df_featured[feature_columns]
y = df_featured[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

## 6. Save Processed Data

In [None]:
# Save processed data
from pathlib import Path

output_dir = Path('../data')
# np.save does not create missing parent directories, so make sure the
# target folder exists before writing.
output_dir.mkdir(parents=True, exist_ok=True)

# Scaled feature matrices
np.save(output_dir / 'X_train.npy', X_train_scaled)
np.save(output_dir / 'X_test.npy', X_test_scaled)

# Store the labels as fixed-width string arrays. Saving the pandas Series
# directly produces a pickled object array, which np.load refuses to read
# unless allow_pickle=True is passed. Missing labels are written as 'nan'.
np.save(output_dir / 'y_train.npy', np.asarray(y_train, dtype=str))
np.save(output_dir / 'y_test.npy', np.asarray(y_test, dtype=str))

print("Data preparation completed and saved!")