# In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

# Read the raw product export and discard exact duplicate rows in one pass.
df = pd.read_csv('flipkart_products_20250405.csv').drop_duplicates()

# Cast the price column to float so the numeric steps below (binning,
# scaling) operate on a floating-point dtype.
df = df.astype({'Price (₹)': float})

# Feature Engineering
# Popularity is the buyer count weighted by the product's rating.
df['Popularity'] = df['Number of Buyers'] * df['Rating (★)']

# Create price categories
# Bucket edges in rupees. pd.cut builds right-closed intervals by default, so
# a price that equals an edge falls into the lower bucket (e.g. 1000 -> '<1k').
bins = [0, 1000, 5000, 10000, 20000, 50000, 100000, 200000, float('inf')]
labels = ['<1k', '1k-5k', '5k-10k', '10k-20k', '20k-50k', '50k-100k', '100k-200k', '>200k']
# include_lowest=True also closes the first interval on the left; without it
# a price of exactly 0 lies outside (0, 1000] and its category becomes NaN.
# Negative prices still fall outside every bucket and yield NaN.
df['Price_Category'] = pd.cut(
    df['Price (₹)'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Min-max scale the numeric columns (MinMaxScaler's default range is [0, 1]).
numerical_features = [
    'Price (₹)',
    'Rating (★)',
    'Number of Buyers',
    'Total Sold',
    'Available Stock',
    'Discount (%)',
    'Popularity',
]
# Learn per-column min/max first, then overwrite the columns with the
# scaled values; `scaler` stays bound to the fitted instance.
scaler = MinMaxScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Replace each categorical column with integer codes, keeping one fitted
# LabelEncoder per column so the codes can be mapped back to labels later.
categorical_features = ['Main Category', 'Sub Category', 'Seller', 'Return Policy', 'Price_Category']
label_encoders = {feature: LabelEncoder() for feature in categorical_features}
# NOTE(review): LabelEncoder numbers classes in sorted order, so the
# 'Price_Category' codes follow the label text, not ascending price.
for feature, le in label_encoders.items():
    df[feature] = le.fit_transform(df[feature])

# Persist the transformed table (without the index column) and the fitted
# label encoders so later steps can reload both.
df.to_csv(path_or_buf='processed_products.csv', index=False)
with open('label_encoders.pkl', mode='wb') as f:
    pickle.dump(label_encoders, f, protocol=pickle.DEFAULT_PROTOCOL)