### Purpose: Clean and preprocess data for product success prediction

In [6]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [18]:
# --- Load Data ---
# Read the raw sales export into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
raw_csv_path = '../data/raw/ecommerce_sales.csv'
df = pd.read_csv(raw_csv_path)

In [13]:
# --- Check Missing Values ---
# Count the null entries in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

product_id        0
product_name      0
category          0
price             0
review_score      0
review_count      0
sales_month_1     0
sales_month_2     0
sales_month_3     0
sales_month_4     0
sales_month_5     0
sales_month_6     0
sales_month_7     0
sales_month_8     0
sales_month_9     0
sales_month_10    0
sales_month_11    0
sales_month_12    0
dtype: int64


In [14]:
# --- Create Total and Average Sales ---
# Every column whose name contains 'sales_month' is treated as one month of sales.
sales_cols = [name for name in df.columns if 'sales_month' in name]

# Select the monthly columns once, then aggregate row-wise (axis=1) per product.
monthly_sales = df[sales_cols]
df['total_sales'] = monthly_sales.sum(axis=1)
df['avg_sales_per_month'] = monthly_sales.mean(axis=1)

In [15]:
# --- Define Success Target ---
# A product is labelled successful (1) when its total sales are strictly above
# the median total across all rows; rows at or below the median get 0.
threshold = df['total_sales'].median()
is_above_median = df['total_sales'] > threshold
df['success'] = is_above_median.astype(int)

In [16]:
# --- Normalize Numeric Features ---
# Standardize the listed columns in place with a default StandardScaler
# (zero mean, unit variance per column). The original values are overwritten.
#
# NOTE(review): `success` is derived directly from `total_sales`, so keeping
# `total_sales` / `avg_sales_per_month` as model inputs would leak the target,
# and the scaler is fitted on all rows before any train/test split -- confirm
# how the downstream modelling step handles both.
num_cols = [
    'price',
    'review_score',
    'review_count',
    'total_sales',
    'avg_sales_per_month',
]
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

In [17]:
# --- Encode Category ---
# One-hot encode `category`; drop_first=True omits the indicator for the first
# level so the remaining indicator columns are not perfectly collinear.
# NOTE: this consumes the `category` column, so re-running this cell without
# reloading `df` raises a KeyError.
df = pd.get_dummies(df, columns=['category'], drop_first=True)

# --- Save Processed Data ---
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout where data/processed/
# has not been created yet).
processed_csv_path = Path('../data/processed/ecommerce_sales_preprocessed.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the written file.
df.to_csv(processed_csv_path, index=False)