In [None]:
pip install pandas numpy seaborn matplotlib scikit-learn


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.decomposition import PCA


In [None]:
# Load the raw dataset from the working directory
df = pd.read_csv("ecommerce_sales_data.csv")

# Preview the first rows
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()


In [None]:
# Check basic statistics
print(df.describe())

# Visualize sales distribution
sns.histplot(df['Sales'], bins=30, kde=True)
plt.title("Sales Distribution")
plt.show()

# Correlation heatmap.
# Restrict the correlation to numeric columns: the categorical columns are only
# label-encoded in a later cell, and pandas >= 2.0 raises on non-numeric columns
# in DataFrame.corr() instead of silently dropping them.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Check missing values
print(df.isnull().sum())

# Fill numerical columns with median.
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on the intermediate df[col] is chained assignment, which triggers a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
num_cols = df.select_dtypes(include='number').columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to
    # fill with in that case, so the column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [None]:
# Encode product category and customer location as integer codes.
# A separate encoder is fitted for each column and stored in `label_encoders`,
# so every column's code -> label mapping stays available (e.g. for
# inverse_transform). Refitting one shared encoder keeps only the classes of
# the last column processed.
label_encoders = {}
for col in ['Product Category', 'Customer Location']:
    # `label_encoder` ends up bound to the encoder of the last column
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder


In [None]:
# Define features and target
# NOTE(review): assumes every remaining column is numeric at this point — confirm
X = df.drop(columns=['Sales'])  # Features: every column except the target
y = df['Sales']  # Target; modify if the actual column name differs

# Select the top 5 features (or all of them when fewer than 5 are available).
# 'Sales' is used as a numeric target (presumably continuous — confirm), so
# features are scored with the univariate regression F-test. f_classif would
# treat each distinct sales value as its own class label.
selector = SelectKBest(score_func=f_regression, k=min(5, X.shape[1]))
X_selected = selector.fit_transform(X, y)

print("Shape after SelectKBest:", X_selected.shape)
print("Selected features:", X.columns[selector.get_support()].tolist())


In [None]:
# Min-max scale the feature matrix so all features share one range before PCA
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Project the scaled features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the two components against each other
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.figure(figsize=(8, 6))
pca_axes = sns.scatterplot(x=first_component, y=second_component)
pca_axes.set_title("PCA: Customer Behavior Patterns")
pca_axes.set_xlabel("Component 1")
pca_axes.set_ylabel("Component 2")
plt.show()


In [None]:
# Min-max scale the listed numeric columns and write them back into df
num_features_to_scale = ['Product Price', 'Discount Applied', 'Purchase Frequency']

# Fit the scaler on the listed columns, then replace them with the scaled values
scaler = MinMaxScaler().fit(df[num_features_to_scale])
df[num_features_to_scale] = scaler.transform(df[num_features_to_scale])


In [None]:
# Write the processed frame to CSV without the index column, then confirm
df.to_csv(
    path_or_buf="processed_ecommerce_data.csv",
    index=False,
)
print("✅ Final dataset saved: 'processed_ecommerce_data.csv'")
