In [2]:

import pandas as pd
import numpy as np

# Load the raw customer dataset. The path is relative to the notebook's
# working directory.
DATA_PATH = "../ecommerce_customer.csv"
df = pd.read_csv(DATA_PATH)

print("Initial Shape:", df.shape)

# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() would append a stray "None".
df.info()

# Per-column count of missing values. sep="\n" puts the Series on its own
# line without the leading space that print()'s default separator would
# insert before the first row.
print("\nMissing Values:", df.isnull().sum(), sep="\n")

# Fill numerical missing values with each column's median.
# "number" selects every numeric dtype (not just int64/float64), so all
# numeric columns -- including the rate columns listed below -- are imputed
# in this single pass.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


# Fill categorical missing values with each column's mode.
cat_cols = df.select_dtypes(include=["object"]).columns
cat_modes = df[cat_cols].mode()
# mode() returns an empty frame when there are no object columns (or no
# rows); taking its first row would raise IndexError, so skip the fill then.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# Rate columns that must be free of missing values after imputation.
rate_columns = [
    "Cart_Abandonment_Rate",
    "Returns_Rate",
    "Email_Open_Rate",
    "Discount_Usage_Rate",
]

# These columns are numeric, so the median pass above has already filled
# them; a second per-column fillna would be a no-op. Verify the invariant
# instead (this also raises KeyError if one of the columns is absent).
rate_nulls = df[rate_columns].isnull().sum()
assert not rate_nulls.any(), (
    f"Rate columns still contain missing values:\n{rate_nulls[rate_nulls > 0]}"
)

# Columns discarded from the cleaned frame.
columns_to_remove = ["City", "Signup_Quarter", "Lifetime_Value"]

# Drop exact duplicate rows first, then the unwanted columns, rebinding `df`
# to the result rather than mutating it in place.
df = df.drop_duplicates().drop(columns=columns_to_remove)

print("\nShape after cleaning:", df.shape)
df.head()

Initial Shape: (50000, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            47505 non-null  float64
 1   Gender                         50000 non-null  object 
 2   Country                        50000 non-null  object 
 3   City                           50000 non-null  object 
 4   Membership_Years               50000 non-null  float64
 5   Login_Frequency                50000 non-null  float64
 6   Session_Duration_Avg           46601 non-null  float64
 7   Pages_Per_Session              47000 non-null  float64
 8   Cart_Abandonment_Rate          50000 non-null  float64
 9   Wishlist_Items                 46000 non-null  float64
 10  Total_Purchases                50000 non-null  float64
 11  Average_Order_Value            50000 non-null  float64
 12  Days_Since_Last_Pur

Unnamed: 0,Age,Gender,Country,Membership_Years,Login_Frequency,Session_Duration_Avg,Pages_Per_Session,Cart_Abandonment_Rate,Wishlist_Items,Total_Purchases,...,Discount_Usage_Rate,Returns_Rate,Email_Open_Rate,Customer_Service_Calls,Product_Reviews_Written,Social_Media_Engagement_Score,Mobile_App_Usage,Payment_Method_Diversity,Credit_Balance,Churned
0,43.0,Male,France,2.9,14.0,27.4,6.0,50.6,3.0,9.0,...,46.4,2.0,17.9,9.0,4.0,16.3,20.8,1.0,2278.0,0
1,36.0,Male,UK,1.6,15.0,42.7,10.3,37.7,1.0,19.5,...,57.96,9.2,42.8,7.0,3.0,27.6,23.3,3.0,3028.0,0
2,45.0,Female,Canada,2.9,10.0,24.8,1.6,70.9,1.0,9.1,...,12.24,11.5,0.0,4.0,1.0,27.6,8.8,2.0,2317.0,0
3,56.0,Female,USA,2.6,10.0,38.4,14.8,41.7,9.0,15.0,...,44.1,5.4,41.4,2.0,5.0,85.9,31.0,3.0,2674.0,0
4,35.0,Male,India,3.1,29.0,51.4,8.4,19.1,9.0,32.5,...,25.2,5.5,37.9,1.0,11.0,83.0,50.4,4.0,5354.0,0


In [3]:
%pip install scikit-learn

# Label-encode every remaining object-dtype column of `df` in place, keeping
# the fitted encoder for each column in `label_encoder` (keyed by column name).
#
# NOTE: once this cell has run, the encoded columns hold integer codes, so
# running it again without first re-running the cleaning cell finds no object
# columns and resets `label_encoder` to an empty dict.
from sklearn.preprocessing import LabelEncoder

# Columns still stored as Python objects (strings) after cleaning.
cat_cols = df.select_dtypes(include="object").columns
print(cat_cols)

# Fit one encoder per column first ...
label_encoder = {col: LabelEncoder().fit(df[col]) for col in cat_cols}

# ... then overwrite each column with its integer codes.
for col, le in label_encoder.items():
    df[col] = le.transform(df[col])

df[cat_cols].head()

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp313-cp313-win_amd64.whl (8.0 MB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached scipy-1.16.3-cp313-cp313-win_amd64.whl (38.5 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn

   ---------------------------------------- 0/4 [threadpoolctl]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   -----


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Index(['Gender', 'Country'], dtype='object')


Unnamed: 0,Gender,Country
0,1,2
1,1,6
2,0,1
3,0,7
4,1,4
