In [1]:
from pathlib import Path

import typer
from loguru import logger
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

import scipy.stats as stats
from scipy.stats import zscore

from seaborn import histplot
import matplotlib.pyplot as plt
import seaborn as sns

from customer_churn_pridiction.config import PROCESSED_DATA_DIR, RAW_DATA_DIR


[32m2024-10-28 01:48:36.135[0m | [1mINFO    [0m | [36mcustomer_churn_pridiction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/saadkhalid/Documents/epita/s2/ai_methodology/customer-churn-prediction-pipeline[0m


In [2]:
df_master = pd.read_excel(RAW_DATA_DIR / "E_Commerce_Dataset.xlsx", sheet_name="E Comm")
df = df_master.copy()

In [3]:
df.drop(columns=["CustomerID"], inplace=True)

In [4]:
df.describe()

Unnamed: 0,Churn,Tenure,CityTier,WarehouseToHome,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
count,5630.0,5366.0,5630.0,5379.0,5375.0,5630.0,5630.0,5630.0,5630.0,5365.0,5374.0,5372.0,5323.0,5630.0
mean,0.168384,10.189899,1.654707,15.639896,2.931535,3.688988,3.066785,4.214032,0.284902,15.707922,1.751023,3.008004,4.543491,177.22303
std,0.37424,8.557241,0.915389,8.531475,0.721926,1.023999,1.380194,2.583586,0.451408,3.675485,1.894621,2.93968,3.654433,49.207036
min,0.0,0.0,1.0,5.0,0.0,1.0,1.0,1.0,0.0,11.0,0.0,1.0,0.0,0.0
25%,0.0,2.0,1.0,9.0,2.0,3.0,2.0,2.0,0.0,13.0,1.0,1.0,2.0,145.77
50%,0.0,9.0,1.0,14.0,3.0,4.0,3.0,3.0,0.0,15.0,1.0,2.0,3.0,163.28
75%,0.0,16.0,3.0,20.0,3.0,4.0,4.0,6.0,1.0,18.0,2.0,3.0,7.0,196.3925
max,1.0,61.0,3.0,127.0,5.0,6.0,5.0,22.0,1.0,26.0,16.0,16.0,46.0,324.99


In [5]:
columns_to_fill = ["Tenure", "WarehouseToHome", "HourSpendOnApp", "OrderAmountHikeFromlastYear", 
                   "CouponUsed", "OrderCount", "DaySinceLastOrder"]

for column in columns_to_fill:
    mode_value = df[column].mode()[0]
    df[column].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)


In [6]:
df.isnull().sum()

Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64

In [35]:
numeric_cols = ['Churn', 'Tenure', 'CityTier', 'WarehouseToHome', 'HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore', 'NumberOfAddress', 'Complain', 'OrderCount', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'DaySinceLastOrder', 'CashbackAmount']

z_scores = df[numeric_cols].apply(zscore)
outliers_mask = (abs(z_scores) > 3.0)
outliers = df[outliers_mask.any(axis=1)]

df_cleaned = df[~outliers_mask.any(axis=1)]

In [33]:
selected_features = ['Tenure','Complain', 'DaySinceLastOrder', 'CashbackAmount', 'SatisfactionScore']

In [37]:
df_cleaned.to_csv(PROCESSED_DATA_DIR / "cleaned_data.csv", index=False)