# 0. Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Load

In [None]:
# Directory holding the clustering CSV extracts, relative to the kernel's
# working directory. Change it here if the data lives elsewhere.
DATA_DIR = "../dataset"

# Load the four raw tables used throughout the notebook.
customer_df = pd.read_csv(f"{DATA_DIR}/clustering_customer.csv")
features_df = pd.read_csv(f"{DATA_DIR}/clustering_features.csv")
payment_df = pd.read_csv(f"{DATA_DIR}/clustering_payment.csv")
product_df = pd.read_csv(f"{DATA_DIR}/clustering_product.csv")

In [4]:
# (rows, columns) of each raw table, in the order: customer, features, product, payment.
tuple(frame.shape for frame in (customer_df, features_df, product_df, payment_df))

((1000, 5), (1000, 27), (22, 3), (5, 2))

In [5]:
# Preview the first five customer rows.
customer_df.head(n=5)

Unnamed: 0,customer_id,age,hh_income,omni_shopper,email_subscribed
0,1,46,640000,0,0
1,2,32,890000,1,1
2,3,45,772000,0,0
3,4,46,303000,0,1
4,5,38,412000,0,0


In [8]:
# Lift the column-display limit so wide frames are shown in full.
# This is a global pandas option, so it also affects every later cell.
pd.options.display.max_columns = None

# Preview the first five rows of the per-customer feature table.
features_df.head(n=5)


Unnamed: 0,customer_id,sales,units,orders,unique_products_bought,unique_payments_used,unique_categories_bought,aov,aur,upt,category_a_sales,category_b_sales,category_c_sales,category_d_sales,category_e_sales,category_a_units,category_b_units,category_c_units,category_d_units,category_e_units,payment_cash,payment_credit,payment_debit,payment_gc,payment_others,email_subscribed,omni_shopper
0,1,2395,11,5,5,3,3,479.0,217.727273,2.2,0.480167,,0.450939,,0.068894,0.363636,,0.363636,,0.272727,,0.519833,0.167015,0.313152,,0,0
1,2,4815,15,7,6,4,3,687.857143,321.0,2.142857,0.721703,,0.247144,,0.031153,0.466667,,0.333333,,0.2,0.031153,0.70405,0.186916,0.077882,,1,1
2,3,4285,21,10,9,4,5,428.5,204.047619,2.1,0.240373,0.175029,0.309218,0.210035,0.065344,0.190476,0.238095,0.238095,0.095238,0.238095,,0.263711,0.309218,0.147025,0.280047,0,0
3,4,12000,44,21,15,4,5,571.428571,272.727273,2.095238,0.575,0.0675,0.158333,0.170833,0.028333,0.409091,0.159091,0.181818,0.113636,0.136364,0.066667,0.345,0.37,0.218333,,1,0
4,5,1700,8,3,3,2,2,566.666667,212.5,2.666667,,0.264706,0.735294,,,,0.375,0.625,,,,,0.735294,0.264706,,0,0


In [9]:
# Preview the first five product rows (id, category, price).
product_df.head(n=5)

Unnamed: 0,product_id,category,price
0,1,A,450
1,2,B,80
2,3,C,250
3,4,D,400
4,5,E,50


In [10]:
# Preview the payment-type lookup table.
payment_df.head(n=5)

Unnamed: 0,payment_type_id,payment_type
0,1,cash
1,2,credit card
2,3,debit card
3,4,gift card
4,5,others


# 2. Preprocessing

## 2.1 Data type handling

In [13]:
# Inspect the dtypes pandas inferred for the feature table before casting any columns.
features_df.dtypes

customer_id                   int64
sales                         int64
units                         int64
orders                        int64
unique_products_bought        int64
unique_payments_used          int64
unique_categories_bought      int64
aov                         float64
aur                         float64
upt                         float64
category_a_sales            float64
category_b_sales            float64
category_c_sales            float64
category_d_sales            float64
category_e_sales            float64
category_a_units            float64
category_b_units            float64
category_c_units            float64
category_d_units            float64
category_e_units            float64
payment_cash                float64
payment_credit              float64
payment_debit               float64
payment_gc                  float64
payment_others              float64
email_subscribed              int64
omni_shopper                  int64
dtype: object

In [30]:
# Column groups that need an explicit dtype. Both lists are reused by the
# customer_df cast cell further down, so keep them defined before it runs.
string_cols = ['customer_id']                       # identifier, not a quantity
binary_cols = ['omni_shopper', 'email_subscribed']  # loaded as int64 flags

# Cast in place on the feature table: the identifier becomes text, the flags become booleans.
features_df[string_cols] = features_df[string_cols].astype(str)
features_df[binary_cols] = features_df[binary_cols].astype(bool)

In [16]:
# Inspect the dtypes pandas inferred for the customer table.
customer_df.dtypes

customer_id         int64
age                 int64
hh_income           int64
omni_shopper        int64
email_subscribed    int64
dtype: object

In [23]:
# Apply the same casts as for features_df. This cell relies on `binary_cols`
# and `string_cols` from the features_df cast cell, so run that cell first.
customer_df[string_cols] = customer_df[string_cols].astype(str)
customer_df[binary_cols] = customer_df[binary_cols].astype(bool)

In [25]:
# Inspect the dtypes pandas inferred for the payment-type lookup table.
payment_df.dtypes

payment_type_id     int64
payment_type       object
dtype: object

In [34]:
# payment_type_id is an identifier, so store it as text rather than int64.
id_cols_payment = ['payment_type_id']
# Legacy name (the column is not binary); kept so any cell that still refers to it keeps working.
binary_cols_payment = id_cols_payment

payment_df[id_cols_payment] = payment_df[id_cols_payment].astype(str)

In [31]:
# Inspect the dtypes pandas inferred for the product table before casting product_id.
product_df.dtypes

product_id     int64
category      object
price          int64
dtype: object

In [35]:
# product_id is an identifier, so store it as text rather than int64.
# NOTE: never cast it to bool. Every non-zero id becomes True, which destroys the key.
id_cols_product = ['product_id']
# Legacy name (the column is not binary); kept so any cell that still refers to it keeps working.
binary_cols_product = id_cols_product

product_df[id_cols_product] = product_df[id_cols_product].astype(str)

In [33]:
# Re-check the product table's dtypes after the product_id cast.
product_df.dtypes

product_id    object
category      object
price          int64
dtype: object

## 2.2 Handling null values

In [36]:
# Count missing values per column of the feature table.
features_df.isna().sum()

customer_id                   0
sales                         0
units                         0
orders                        0
unique_products_bought        0
unique_payments_used          0
unique_categories_bought      0
aov                           0
aur                           0
upt                           0
category_a_sales             89
category_b_sales            246
category_c_sales            171
category_d_sales            322
category_e_sales            329
category_a_units             89
category_b_units            246
category_c_units            171
category_d_units            322
category_e_units            329
payment_cash                737
payment_credit               19
payment_debit               129
payment_gc                  366
payment_others              744
email_subscribed              0
omni_shopper                  0
dtype: int64

In [46]:
# Work on "2"-suffixed copies from here on so the type-cast frames stay untouched.
customer_df2 = customer_df.copy()
payment_df2 = payment_df.copy()
product_df2 = product_df.copy()

# The null counts above show NaN only in the category_* and payment_* columns,
# so every missing value there is replaced with 0.
# fillna returns a new frame, which makes a separate .copy() unnecessary.
feature_df2 = features_df.fillna(0)

In [None]:
# Count missing values per column of the product working copy.
product_df2.isna().sum()

product_id    0
category      0
price         0
dtype: int64

In [None]:
# Count missing values per column of the customer working copy.
customer_df2.isna().sum()

customer_id         0
age                 0
hh_income           0
omni_shopper        0
email_subscribed    0
dtype: int64

In [None]:
# Count missing values per column of the payment working copy.
payment_df2.isna().sum()

payment_type_id    0
payment_type       0
dtype: int64