# Customer Clustering

A project to practice customer segmentation and learn new clustering techniques.

## Load packages

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN

## Load data

In [23]:
# Read the four raw CSV tables, one DataFrame per file, in a single pass.
# NOTE(review): every preview below shows an 'Unnamed: 0' column — presumably a row
# index that was written out with the CSV; confirm before dropping or indexing on it.
customers, orders, products, sales = map(
    pd.read_csv,
    [
        'data/customers.csv',
        'data/orders.csv',
        'data/products.csv',
        'data/sales.csv',
    ],
)

## Exploratory data analysis (EDA)

In [3]:
# Preview the first rows of the customer table (head() shows 5 rows by default).
customers.head()

Unnamed: 0,customer_id,customer_name,gender,age,home_address,zip_code,city,state,country
0,1,Leanna Busson,Female,30,8606 Victoria TerraceSuite 560,5464,Johnstonhaven,Northern Territory,Australia
1,2,Zabrina Harrowsmith,Genderfluid,69,8327 Kirlin SummitApt. 461,8223,New Zacharyfort,South Australia,Australia
2,3,Shina Dullaghan,Polygender,59,269 Gemma SummitSuite 109,5661,Aliburgh,Australian Capital Territory,Australia
3,4,Hewet McVitie,Bigender,67,743 Bailey GroveSuite 141,1729,South Justinhaven,Queensland,Australia
4,5,Rubia Ashleigh,Polygender,30,48 Hyatt ManorSuite 375,4032,Griffithsshire,Queensland,Australia


In [4]:
# Size of the customer table as (rows, columns).
customers.shape

(1000, 9)

In [5]:
# Preview the first rows of the orders table.
# NOTE(review): order_date values are not zero-padded (e.g. 2021-8-30) while
# delivery_date values are; read_csv was called without parse_dates, so convert both
# with pd.to_datetime before doing any date arithmetic or comparisons.
orders.head()

Unnamed: 0,order_id,customer_id,payment,order_date,delivery_date
0,1,64,30811,2021-8-30,2021-09-24
1,2,473,50490,2021-2-3,2021-02-13
2,3,774,46763,2021-10-8,2021-11-03
3,4,433,39782,2021-5-6,2021-05-19
4,5,441,14719,2021-3-23,2021-03-24


In [6]:
# Size of the orders table as (rows, columns).
orders.shape

(1000, 5)

In [7]:
# Number of distinct customer_id values in the orders table, i.e. how many customers
# appear in at least one order row.
# nunique() counts distinct values directly; unlike len(unique()) it skips NaN, which
# makes no difference here as long as customer_id has no missing values.
# Leaving the value as the cell's last expression lets Jupyter display it without print().
orders['customer_id'].nunique()

617


In [8]:
# Preview the first rows of the product catalogue.
# Note the key column is spelled 'product_ID' here but 'product_id' in the sales table,
# so a join between the two needs left_on/right_on (or a rename) rather than on=.
products.head()

Unnamed: 0,product_ID,product_type,product_name,size,colour,price,quantity,description
0,0,Shirt,Oxford Cloth,XS,red,114,66,"A red coloured, XS sized, Oxford Cloth Shirt"
1,1,Shirt,Oxford Cloth,S,red,114,53,"A red coloured, S sized, Oxford Cloth Shirt"
2,2,Shirt,Oxford Cloth,M,red,114,54,"A red coloured, M sized, Oxford Cloth Shirt"
3,3,Shirt,Oxford Cloth,L,red,114,69,"A red coloured, L sized, Oxford Cloth Shirt"
4,4,Shirt,Oxford Cloth,XL,red,114,47,"A red coloured, XL sized, Oxford Cloth Shirt"


In [9]:
# Size of the product catalogue as (rows, columns).
products.shape

(1260, 8)

In [10]:
# Preview the first rows of the sales table; several rows can share one order_id.
sales.head()

Unnamed: 0,sales_id,order_id,product_id,price_per_unit,quantity,total_price
0,0,1,218,106,2,212
1,1,1,481,118,1,118
2,2,1,2,96,3,288
3,3,1,1002,106,2,212
4,4,1,691,113,3,339


In [11]:
# Size of the sales table as (rows, columns).
sales.shape

(5000, 6)

## Aggregating data to customer level

### Aggregate data to order level

In [34]:
# Build one row per order: attach each sales row to its order, then sum the spend and
# the units per order_id and carry the order's customer_id through.
# The inner join keeps only orders that have at least one sales row.
# NOTE(review): 'first' assumes every order_id maps to a single customer_id — confirm.
visits = (
    orders
    .merge(sales, how='inner', on='order_id')
    .groupby('order_id', as_index=False)
    .agg(
        total_price=('total_price', 'sum'),
        quantity=('quantity', 'sum'),
        customer_id=('customer_id', 'first'),
    )
)

In [35]:
# Preview the per-order table: total spend and total units for each order_id.
visits.head()

Unnamed: 0,order_id,total_price,quantity,customer_id
0,1,1487,14,64
1,2,1130,11,473
2,3,508,5,774
3,4,976,10,433
4,5,2043,20,441


### Aggregate orders to customer level

In [37]:
# Roll the per-order table up to one row per customer: mean spend per order, mean
# units per order, and the number of orders.
# The output columns deliberately keep the source names (total_price, quantity,
# order_id) at this stage, even though they now hold means and a count.
custs = (
    visits
    .groupby('customer_id', as_index=False)
    .agg(
        total_price=('total_price', 'mean'),
        quantity=('quantity', 'mean'),
        order_id=('order_id', 'count'),
    )
)
custs.head()

Unnamed: 0,customer_id,total_price,quantity,order_id
0,1,547.0,5.666667,3
1,7,1017.0,9.0,1
2,10,270.0,3.0,1
3,11,382.0,4.0,1
4,12,1551.0,15.0,1


In [38]:
# Give the aggregated columns descriptive names and derive per-customer totals.
#
# Columns are renamed by name rather than by position: assigning a 4-item list to
# custs.columns raises a length-mismatch error when this cell is re-run (the frame
# then has 6 columns) and would silently mislabel data if the aggregation order ever
# changed. rename() ignores labels that are already gone and assign() overwrites the
# derived columns, so the cell can be re-run safely.
#
# NOTE: the totals are reconstructed as mean x count, so they may carry tiny
# floating-point error compared with summing the per-order values directly.
custs = (
    custs
    .rename(columns={
        'total_price': 'avg_spend',
        'quantity': 'avg_basket_size',
        'order_id': 'visits',
    })
    .assign(
        total_spend=lambda df: df['avg_spend'] * df['visits'],
        total_units=lambda df: df['avg_basket_size'] * df['visits'],
    )
)
custs.head()

Unnamed: 0,customer_id,avg_spend,avg_basket_size,visits,total_spend,total_units
0,1,547.0,5.666667,3,1641.0,17.0
1,7,1017.0,9.0,1,1017.0,9.0
2,10,270.0,3.0,1,270.0,3.0
3,11,382.0,4.0,1,382.0,4.0
4,12,1551.0,15.0,1,1551.0,15.0
