In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import geopandas as gpd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import scale
from sklearn import metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Idea of the notebook

Maximilian Lang (friend & collaborator https://www.kaggle.com/maximilianlang) and I wanted to understand how we could handle categorical data in a clustering problem. In order to do so and work on our DS skills we made this notebook. PS: The visualisation and general pandas analysis are short on purpose; I had the feeling the notebook is quite full as it is.

# Overview

**Importing E-commerce business data & first analysis**
    

**Address Analysis**


**Creating dummy variables**


**Visualisation**


**DBSCAN**


**K-Prototypes**


**Validation**

# **Importing E-commerce business data & first analysis**

In [None]:
# Read the raw customer table and preview its first rows.
data = pd.read_csv("../input/ecommerce-customers/Ecommerce Customers.csv")
display(data.head())

# Numeric-only view holding the five continuous customer features.
data_num = data.loc[:, [
    "Avg. Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
    "Yearly Amount Spent",
]]

In [None]:
# A bare `data.describe()` that is not the last expression of a cell is
# evaluated and then thrown away, so render it explicitly.
display(data.describe())

# `info()` prints column dtypes and non-null counts straight to stdout.
data.info()

# **Address analysis**

The original dataframe had three categorical variables: Email, Address and Avatar

I tried making clusters consisting of e-mail domains like yahoo.com / gmail.com and unique ones. I thought unique domains could potentially be from companies and that they might have higher values. Unfortunately, half of the rows had unique domain names and those were not from companies. I guess the data is handmade and was created for practice purposes. Furthermore, I couldn't retrieve any sensible information from "Avatar". The variables "Email" and "Avatar" were therefore no longer part of my project.

Anyway, I still used the variable "Address", because I wanted to have some categorical data and try out geopandas ;) After all, this is practice, so please feel free to comment below if you have any remarks!

The idea behind the analysis of the variable "Address" is to only look at the state the customer lives in. 

In [None]:
# Two-letter postal abbreviations of the 50 US states, sorted alphabetically by code.
list_shortcut_states = ["AK","AL","AR","AZ","CA","CO","CT","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"]


def _state_from_address(address):
    """Return the state code found in ``address``, or None if the row is unusable.

    A row is unusable when the address is not a string, contains "Box"
    (PO boxes), or when the first token after the last comma is not one of
    the 50 state abbreviations (e.g. military addresses without a comma).
    """
    if not isinstance(address, str) or "Box" in address:
        return None
    # "…\nWrightmouth, MI 82180-9605" -> ["MI", "82180-9605"]
    tail_tokens = address.split(",")[-1].split()
    # Guard against an empty trailing segment instead of raising IndexError.
    if tail_tokens and tail_tokens[0] in list_shortcut_states:
        return tail_tokens[0]
    return None


# One state code (or None) per row, computed without touching `data`.
address_states = data["Address"].map(_state_from_address)
rows_with_state = address_states.notna()

# Build the cleaned frame in one pass. The raw `data` frame is left untouched
# so this cell can be re-run without restarting the kernel; dropping rows from
# a frame while iterating over one of its columns is fragile.
data_mixed = (
    data.loc[rows_with_state]
    .drop(columns=["Email", "Avatar", "Address"])
    .reset_index(drop=True)
)
# Position 5 puts "State" right after the five numeric feature columns.
data_mixed.insert(5, "State", address_states[rows_with_state].to_numpy())

# Kept for reference: the per-row state codes in the same order as `data_mixed`.
state_list = data_mixed["State"].tolist()

In [None]:
# Number of retained customers per state, indexed by two-letter state code.
states_and_customers_list = data_mixed.State.value_counts()

states = gpd.read_file("../input/usa-states-geopandas/cb_2016_us_state_5m/cb_2016_us_state_5m.shp")

# Select the 50 states by postal code rather than by hard-coded row positions,
# so every geometry is guaranteed to carry its own state code and no label
# has to be patched by hand afterwards.
states_50 = (
    states.loc[states["STUSPS"].isin(list_shortcut_states)]
    .sort_values("STUSPS")
    .reset_index(drop=True)
)

# Attach the customer counts by state code (not by row position); states
# without any customer get 0 instead of raising a KeyError.
usa = states_50.copy()
usa["Customers"] = (
    usa["STUSPS"].map(states_and_customers_list).fillna(0).astype(int)
)

# The axis limits crop the map to the contiguous states.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlim(-130, -60)
ax.set_ylim(20, 55)

usa.plot(column="Customers", ax=ax, legend=True, cmap="YlOrBr", legend_kwds={"label": "Customers in states", "orientation": "horizontal"})

In [None]:
# Mean "Yearly Amount Spent" per state. The column and legend are labelled
# "Avg.", so the aggregate must be the mean; a sum would mostly mirror the
# customer-count map. States without customers end up as NaN.
states_and_spent_list = (
    data_mixed.groupby("State")["Yearly Amount Spent"]
    .mean()
    .reindex(list_shortcut_states)
)

# Join on the state code instead of relying on matching row order.
usa_spent = usa.copy()
usa_spent["Avg. Amount Spent"] = usa_spent["STUSPS"].map(states_and_spent_list)

# The axis limits crop the map to the contiguous states.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlim(-130, -60)
ax.set_ylim(20, 55)
usa_spent.plot(column="Avg. Amount Spent", ax=ax, legend=True, cmap="YlGn", legend_kwds={"label": "Avg. amount spent in states", "orientation": "horizontal"})

# **Creating dummy variables**

In [None]:
# Hand-picked state tiers; the thresholds below are customers per state
# as noted by the original authors.
STATE_GROUPS = {
    # 12 - 10
    "HIGH": ["MO","DE","SC","OR","VT","FL","MS","MN","KS","NJ","NC"],
    # 9 - 8
    "MEDIUM": ["AZ","HI","AL","MI","WV","ME","ND","NY","IL","TX","PA","GA","KY","MT"],
    # 7 - 0
    "LOW": ["MA","OK","WY","IN","IA","SD","AK","NH","RI","CA","NV","NE","VA","LA","NM","AR","WI","OH","CT","MD","CO","TN","UT","WA","ID"],
}

# Invert to a state -> tier lookup so each row is labelled exactly once.
state_to_group = {
    state: group for group, members in STATE_GROUPS.items() for state in members
}

state_group_series_new = data_mixed["State"].map(state_to_group)

# Independent `if` checks would silently skip an unmapped state and shift every
# following label by one row; fail loudly instead.
unmapped_states = data_mixed.loc[state_group_series_new.isna(), "State"].unique()
if len(unmapped_states) > 0:
    raise ValueError("States without a group: {}".format(sorted(unmapped_states)))

state_group_list = state_group_series_new.tolist()

data_mixed_new = data_mixed.assign(**{"State group": state_group_series_new})

# One-hot encode the tier. `dtype=int` keeps the dummies numeric (0/1) on every
# pandas version, and the reindex guarantees all three columns exist even if a
# tier happens to be empty.
hot_states = pd.get_dummies(data_mixed_new["State group"], dtype=int).reindex(
    columns=["HIGH", "LOW", "MEDIUM"], fill_value=0
)

data_hot_clustering = pd.concat([data_mixed_new, hot_states], axis=1)
display(data_hot_clustering.head(5))

# **Visualisation**

In [None]:
# Numeric feature columns only; the state label, tier and dummy columns are left out.
numeric_feature_columns = [
    "Avg. Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
    "Yearly Amount Spent",
]
data_hot_clustering_num = data_hot_clustering.loc[:, numeric_feature_columns]

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the numeric features.
sns.pairplot(data=data_hot_clustering_num)

# **DBSCAN**

**Standardizing numerical data**

In [None]:
# Continuous features that get standardised; the one-hot tier columns stay 0/1.
dbscan_numeric_columns = ["Avg. Session Length","Time on App","Time on Website","Length of Membership","Yearly Amount Spent"]

# `.copy()` makes this an independent frame, so the assignment below neither
# triggers SettingWithCopyWarning nor risks writing into `data_hot_clustering`.
data_dbscan = data_hot_clustering[dbscan_numeric_columns + ["HIGH","MEDIUM","LOW"]].copy()

# Named `scaler` so it does not shadow the `scale` function imported from
# sklearn.preprocessing at the top of the notebook.
scaler = StandardScaler()

# Zero mean / unit variance per numeric column, addressed by name rather than position.
data_dbscan[dbscan_numeric_columns] = scaler.fit_transform(data_dbscan[dbscan_numeric_columns])

data_dbscan.head()

**Finding optimal values for eps**

In [None]:
# Fit a 3-nearest-neighbour index on the full feature set; `fit` returns the
# estimator itself, so both names refer to the same fitted object.
nbrs = neigh = NearestNeighbors(n_neighbors=3).fit(data_dbscan)
distances, indices = nbrs.kneighbors(data_dbscan)

# Sort every distance column ascending, then keep column 1: the distance to the
# closest neighbour other than the point itself (column 0 is the self-distance).
distances = np.sort(distances, axis=0)[:, 1]

# The bend of this sorted curve is read off as a candidate eps.
plt.plot(distances)

- eps = 1.6
- min_samples >= dimensionality + 1

In [None]:
# Density-based clustering on all eight columns; label -1 marks noise points.
dbscan = DBSCAN(eps=1.6, min_samples=10)
clusters_dbscan = dbscan.fit(data_dbscan).labels_

clusters_dbscan

2D eps

In [None]:
# Two-feature view: columns 0 and 4 of the standardised frame.
x = data_dbscan.iloc[:, [0, 4]]

# Same k-distance construction as for the full feature set, now in 2D.
nbrs = neigh = NearestNeighbors(n_neighbors=3).fit(x)
distances, indices = nbrs.kneighbors(x)

# Ascending sort per column, then the nearest non-self neighbour distance.
distances = np.sort(distances, axis=0)[:, 1]

plt.plot(distances)

In [None]:
# Select the two plotted features by name so the axis labels below cannot
# drift out of sync with the data.
X = data_dbscan[["Avg. Session Length", "Yearly Amount Spent"]].to_numpy()

dbscan = DBSCAN(eps=0.4, min_samples=8)
clusters = dbscan.fit_predict(X)
labels = dbscan.labels_

# seaborn >= 0.12 only accepts x/y as keyword arguments; the positional form
# raises a TypeError there, while the keyword form works on every version.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=["cluster: {}".format(i) for i in labels])
plt.xlabel("Avg. Session Length")
plt.ylabel("Yearly Amount Spent")
plt.title('DBSCAN clustering 2D' )
plt.legend(fancybox=False, fontsize='small')
plt.show()

# K-Prototypes

Choosing the correct data 
   * K-Prototypes can handle categorical variables, so we do not utilize one-hot encoding
   * We do need to standardize the data

In [None]:
# Continuous features that get standardised; "State group" stays categorical.
kproto_numeric_columns = ["Avg. Session Length","Time on App","Time on Website","Length of Membership","Yearly Amount Spent"]

# `.copy()` makes this an independent frame: the scaled values and any column
# added to it later then never trigger SettingWithCopyWarning.
data_kproto = data_hot_clustering[kproto_numeric_columns + ["State group"]].copy()

# Named `scaler` so it does not shadow the `scale` function imported from
# sklearn.preprocessing at the top of the notebook.
scaler = StandardScaler()

# Zero mean / unit variance per numeric column, addressed by name rather than position.
data_kproto[kproto_numeric_columns] = scaler.fit_transform(data_kproto[kproto_numeric_columns])

data_kproto.head()

Turning data_kproto into numpy array --> K-Prototypes takes a numpy array as input

In [None]:
# Plain NumPy representation of the mixed numeric/categorical frame.
data_kproto_array = data_kproto.to_numpy()
data_kproto_array

The numerical data has to be of type float --> it already is in this case

    *data_kproto_array[:,0] = data_kproto_array[:,0].astype(float)
    *data_kproto_array[:,1] = data_kproto_array[:,1].astype(float)
    *data_kproto_array[:,2] = data_kproto_array[:,2].astype(float)
    *data_kproto_array[:,3] = data_kproto_array[:,3].astype(float)
    *data_kproto_array[:,4] = data_kproto_array[:,4].astype(float)
    
would be a possible way of turning int to float

Choosing optimal number of clusters

In [None]:
# Candidate numbers of clusters for the elbow plot.
cluster_counts = list(range(1, 8))

cost = []
for num_clusters in cluster_counts:
    # Fixed seed: K-Prototypes starts from random numeric centroids, so the
    # cost curve would otherwise change on every run.
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao', random_state=42)
    # Column 5 ("State group") is the only categorical feature.
    kproto.fit_predict(data_kproto_array, categorical=[5])
    cost.append(kproto.cost_)

# Plot against the real cluster counts; plotting the bare list would label the
# x-axis 0..6 and shift every reading of the elbow by one.
fig, ax = plt.subplots()
ax.plot(cluster_counts, cost, marker="o")
ax.set_xlabel("Number of clusters")
ax.set_ylabel("K-Prototypes cost")
ax.set_title("Elbow plot")
plt.show()

The diagram above is used to choose the number of clusters: the kink (elbow) in the curve hints at the optimal number. The diagram suggests one cluster is optimal. We chose three clusters, because after one the slope of the curve is still quite steep.

In [None]:
# Final model with three clusters. The seed pins the random initialisation so
# cluster labels and the statistics derived from them are the same on every run.
kproto = KPrototypes(n_clusters=3, max_iter=20, random_state=42)
# Column 5 ("State group") is the only categorical feature.
clusters_kproto = kproto.fit_predict(data_kproto_array, categorical=[5])
clusters_kproto

Let us take a deeper look into the cluster values!

In [None]:
# One K-Prototypes label per row, in row order.
cluster_list = list(clusters_kproto)

# `assign` returns a new frame, so no column is written into a slice of
# `data_hot_clustering` (no SettingWithCopyWarning) and re-running the cell
# simply replaces the column.
data_kproto = data_kproto.assign(Clusters=cluster_list)
data_kproto.head()

Cluster 0

In [None]:
# Rows that K-Prototypes assigned to cluster 0 (standardised values).
cluster_0 = data_kproto[data_kproto["Clusters"] == 0]

indices_cluster_0 = cluster_0.index

# Fetch the same rows, in original (unscaled) units, with a single label-based
# selection. This keeps the column dtypes and also works for an empty cluster,
# unlike rebuilding the frame row by row.
cluster_0_df = data_hot_clustering.loc[indices_cluster_0]

# Summary statistics of the five numeric features.
display(cluster_0_df.iloc[:,[0,1,2,3,4]].describe())

In [None]:
# Cluster 1
# Rows that K-Prototypes assigned to cluster 1 (standardised values).
cluster_1 = data_kproto[data_kproto["Clusters"] == 1]

indices_cluster_1 = cluster_1.index

# Fetch the same rows, in original (unscaled) units, with a single label-based
# selection. This keeps the column dtypes and also works for an empty cluster,
# unlike rebuilding the frame row by row.
cluster_1_df = data_hot_clustering.loc[indices_cluster_1]

# Summary statistics of the five numeric features.
display(cluster_1_df.iloc[:,[0,1,2,3,4]].describe())

Cluster 2

In [None]:
# Rows that K-Prototypes assigned to cluster 2 (standardised values).
cluster_2 = data_kproto[data_kproto["Clusters"] == 2]

indices_cluster_2 = cluster_2.index

# Fetch the same rows, in original (unscaled) units, with a single label-based
# selection. This keeps the column dtypes and also works for an empty cluster,
# unlike rebuilding the frame row by row.
cluster_2_df = data_hot_clustering.loc[indices_cluster_2]

# Summary statistics of the five numeric features.
display(cluster_2_df.iloc[:,[0,1,2,3,4]].describe())

**Key takeaways**
    
"Avg. Session Length" & "Time on App" & "Time on Website" are very similar throughout the clusters

Cluster 0 has the lowest mean values: "Length of Membership" and "Yearly Amount Spent" (~ 2.8 years and ~ 433$) & 150 people in cluster 0

Cluster 1 is the middleground: "Length of Membership" and "Yearly Amount Spent" (~ 3.9 years and ~ 510$) & 130 people in cluster 1

Cluster 2 has the highest mean values: "Length of Membership" and "Yearly Amount Spent" (~ 4.3 years and ~ 585$) & 113 people in cluster 2
    


--> it appears that long-standing and new customers like to surf the website for a shorter period of time as well as spending more time on the website (when they are buying merchandise?)
--> as one would expect, with increasing "Yearly Amount Spent" and "Length of Membership" the actual number of people in the clusters decreases
    

2D graph of "Yearly Amount Spent" and "Average Session Length"

In [None]:
# Two numeric features plus the categorical tier, selected by name so the axis
# labels below cannot drift out of sync with the data.
x = data_kproto[["Avg. Session Length", "Yearly Amount Spent", "State group"]].values

# Fixed seed: without it the cluster numbering (and therefore the colours and
# the written interpretation) can change on every run.
kproto = KPrototypes(n_clusters=3, random_state=42)
# Column 2 ("State group") is the categorical feature of this 3-column array.
clusters = kproto.fit_predict(x, categorical=[2])

# One scatter call per cluster, same colours and legend labels as before.
for cluster_id, colour in enumerate(['orange', "blue", 'cyan']):
    members = clusters == cluster_id
    plt.scatter(
        # `x` is an object array because of the string column; cast for plotting.
        x[members, 0].astype(float),
        x[members, 1].astype(float),
        s=25,
        c=colour,
        label="cluster {}".format(cluster_id),
    )
plt.xlabel("Average Session Length")
plt.ylabel("Yearly Amount Spent")
plt.title('K-Prototypes clustering 2D' )
plt.legend(fancybox=False, fontsize='small')
plt.show()

**Interpretation**

Cluster 0 spends an above-average amount of money each year and tends to have a high "Average Session Length"

Cluster 1 has a relatively long "Average Session Length". Interestingly enough, their "Yearly Amount Spent" values are lower than those of Clusters 0 and 2. It seems like these customers like to browse the website but not actually buy. Maybe they are hesitant because of certain insecurities. These could potentially be about quality and/or price.

Cluster 2: The customers purchase quickly on the website/app and spend low to medium amounts on merchandise


--> The question arises what makes some customers hesitate to buy? 

# Validation

For the validation of our cluster we will be using the silhouette coefficient

Interpretation:

-1: bad heterogeneous clustering

0: overlapping clusters

1: ideal homogeneous clustering

**DBSCAN**

In [None]:
# Mean silhouette coefficient of the DBSCAN labelling on the one-hot encoded,
# standardised features.
# NOTE(review): DBSCAN noise points (label -1) are scored as if they formed a
# cluster of their own; confirm this is intended.
metrics.silhouette_score(data_dbscan, clusters_dbscan, metric="euclidean")

**K-Prototype**

In [None]:
# Mean silhouette coefficient of the K-Prototypes labelling, evaluated in the
# one-hot encoded feature space so that Euclidean distance is defined.
metrics.silhouette_score(data_dbscan, clusters_kproto, metric="euclidean")

The k-prototypes silhouette score was calculated using X=data_dbscan (one-hot encoded), because of the categorical nature of the data. I am not sure how else to handle this situation.

# Thank you very much for your time! We hope you enjoyed it :) 