In [1]:
import numpy as np

import pandas as pd

import math

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.mixture import GaussianMixture

import plotly.graph_objects as go

In [2]:
# Dataset (example)
data = {
    'client_id':['001','002','003','004','005','006','007','008','009','010','011'],
    'nb_trades':[32,1,2,36,4,41,29,3,2,3,15],
    'nb_products':[3,2,2,4,9,2,1,10,13,2,6],
    'aum':[100000,3000000,3500000,200000,150000,220000,100000,90000,210000,4000000,4000000],
    'age':[75,33,45,73,24,80,79,25,21,46,43]
}

df = pd.DataFrame(data)
df

Unnamed: 0,client_id,nb_trades,nb_products,aum,age
0,1,32,3,100000,75
1,2,1,2,3000000,33
2,3,2,2,3500000,45
3,4,36,4,200000,73
4,5,4,9,150000,24
5,6,41,2,220000,80
6,7,29,1,100000,79
7,8,3,10,90000,25
8,9,2,13,210000,21
9,10,3,2,4000000,46


In [3]:
scaler = StandardScaler()

df_pre = scaler.fit_transform(df[['nb_trades','nb_products','aum','age']])

# K-means

In [4]:
df_kmeans = df.copy()
df_kmeans_pre = df_pre.copy()

#### Clustering

In [5]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_kmeans_pre)

In [6]:
df_kmeans['label'] = kmeans.labels_

In [7]:
df_kmeans

Unnamed: 0,client_id,nb_trades,nb_products,aum,age,label
0,1,32,3,100000,75,0
1,2,1,2,3000000,33,1
2,3,2,2,3500000,45,1
3,4,36,4,200000,73,0
4,5,4,9,150000,24,2
5,6,41,2,220000,80,0
6,7,29,1,100000,79,0
7,8,3,10,90000,25,2
8,9,2,13,210000,21,2
9,10,3,2,4000000,46,1


In [8]:
dict_label = {
    0:'active & old',
    1:'passive & wealthy',
    2:'diversified & young'
}

#### Cluster quality

$$s(i) = \frac{b(i)-a(i)}{max(a(i),b(i))}$$

In [9]:
df_kmeans['silhouette'] = metrics.silhouette_samples(df_kmeans_pre, kmeans.labels_)

In [10]:
dict_silhouette_clusters = {
    0:np.mean(df_kmeans[df_kmeans['label']==0]['silhouette']),
    1:np.mean(df_kmeans[df_kmeans['label']==1]['silhouette']),
    2:np.mean(df_kmeans[df_kmeans['label']==2]['silhouette'])
}

High silhouette => good quality => low variance

#### Visualization

In [11]:
# mu_list_2 = get_all_circle_coords(x_center=0, y_center=0, radius=1, n_points=10)
# fig = go.Figure()
# for coord in mu_list_2:
#     fig.add_trace(go.Scatter(x=[coord[0]], 
#                              y=[coord[1]],
#                              marker=dict(size=30)))
# fig.update_layout({'showlegend':False})
# fig.update_xaxes(visible=False, scaleanchor='y', scaleratio=1)
# fig.update_yaxes(visible=False, scaleanchor='x', scaleratio=1)

In [36]:
# https://gis.stackexchange.com/questions/394955/generating-approximate-polygon-for-circle-with-given-radius-and-centre-without

def get_circle_coord(theta, x_center, y_center, radius):
    x = radius*math.cos(theta)+x_center
    y = radius*math.sin(theta)+y_center
    return x,y

def get_all_circle_coords(x_center, y_center, radius, n_points):
    # equidistant points on perimeter (2*pi*r) = distance in radians = angles in radians
    thetas = [i/n_points * 2 * math.pi for i in range(n_points)]
    # we use cos/sin formulas to find coordinates
    circle_coords = [get_circle_coord(theta, x_center, y_center, radius) for theta in thetas]
    return circle_coords

mu_list = get_all_circle_coords(x_center=0, y_center=0, radius=2, n_points=3)

In [37]:
def distance_to_center(row, center):
    point = np.array(row[['x','y']])
    center = np.asarray(center)
    return np.linalg.norm(point-center)

In [38]:
np.random.seed(0)

fig = go.Figure()

cluster_labels = df_kmeans['label'].unique()

for idx, cluster_number in enumerate(cluster_labels):
    
    df_cluster = df_kmeans[df_kmeans['label']==cluster_number].copy()
    n_count = df_cluster.shape[0]
    center = mu_list[idx]
    
    # Generate coordinates
    df_simul_cluster = pd.DataFrame()
    df_simul_cluster['x'] = np.random.normal(center[0],1-dict_silhouette_clusters[cluster_number],n_count)
    df_simul_cluster['y'] = np.random.normal(center[1],1-dict_silhouette_clusters[cluster_number],n_count)
    
    # Associate coordinates to points w.r.t their distance to the center
    df_simul_cluster['distance_to_center'] = df_simul_cluster.apply(lambda x: distance_to_center(x, center), axis=1)
    df_simul_cluster = df_simul_cluster.sort_values(by='distance_to_center')
    df_cluster = df_cluster.sort_values(by='silhouette', ascending=False)
    df_cluster['x'] = df_simul_cluster['x'].values
    df_cluster['y'] = df_simul_cluster['y'].values
    
    d = df_cluster.drop(columns=['x','y','label']).to_dict('index')
    info = [d[x] for x in d.keys()]
    info_str = [str(x).replace(',','<br>').replace('{','').replace('}','') for x in info]
#     df_cluster['info'] = info_str
    
    # Graph
    fig.add_trace(go.Scatter(
                             x=df_cluster['x'], 
                             y=df_cluster['y'], 
                             text=['client ' + val for val in df_cluster['client_id'].values],
                             name=dict_label[cluster_number],
                             mode='markers',
                             marker=dict(size=20),
                             marker_line=dict(width=2),
                             hovertemplate=info_str
                            )
                 )

fig.update_layout(yaxis_range=[-2,2])
fig.update_layout(xaxis_range=[-1,2])
fig.show()

In [39]:
df_kmeans[df_kmeans['label']==1]

Unnamed: 0,client_id,nb_trades,nb_products,aum,age,label,silhouette
1,2,1,2,3000000,33,1,0.646431
2,3,2,2,3500000,45,1,0.757484
9,10,3,2,4000000,46,1,0.757159
10,11,15,6,4000000,43,1,0.504055


# GMM

In [40]:
df_gmm = df.copy()
df_gmm_pre = df_pre.copy()

#### Clustering

In [41]:
gm = GaussianMixture(n_components=3, random_state=0).fit(df_gmm_pre)

In [42]:
df_gmm['label'] = gm.predict(df_gmm_pre)

In [43]:
dict_label = {
    0:'active & old',
    1:'passive & wealthy',
    2:'diversified & young'
}

#### Cluster quality

In [44]:
df_gmm['silhouette'] = metrics.silhouette_samples(df_gmm_pre, df_gmm['label'])

In [45]:
dict_silhouette_clusters = {
    0:np.mean(df_gmm[df_gmm['label']==0]['silhouette']),
    1:np.mean(df_gmm[df_gmm['label']==1]['silhouette']),
    2:np.mean(df_gmm[df_gmm['label']==2]['silhouette'])
}

In [46]:
np.random.seed(0)

fig = go.Figure()

cluster_labels = df_gmm['label'].unique()

for idx, cluster_number in enumerate(cluster_labels):
    
    df_cluster = df_gmm[df_gmm['label']==cluster_number].copy()
    n_count = df_cluster.shape[0]
    
    # Define centers for each cluster
    center = mu_list[idx]
    
    # Generate coordinates
    df_simul_cluster = pd.DataFrame()
    df_simul_cluster['x'] = np.random.normal(center[0],1-dict_silhouette_clusters[cluster_number],n_count)
    df_simul_cluster['y'] = np.random.normal(center[1],1-dict_silhouette_clusters[cluster_number],n_count)
    
    # Compute the distance to center
    df_simul_cluster['distance_to_center'] = df_simul_cluster.apply(lambda x: distance_to_center(x, center), axis=1)
    
    # Associate coordinates to points w.r.t their distance to the center
    df_simul_cluster = df_simul_cluster.sort_values(by='distance_to_center')
    df_cluster = df_cluster.sort_values(by='silhouette', ascending=False)
    df_cluster['x'] = df_simul_cluster['x'].values
    df_cluster['y'] = df_simul_cluster['y'].values
    
    d = df_cluster.drop(columns=['x','y','label']).to_dict('index')
    info = [d[x] for x in d.keys()]
    info_str = [str(x).replace(',','<br>').replace('{','').replace('}','') for x in info]
#     df_cluster['info'] = info_str
    
    # Graph
    fig.add_trace(go.Scatter(
                             x=df_cluster['x'], 
                             y=df_cluster['y'], 
                             text=['client ' + val for val in df_cluster['client_id'].values],
                             name=dict_label[cluster_number],
                             mode='markers',
                             marker=dict(size=20),
                             marker_line=dict(width=2),
                             hovertemplate=info_str
                            )
                 )

fig.update_layout(yaxis_range=[-2,2])
fig.update_layout(xaxis_range=[-1,2])
fig.update_layout({'showlegend':False})
fig.update_xaxes(visible=False, scaleanchor='y', scaleratio=1)
fig.update_yaxes(visible=False, scaleanchor='x', scaleratio=1)
fig.show()

In [47]:
df_gmm[df_gmm['label']==1]

Unnamed: 0,client_id,nb_trades,nb_products,aum,age,label,silhouette
66,67,20,17,562594,81,1,0.558818
67,68,20,8,617948,89,1,0.359099
68,69,20,15,727622,80,1,0.627649
69,70,20,12,577468,81,1,0.514126
70,71,22,18,710646,85,1,0.62551
71,72,22,16,470253,79,1,0.489939
72,73,23,18,774311,84,1,0.623891
73,74,20,16,928960,82,1,0.566743
74,75,20,12,664104,88,1,0.586925
75,76,21,9,641529,75,1,0.340459


# GMM - large sample

In [48]:
n_clients = 100
n_cluster_1 = int(n_clients/3)
n_cluster_2 = int(n_clients/3)
n_cluster_3 = n_clients-(n_cluster_1+n_cluster_2)

In [49]:
np.random.seed(3)

trades = list(np.random.randint(low=1, high=5, size=n_cluster_1)) + \
         list(np.random.randint(low=10, high=15, size=n_cluster_2)) + \
         list(np.random.randint(low=20, high=25, size=n_cluster_3))

products = list(np.random.randint(low=1, high=5, size=n_cluster_1)) + \
           list(np.random.randint(low=3, high=10, size=n_cluster_2)) + \
           list(np.random.randint(low=8, high=20, size=n_cluster_3))

aum = list(np.random.randint(low=100000, high=300000, size=n_cluster_1)) + \
           list(np.random.randint(low=200000, high=500000, size=n_cluster_2)) + \
           list(np.random.randint(low=400000, high=1000000, size=n_cluster_3))

age = list(np.random.randint(low=30, high=60, size=n_cluster_1)) + \
           list(np.random.randint(low=55, high=75, size=n_cluster_2)) + \
           list(np.random.randint(low=75, high=90, size=n_cluster_3))

In [50]:
# Dataset (example)
data = {
    'client_id':['00'+str(i) for i in range(1,n_clients+1)],
    'nb_trades':trades,
    'nb_products':products,
    'aum':aum,
    'age':age
}

df = pd.DataFrame(data)
df

Unnamed: 0,client_id,nb_trades,nb_products,aum,age
0,001,3,4,298890,36
1,002,1,4,130392,44
2,003,2,1,160614,45
3,004,4,1,201998,54
4,005,1,4,229418,44
...,...,...,...,...,...
95,0096,20,17,657509,75
96,0097,20,8,715030,79
97,0098,20,16,437662,82
98,0099,21,12,761807,85


In [51]:
scaler = StandardScaler()

df_pre = scaler.fit_transform(df[['nb_trades','nb_products','aum','age']])

In [52]:
df_gmm = df.copy()
df_gmm_pre = df_pre.copy()

#### Clustering

In [53]:
gm = GaussianMixture(n_components=3, random_state=0).fit(df_gmm_pre)

In [54]:
df_gmm['label'] = gm.predict(df_gmm_pre)

In [55]:
dict_label = {
    0:'emerging clients',
    1:'active clients',
    2:'passive clients'
}

#### Cluster quality

In [56]:
df_gmm['silhouette'] = metrics.silhouette_samples(df_gmm_pre, df_gmm['label'])

In [57]:
dict_silhouette_clusters = {
    0:np.mean(df_gmm[df_gmm['label']==0]['silhouette']),
    1:np.mean(df_gmm[df_gmm['label']==1]['silhouette']),
    2:np.mean(df_gmm[df_gmm['label']==2]['silhouette'])
}

In [58]:
np.random.seed(0)

fig = go.Figure()

cluster_labels = df_gmm['label'].unique()

for idx, cluster_number in enumerate(cluster_labels):
    
    df_cluster = df_gmm[df_gmm['label']==cluster_number].copy()
    n_count = df_cluster.shape[0]
    
    # Define centers for each cluster
    center = mu_list[idx]
    
    # Generate coordinates
    df_simul_cluster = pd.DataFrame()
    df_simul_cluster['x'] = np.random.normal(center[0],1-dict_silhouette_clusters[cluster_number],n_count)
    df_simul_cluster['y'] = np.random.normal(center[1],1-dict_silhouette_clusters[cluster_number],n_count)
    
    # Compute the distance to center
    df_simul_cluster['distance_to_center'] = df_simul_cluster.apply(lambda x: distance_to_center(x, center), axis=1)
    
    # Associate coordinates to points w.r.t their distance to the center
    df_simul_cluster = df_simul_cluster.sort_values(by='distance_to_center')
    df_cluster = df_cluster.sort_values(by='silhouette', ascending=False)
    df_cluster['x'] = df_simul_cluster['x'].values
    df_cluster['y'] = df_simul_cluster['y'].values
    
    d = df_cluster.drop(columns=['x','y','label']).to_dict('index')
    info = [d[x] for x in d.keys()]
    info_str = [str(x).replace(',','<br>').replace('{','').replace('}','') for x in info]
#     df_cluster['info'] = info_str
    
    # Graph
    fig.add_trace(go.Scatter(
                             x=df_cluster['x'], 
                             y=df_cluster['y'], 
                             text=['client ' + val for val in df_cluster['client_id'].values],
                             name=dict_label[cluster_number],
                             mode='markers',
                             marker=dict(size=20),
                             marker_line=dict(width=2),
                             hovertemplate=info_str
                            )
                 )

fig.update_layout(yaxis_range=[-2,2])
fig.update_layout(xaxis_range=[-1,2])
fig.update_layout({'showlegend':False})
fig.update_xaxes(visible=False, scaleanchor='y', scaleratio=1)
fig.update_yaxes(visible=False, scaleanchor='x', scaleratio=1)
fig.show()