In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
import time
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import hstack
import datetime as dt

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 35, 40
%matplotlib inline

import matplotlib.cm as cm
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import __version__
from plotly import graph_objs as go
from plotly.graph_objs import *

## Читаем данные

In [None]:
# Load the sales table from a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle(
    'MG_Sales_customer.pickle',
    compression='gzip',
)

In [None]:
# Choose the customers that go into the analysis.
# Total purchased quantity per customer.
sales_sum = df.groupby('Покупатель')['Количество'].sum()
# Leave out customers whose total quantity is above 133 (original note: pooled cards)
# and customers whose total quantity is exactly 1.
sales_sum = sales_sum[~((sales_sum > 133) | (sales_sum == 1))]
customers_name = list(sales_sum.index)
del sales_sum

# Build the working sample: rows of the kept customers dated 2014-01-01 or later,
# restricted to the columns used below.
select = df.loc[
    df['Покупатель'].isin(customers_name) & (df['Дата'] >= dt.datetime(2014, 1, 1)),
    [
        'Покупатель',
        'ПокупательПол',
        'ПокупательДатаРождения',
        'ВидИзделия',
        'ПодвидИзделия',
        'СтильДизайна',
        'ВидДизайна',
        'ОсновнойКамень',
        'ГруппаТовара',
        'Коллекция',
        'ЦветМеталла',
        'ТоварСреднийВес',
        'Размер',
        'Вес',
        'Количество',
    ],
]
# Release the large intermediates; kernel memory persists across cells.
del customers_name, df

#Подготовка датасета
#ЦветМеталла=list(map(lambda xx: xx,list(select['ЦветМеталла'].unique())))
def codeMetall(_str):
    """Map a free-text metal description to a numeric code.

    The text is lower-cased and split on whitespace; the first token that is a
    known metal keyword decides the result.

    Parameters
    ----------
    _str : str
        Metal description. Any non-string value (e.g. ``None`` or a NaN from a
        missing cell) is treated as unknown instead of raising.

    Returns
    -------
    int
        0 for 'серебро', 10 for 'золото', 11 for 'зол.', 20 for 'платина',
        -10 for 'сплав', and -20 when no keyword is found.
    """
    # Missing cells reach this function as None/NaN, which have no .lower().
    if not isinstance(_str, str):
        return -20

    codes = {
        'серебро': 0,
        'золото': 10,
        'зол.': 11,
        'платина': 20,
        'сплав': -10,
    }
    # Tokens are checked in their original order, so the first keyword wins.
    for token in _str.lower().split():
        if token in codes:
            return codes[token]
    return -20

# Encode buyer gender: 'Ж' -> 0, 'М' -> 1, '<Неопределено>' -> missing.
# The dict is indexed (not passed to .map) so an unexpected value still raises KeyError.
select['ПокупательПолКод'] = select['ПокупательПол'].map(
    lambda xx: {'Ж': 0, 'М': 1, '<Неопределено>': None}[xx]
)
# Numeric code of the metal description.
select['ЦветМеталлаКод'] = select['ЦветМеталла'].map(codeMetall)
# Assign the filled column back explicitly: `select[col].fillna(..., inplace=True)` works on a
# temporary object and no longer updates `select` under pandas Copy-on-Write.
select['ПокупательПолКод'] = select['ПокупательПолКод'].fillna(select['ПокупательПолКод'].median())

# Birth year; values outside 1917..2010 are treated as missing and replaced by the median.
select['ПокупательГодРождения'] = select['ПокупательДатаРождения'].dt.year
select['ПокупательГодРождения'] = select['ПокупательГодРождения'].where(
    select['ПокупательГодРождения'].between(1917, 2010)
)
select['ПокупательГодРождения'] = select['ПокупательГодРождения'].fillna(
    select['ПокупательГодРождения'].median()
)

# NOTE(review): 'ПокупательПолКод' and 'ПокупательГодРождения' computed above are dropped again
# here, so they never reach the feature matrix - confirm this is intentional.
select = select.drop(
    columns=['ПокупательДатаРождения', 'ПокупательПол', 'ЦветМеталла', 'ПокупательПолКод', 'ПокупательГодРождения']
)

# Split the remaining columns by dtype: 'object' columns are treated as categorical,
# everything else as numeric.
numerical_columns = [c for c in select.columns if select[c].dtype.name != 'object']
categorial_columns = [c for c in select.columns if select[c].dtype.name == 'object']


# Dummy-encode the categorical columns and scale the numeric ones.
lb_style = LabelBinarizer(sparse_output=True)
# One binarized block per categorical column; the binarizer is re-fitted for each column.
concList = [lb_style.fit_transform(select[col]) for col in categorial_columns]
# Append the standardized numeric columns as the last block.
concList.append(StandardScaler().fit_transform(select[numerical_columns]))
# Join all blocks side by side into one feature matrix.
X = hstack(concList)

del concList
print('shape ', X.shape)
print('Prepare finished')

In [None]:
from sklearn.decomposition import TruncatedSVD

# Exploratory fit with 150 components to look at the explained-variance curve.
svd = TruncatedSVD(n_components=150, n_iter=5)
svd_representation = svd.fit_transform(X)
# Cumulative explained variance in percent.
var1 = np.cumsum(np.round(svd.explained_variance_ratio_, decimals=5) * 100)
plt.plot(var1[-50:])

# Choose the number of components: cumulative variance above 90% AND the component's own
# variance ratio at most 1e-4.
candidates = np.intersect1d(
    np.argwhere(var1 > 90.),
    np.argwhere(svd.explained_variance_ratio_ <= 10**-4),
)
# No component meets both criteria: fail with a real exception. The previous
# `raise '<str>'` is itself a TypeError in Python 3 and was unreachable, because
# indexing the empty intersection raised IndexError first.
if candidates.size == 0:
    raise ValueError('Not enough n_components!')

# NOTE(review): this is the 0-based index of the first qualifying component and is used
# directly as n_components below - confirm whether index + 1 was intended.
optimal_n = int(candidates[0])
print(optimal_n)

# Final reduction with the selected number of components.
svd = TruncatedSVD(n_components=optimal_n, n_iter=7)
svd_representation = svd.fit_transform(X)
print('reduced')

In [None]:
# The full feature matrix is not used by the plot below; free it.
# NOTE(review): any later code that still reads the global X will fail after this line.
del X

init_notebook_mode(connected=True)

# Number of leading rows to plot and the column used for colouring.
lim = 30000
val = 'ЦветМеталлаКод'
#----------
# Distinct values of the colouring column among the plotted rows.
ЦветМеталла = list(select.iloc[:lim][val].unique())
colors = list(iter(cm.rainbow(np.linspace(0, 1, len(ЦветМеталла)))))
# Colour index of each plotted row = position of its value in the list above.
cl = select.iloc[:lim][val].map(lambda xx: ЦветМеталла.index(xx))

# 3-D scatter of the first three SVD components.
trace1 = go.Scatter3d(
    x=svd_representation[:lim, 0],
    y=svd_representation[:lim, 1],
    z=svd_representation[:lim, 2],
    mode='markers',
    marker=dict(
        size=2,
        # Other built-in scales: Greys, YlGnBu, Greens, YlOrRd, Bluered, RdBu, Reds, Blues,
        # Picnic, Rainbow, Portland, Jet, Hot, Blackbody, Earth, Electric, Viridis
        colorscale='Rainbow',
        opacity=0.8,
        color=cl,
        # A plain dict replaces the deprecated top-level ColorBar class that was
        # pulled in through `from plotly.graph_objs import *`.
        colorbar=dict(title=val),
    ),
)

data = [trace1]
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='SVD продаж')
# Legend for the colour indices: position in this list == colour index.
print(ЦветМеталла)

def compKMeans(min_clusters, max_clusters, data=None):
    """Fit MiniBatchKMeans for each k in [min_clusters, max_clusters) and plot the elbow curve.

    Parameters
    ----------
    min_clusters : int
        Smallest number of clusters to try.
    max_clusters : int
        Exclusive upper bound. If it equals ``min_clusters`` it is bumped by one
        so that exactly one model is fitted.
    data : array-like, optional
        Matrix to cluster. When omitted, the notebook-level ``svd_representation``
        is used. (The function used to read the global ``X``, which the notebook
        deletes before calling it.)

    Returns
    -------
    MiniBatchKMeans
        The model fitted for the last (largest) k.

    Raises
    ------
    ValueError
        If the resulting range of k is empty (``min_clusters > max_clusters``).

    Side effects
    ------------
    Prints k and the fit time for every model, sets the global matplotlib
    ``figure.figsize`` to 14x8, and draws sqrt(inertia) against k on the current axes.
    """
    if data is None:
        data = svd_representation
    if min_clusters == max_clusters:
        max_clusters += 1
    rg = range(min_clusters, max_clusters)
    if len(rg) == 0:
        raise ValueError(
            'min_clusters (%d) must not exceed max_clusters (%d)' % (min_clusters, max_clusters)
        )

    inertia = []
    model = None
    for k in rg:
        started = time.time()
        model = MiniBatchKMeans(
            n_clusters=k,
            max_iter=150,
            max_no_improvement=25,
            n_init=15,
            tol=.01,
            batch_size=15,
            random_state=17,
        ).fit(data)
        elapsed = time.time() - started

        # Square root of the model's inertia_ for the elbow plot.
        inertia.append(np.sqrt(model.inertia_))

        print('n_clusters: %d' % k)
        print('Elapsed time to cluster: %.4f s' % elapsed)

    plt.rcParams['figure.figsize'] = 14, 8
    plt.plot(rg, inertia, marker='s')
    plt.xlabel('$k$')
    plt.ylabel('$J(C_k)$')

    return model

# Single run with k = 10 (the upper bound is exclusive).
hdb = compKMeans(min_clusters=10, max_clusters=11)

In [None]:
# Number of clusters for the MiniBatchKMeans fit on the SVD-reduced data.
n_clusters = 7

# Time the fit.
hdb_t1 = time.time()
hdb = MiniBatchKMeans(
    n_clusters=n_clusters,
    max_iter=150,
    max_no_improvement=25,
    n_init=15,
    tol=.01,
    batch_size=15,
    random_state=17,
).fit(svd_representation)

hdb_elapsed_time = time.time() - hdb_t1
print('MiniBatchKMeans Elapsed time to cluster: %.4f s' % (hdb_elapsed_time))

In [None]:
from sklearn.model_selection import ShuffleSplit

# Search for a suitable min_cluster_size on a random 50 000-row subsample.
# The split is seeded so the same subsample is drawn on every run.
ss = ShuffleSplit(n_splits=1, train_size=50000, random_state=17)
# split() returns a one-shot generator. It is materialised here so that every
# min_cluster_size below is evaluated; iterating the bare generator inside the outer
# loop exhausted it after the first value and silently skipped all the others.
subs = list(ss.split(svd_representation))

print('HDBSCAN starts.')
for min_cluster_size in np.arange(600, 1500, 100):
    for index in subs:
        print('min_cluster_size=', min_cluster_size)

        hdb_t1 = time.time()
        # index[0] holds the row positions of the subsample.
        X = svd_representation[index[0]]

        hdb = HDBSCAN(min_cluster_size=int(min_cluster_size), min_samples=1, core_dist_n_jobs=14).fit(X)

        cluster_labels = hdb.labels_
        # Number of clusters in labels, ignoring noise (label -1) if present.
        n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)

        hdb_elapsed_time = time.time() - hdb_t1
        print('HDBSCAN Elapsed time to cluster: %4.1f h' % (hdb_elapsed_time / 3600))

        # The silhouette needs at least two distinct labels; skip this setting instead
        # of aborting the whole sweep.
        if len(set(cluster_labels)) < 2:
            print('Only one label found, silhouette is undefined - skipping.')
            continue

        hdb_t1 = time.time()

        # Per-sample silhouette values. Their mean is what silhouette_score returns,
        # so the expensive pairwise computation is done once instead of twice.
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
        silhouette_avg = np.mean(sample_silhouette_values)
        print('Elapsed time to cluster: %6.1f m' % ((time.time() - hdb_t1) / 60))
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        fig, ax1 = plt.subplots(1, 1)
        fig.set_size_inches(18, 8)

        # The silhouette coefficient can range from -1 to 1.
        ax1.set_xlim([-1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(n_clusters):
            # Silhouette values of the samples in cluster i, sorted ascending.
            ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # cm.spectral no longer exists in Matplotlib; nipy_spectral is its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle.
            ax1.text(-0.5, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for the next plot (10 rows of padding).
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line marks the average silhouette score of all the values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

        plt.suptitle(("Silhouette analysis for HDBSCAN clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
        # Release the figure so a long sweep does not accumulate open figures.
        plt.close(fig)

In [None]:
# Fit HDBSCAN on the whole SVD-reduced matrix and time it.
hdb_t1 = time.time()
if __name__ == '__main__':
    hdb = HDBSCAN(
        min_cluster_size=11000,
        min_samples=1,
        core_dist_n_jobs=14,
    ).fit(svd_representation)

hdb_labels = hdb.labels_
# Count the distinct labels, leaving out the noise label -1 when it occurs.
n_clusters = len(set(hdb_labels) - {-1})

hdb_elapsed_time = time.time() - hdb_t1
print('HDBSCAN Elapsed time to cluster: %4.1f h' % (hdb_elapsed_time / 3600))

In [None]:
from sklearn.model_selection import ShuffleSplit

# Silhouette score of the fitted clustering on a random 40 000-row subsample.
# The split is seeded so the reported score is reproducible.
ss = ShuffleSplit(n_splits=1, train_size=40000, random_state=17)
subs = ss.split(svd_representation, hdb.labels_)

for index in subs:
    hdb_t1 = time.time()
    # index[0] holds the row positions of the subsample.
    silhouette_avg = silhouette_score(svd_representation[index[0]], hdb.labels_[index[0]])
    print("For n_clusters =", n_clusters,
        "The average silhouette_score is :", silhouette_avg)

    print('Elapsed time to cluster: %6.1f m' % ((time.time() - hdb_t1) / 60))

init_notebook_mode(connected=True)
# Number of leading rows to plot.
lim = 30000

# 3-D scatter of the first three SVD components, coloured by cluster label.
trace1 = go.Scatter3d(
    x=svd_representation[:lim, 0],
    y=svd_representation[:lim, 1],
    z=svd_representation[:lim, 2],
    mode='markers',
    marker=dict(
        size=2,
        # Other built-in scales: Greys, YlGnBu, Greens, YlOrRd, Bluered, RdBu, Reds, Blues,
        # Picnic, Rainbow, Portland, Jet, Hot, Blackbody, Earth, Electric, Viridis
        colorscale='Rainbow',
        opacity=0.8,
        color=hdb.labels_[:lim],
        # A plain dict replaces the deprecated top-level ColorBar class that was
        # pulled in through `from plotly.graph_objs import *`.
        colorbar=dict(title='Кластеры'),
    ),
)

data = [trace1]
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='Кластеризация продаж')

In [None]:
import seaborn as sns

# Seaborn presentation settings for the tree plot.
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

# Draw the condensed cluster tree of the fitted model with the selected clusters
# highlighted, using the first 10 colours of the 'deep' palette.
hdb.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=sns.color_palette('deep', 10),
)

In [None]:
# Turn the numeric metal codes back into readable names (codes 10 and 11 both map to 'золото').
metal_code_names = {0: 'серебро', 10: 'золото', 11: 'золото', 20: 'платина', -10: 'сплав', -20: 'прочее'}
# Values that are already decoded are passed through, so the cell can be re-run without a
# KeyError; any other unknown value still raises KeyError.
select['ЦветМеталлаКод'] = select['ЦветМеталлаКод'].map(
    lambda xx: xx if xx in metal_code_names.values() else metal_code_names[xx]
)
# The gender code column is not decoded here: 'ПокупательПолКод' was dropped from `select` earlier.

In [None]:
# Build one summary row per cluster.
# Cluster ids are taken from the labels themselves (noise label -1 excluded), so this works
# for any fitted model that exposes labels_; an HDBSCAN model has no cluster_centers_.
cluster_ids = sorted(int(label) for label in set(hdb.labels_) - {-1})
summary_columns = list(select.columns) + ['УникальныхПокупателей', 'ЧастотаПокупок']

cluster_rows = []
for cl in cluster_ids:
    sel = select.loc[hdb.labels_ == cl].describe(include='all')
    # Per column: the most frequent value ('top') where describe() provides one,
    # otherwise the mean.
    row = sel.loc['top'].combine_first(sel.loc['mean']).to_dict()
    row['УникальныхПокупателей'] = sel.loc['unique', 'Покупатель']
    row['ЧастотаПокупок'] = sel.loc['freq', 'Покупатель']
    cluster_rows.append(row)

# Create the frame once instead of growing it row by row; rows are numbered from 1.
clusters_data = pd.DataFrame(
    cluster_rows,
    index=[cl + 1 for cl in cluster_ids],
    columns=summary_columns,
)
clusters_data = clusters_data.drop(columns='Количество')
clusters_data = clusters_data.sort_values('УникальныхПокупателей', ascending=False)
clusters_data

In [None]:
# Cumulative explained variance of the current SVD, in percent (ratios rounded to 4 decimals).
var1 = (np.round(svd.explained_variance_ratio_, decimals=4) * 100).cumsum()
# Show only the last 50 points of the curve.
plt.plot(var1[-50:])

In [None]:
 # Display the shape of the matrix returned by the SVD fit_transform.
 svd_representation.shape
    

In [None]:
from sklearn.model_selection import ShuffleSplit

# Draw one random 50 000-row subsample and show the cluster labels of its rows.
ss = ShuffleSplit(n_splits=1, train_size=50000)
subs = ss.split(svd_representation, hdb.labels_)
# Each split is a (train, test) pair; keep the train positions of the only split.
index = next(subs)[0]

hdb.labels_[index]

In [None]:
# Scratch expression: the candidate min_cluster_size grid (the value is not stored).
np.arange(500, 1500, 100)

from IPython.display import display

# Show descriptive statistics for every cluster that is actually present in the labels
# (noise label -1 excluded), instead of assuming a fixed number of six clusters.
for cl in sorted(int(label) for label in set(hdb.labels_) - {-1}):
    print('cluster=', (cl + 1))
    display(select.loc[hdb.labels_ == cl].describe(include='all'))

# Total quantity per product type within cluster 0, largest first.
select.loc[hdb_labels == 0].groupby('ВидИзделия')['Количество'].sum().sort_values(ascending=False)