In [1]:
import numpy as np
import pandas as pd

from scipy.spatial import distance_matrix
from scipy.cluster.hierarchy import dendrogram, fcluster

import fastcluster

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

from matplotlib import pyplot as plt
%matplotlib inline

In [10]:
df = pd.read_csv('preprocessed.csv')

n = 40000 #len(df) #179149

df_slice = df.iloc[:n, :]

products = len(df_slice.code.unique())

In [11]:
df_slice.head()

Unnamed: 0,order_id,code,quantity,price,pis_cofins,icms,tax_substitution,category,sale_date,source_channel,sale_yearmon,sale_year,sale_month,sale_day,gross_margin,net_margin
0,bcb59c839e78b2601374cbad9239ca7b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,2016-06-11,b76eb9b8fc0f17098812da9117d3e500,2016-06,2016,6,11,0.249619,0.353096
1,4e91ee6b95895771dc9ee524e910a902,e6762ba2ffbca07ab6cee7551caeaad5,1,1036.29,95.8568,176.1693,0.0,4ece547755cba9e7fc14125bc895f31b,2016-06-11,b76eb9b8fc0f17098812da9117d3e500,2016-06,2016,6,11,0.476299,0.213799
2,88eb0ac86af1a521c0831298d22dea8b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,2016-06-12,b76eb9b8fc0f17098812da9117d3e500,2016-06,2016,6,12,0.249619,0.353096
3,dee418152a36314b4aee6ce9cf94fcbf,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,176.202,0.0,4ece547755cba9e7fc14125bc895f31b,2016-06-13,b76eb9b8fc0f17098812da9117d3e500,2016-06,2016,6,13,0.445596,0.173096
4,1c175bc61b9b659bbf011b2e5e3dcec6,e6762ba2ffbca07ab6cee7551caeaad5,1,976.05,90.2846,0.0,192.3325,4ece547755cba9e7fc14125bc895f31b,2016-06-13,b76eb9b8fc0f17098812da9117d3e500,2016-06,2016,6,13,0.246925,0.351477


### Normalizando as variáveis numéricas

In [12]:
normalizable = ['quantity', 'price', 'pis_cofins', 'icms', 'tax_substitution', 'gross_margin', 'net_margin']
df_norm = df_slice.loc[:, normalizable]
df_norm = (df_norm - df_norm.mean()) / df_norm.std()

df_norm = df_norm.drop(['quantity'], axis = 1) # Removing quantity because it distorts the clustering and it's the variable to forecast

### Transformação de variáveis categóricas em dummies

In [13]:
non_normalizable = [c for c in df_slice.columns.tolist() if c not in normalizable]
df_clustering = df_slice.loc[:, non_normalizable]

categorical = ['code', 'category', 'source_channel', 'sale_year', 'sale_month', 'sale_day']
prefix = ['code', 'category', 'source_channel', 'sale_year', 'sale_month', 'sale_day']
df_clustering = pd.get_dummies(df_clustering, columns=categorical, prefix=categorical)

df_clustering = df_clustering.drop(['sale_yearmon', 'sale_date'], axis = 1)

In [16]:
df_clustering = pd.concat([df_clustering, df_norm], axis=1)
df_clustering.head()

Unnamed: 0,order_id,code_09f544ec2a74c89abeec7b0590fc2d11,code_0bbe09e34a11e8e31cf49d6f8df2992d,code_193628b6634713730d3c506f2da0ff58,code_1dbe25b2fd344aed0c444fd6b715525b,code_29424aaf6e27a8dbe4b7273a0a39131d,code_2ab0e87dbce6ac45502aa1d2a8265933,code_32ceebf3efea1d04ace4183d20d4da5b,code_3657af9de7395eaba0dbcbcaa6fd90be,code_373cc1cfc10a45488be6b97bd5e94c44,...,icms,tax_substitution,gross_margin,net_margin,price,pis_cofins,icms.1,tax_substitution.1,gross_margin.1,net_margin.1
0,bcb59c839e78b2601374cbad9239ca7b,0,0,0,0,0,0,0,0,0,...,-0.91469,6.658135,-0.839158,0.900117,4.401853,4.397767,-0.91469,6.658135,-0.839158,0.900117
1,4e91ee6b95895771dc9ee524e910a902,0,0,0,0,0,0,0,0,0,...,5.47376,-0.651159,0.800188,-0.237955,4.741831,4.737242,5.47376,-0.651159,0.800188,-0.237955
2,88eb0ac86af1a521c0831298d22dea8b,0,0,0,0,0,0,0,0,0,...,-0.91469,6.658135,-0.839158,0.900117,4.401853,4.397767,-0.91469,6.658135,-0.839158,0.900117
3,dee418152a36314b4aee6ce9cf94fcbf,0,0,0,0,0,0,0,0,0,...,5.474946,-0.651159,0.578144,-0.570505,4.401853,4.397767,5.474946,-0.651159,0.578144,-0.570505
4,1c175bc61b9b659bbf011b2e5e3dcec6,0,0,0,0,0,0,0,0,0,...,-0.91469,6.676839,-0.858641,0.886892,4.384969,4.380903,-0.91469,6.676839,-0.858641,0.886892


In [None]:
pdist_clustering = fastcluster.pdist(df_clustering.iloc[:, 1:])

In [None]:
Z = fastcluster.linkage(df_clustering.iloc[:, 1:], 'average', preserve_input=False)

In [None]:
def fancy_dendrogram(*args, **kwargs):
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', True):
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('points')
        plt.ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
    return ddata

In [None]:
fancy_dendrogram(
    Z,
    leaf_rotation=90.,
    leaf_font_size=12.,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=30,  # show only the last p merged clusters
)
plt.show()

### Usar o Coeficiente de Correlação Cofenética para comparar (correlacionar) os pares de distâncias de todas as amostras com aqueles inferidos pelo agrupamento hierarquizado (clustering). Quanto mais próximo de 1, melhor o agrupamento preservou as distâncias originais

In [None]:
c, coph_dists = cophenet(Z, pdist_clustering)
c

### Trazer os labels dos agrupamentos de volta para o dataframe

In [None]:
products = len(df_slice.code.unique())
products

In [None]:
df_slice['labels'] = fcluster(Z, products, criterion='maxclust')

### Analisando outliers

In [None]:
df_slice['sale_yearmon'].value_counts()

In [None]:
outliers_clusters = df_slice['labels'].value_counts()
outliers_clusters = outliers_clusters[outliers_clusters == 1].index.tolist()
df_slice[df_slice['labels'].isin(outliers_clusters)]

### Outliers com preços mais elevados e quantidades maiores

### Separar Mês 05 para Teste de Forecast

In [None]:
df_slice[df_slice['sale_yearmon'] < '2017-05'][['code', 'quantity', 'sale_date', 'labels']].to_csv('train_clustered.csv', index=False)

In [None]:
df_slice[df_slice['sale_yearmon'] >= '2017-05'][['code', 'quantity', 'sale_date', 'labels']].to_csv('test_clustered.csv', index=False)