In [None]:
# default_exp nlp.fasttext

# FastText - UMAP - HDBSCAN

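> Cluster short product descriptions: fastText sentence embeddings, UMAP dimension reduction, then HDBSCAN clustering.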

In [None]:
import os
import time
import sys
import inspect
import datetime

import pandas as pd
import numpy as np

# NOTE(review): the bare `except` also hides failures other than a missing
# package, and the fallback binds `fastText` only, while later cells call
# `fasttext.load_model`.
try:
    from fasttext import train_unsupervised
    import fasttext
except:
    from fastText import train_unsupervised
    import fastText

import umap.umap_ as umap
import hdbscan
import nltk

import plotly.graph_objs as go
import plotly.offline as py
from IPython.display import Image,display

In [None]:
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Lift the pandas display limits so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Get Data

In [None]:
# Folder that holds the fastText training text and the saved models.
fasttext_data_folder = "../data/fasttext_data"
# Text file used both as the fastText training input and as the source of the
# descriptions loaded into the DataFrame.
train_data = os.path.join(fasttext_data_folder,"text_file.text")

In [None]:
# The training file holds one description per line. Read it line by line
# instead of parsing it as CSV, so that a comma or a double quote inside a
# description cannot split a row into several fields or merge several rows.
# Blank lines are skipped.
with open(train_data, encoding="utf-8") as descriptions_file:
    text_df = pd.DataFrame(
        {"itemDesc": [line.rstrip("\n") for line in descriptions_file if line.strip()]}
    )
text_df.head()

Unnamed: 0,itemDesc
0,MINI DONUTS FOURRES ASSORTI X6
1,NAVETTE ASSORTIMENT X30
2,BRIOCHE MOUNA 400G
3,BRIOCHETTE ST GENIX X4 LOCAL
4,BRIOCHE ST GENIX LOCAL


## FastText training

In [None]:
# Paths under which the trained CBOW and skip-gram models are saved.
cbow_model = os.path.join(fasttext_data_folder,"model_cbow.bin")
skipgram_model = os.path.join(fasttext_data_folder,"model_skipgram.bin")


**train_unsupervised parameters** (default values in brackets)

    input             # training file path (required)
    model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
    lr                # learning rate [0.05]
    dim               # size of word vectors [100]
    ws                # size of the context window [5]
    epoch             # number of epochs [5]
    minCount          # minimal number of word occurrences [5]
    minn              # min length of char ngram [3]
    maxn              # max length of char ngram [6]
    neg               # number of negatives sampled [5]
    wordNgrams        # max length of word ngram [1]
    loss              # loss function {ns, hs, softmax, ova} [ns]
    bucket            # number of buckets [2000000]
    thread            # number of threads [number of cpus]
    lrUpdateRate      # change the rate of updates for the learning rate [100]
    t                 # sampling threshold [0.0001]
    verbose           # verbose [2]

The following parameters only apply to supervised training (`train_supervised`):

    minCountLabel     # minimal number of label occurrences [1]
    label             # label prefix ['__label__']
    pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []

In [None]:
# Which unsupervised fastText architecture to train: "skipgram" or "cbow".
model_to_use = "skipgram"

if model_to_use == "skipgram":
    # Skipgram model :
    model_skipgram = train_unsupervised(train_data, model='skipgram')
    model_skipgram.save_model(skipgram_model)

elif model_to_use == "cbow":
    # Cbow model :
    model_cbow = train_unsupervised(train_data, model='cbow')
    model_cbow.save_model(cbow_model)

else:
    # Fail fast: a typo used to fall through to CBOW training silently.
    raise ValueError(
        "model_to_use must be 'skipgram' or 'cbow', got {!r}".format(model_to_use)
    )

In [None]:
# Load the fastText model that was just trained and saved. The path follows
# `model_to_use`, so a CBOW run no longer loads a stale or missing skip-gram file.
# The variable keeps the name `model_skipgram` because the following cells use it.
trained_model_path = skipgram_model if model_to_use == "skipgram" else cbow_model
model_skipgram = fasttext.load_model(trained_model_path)




In [None]:
# Dimension reported by the loaded model (the output below shows 100).
model_skipgram.get_dimension()

100

In [None]:
def query_to_vector(col_query, model_fastText):
    """Embed every text of a pandas Series with a fastText model.

    Each value has its newline characters replaced by spaces and is then
    passed to ``model_fastText.get_sentence_vector``. As noted by the original
    author, get_sentence_vector divides each word vector of the text by its
    norm and averages the results.

    Parameters
    ----------
    col_query : Series of strings to embed.
    model_fastText : object exposing ``get_sentence_vector(text)``.

    Returns
    -------
    Series with the same index as ``col_query`` holding one sentence vector
    per input text.
    """
    def _embed(text):
        # Newlines are replaced before the text is handed to the model.
        single_line = text.replace('\n', ' ')
        return model_fastText.get_sentence_vector(single_line)

    return col_query.apply(_embed)

In [None]:
%timeit text_df['vector'] = query_to_vector(text_df['itemDesc'], model_skipgram)
del model_skipgram # We do not need the pretrained-vector in memory

13.1 ms ± 831 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Preview the first rows, which now include the 'vector' column.
text_df.head(10)

Unnamed: 0,itemDesc,vector
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02189265, 0.10787461, -0.014234907, -0.0226..."
1,NAVETTE ASSORTIMENT X30,"[0.014975755, 0.04644253, -0.04249201, -0.0160..."
2,BRIOCHE MOUNA 400G,"[-0.0522879, 0.001154994, 0.0076559577, -0.075..."
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009413259, -0.017262919, -0.007244551, -0...."
4,BRIOCHE ST GENIX LOCAL,"[-0.01687116, -0.03279358, 0.000291368, -0.017..."
5,NAVETTE X20,"[0.026375797, -0.00427916, -0.047491483, -0.00..."
6,TARTE AU SUCRE 6P LOCAL,"[-0.022558345, -0.087581225, -0.011884119, 0.0..."
7,MINI SUISSE X8,"[0.09554361, 0.010526974, 0.14405009, -0.13391..."
8,4 P RAIS+4 P CHOC.+4 CROIS. PB,"[0.012128612, 0.0345035, -0.043019157, 0.03134..."
9,X5 POCHE TALOAK,"[0.011469854, 0.007848586, 0.007189207, -0.033..."


## Dimension reduction with UMAP

In [None]:
# Stack the per-row vectors of the 'vector' column into a single numpy array
# with one row per description.
vecs = np.stack(text_df['vector'].to_numpy(), axis=0)
vecs.shape

(583, 100)

In [None]:
# One "FT_<i>" column per embedding dimension. The count is taken from the
# array itself, so it stays correct if the model dimension is not 100.
col_names = ["FT_{}".format(dim) for dim in range(vecs.shape[1])]
print(len(col_names))
test = pd.DataFrame(vecs, columns=col_names)

100


In [None]:
# Show only the first rows: display.max_rows is unlimited in this notebook, so
# displaying the full concatenated frame would render every row.
pd.concat([text_df,test], axis=1).head(10)

Unnamed: 0,itemDesc,vector,FT_0,FT_1,FT_2,FT_3,FT_4,FT_5,FT_6,FT_7,FT_8,FT_9,FT_10,FT_11,FT_12,FT_13,FT_14,FT_15,FT_16,FT_17,FT_18,FT_19,FT_20,FT_21,FT_22,FT_23,FT_24,FT_25,FT_26,FT_27,FT_28,FT_29,FT_30,FT_31,FT_32,FT_33,FT_34,FT_35,FT_36,FT_37,FT_38,FT_39,FT_40,FT_41,FT_42,FT_43,FT_44,FT_45,FT_46,FT_47,FT_48,FT_49,FT_50,FT_51,FT_52,FT_53,FT_54,FT_55,FT_56,FT_57,FT_58,FT_59,FT_60,FT_61,FT_62,FT_63,FT_64,FT_65,FT_66,FT_67,FT_68,FT_69,FT_70,FT_71,FT_72,FT_73,FT_74,FT_75,FT_76,FT_77,FT_78,FT_79,FT_80,FT_81,FT_82,FT_83,FT_84,FT_85,FT_86,FT_87,FT_88,FT_89,FT_90,FT_91,FT_92,FT_93,FT_94,FT_95,FT_96,FT_97,FT_98,FT_99
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02189265, 0.10787461, -0.014234907, -0.0226...",0.021893,0.107875,-0.014235,-0.022613,0.028204,0.087868,-0.041379,0.034871,-0.038356,0.059184,-0.016758,-0.067529,-0.010123,0.04163,0.0186,-0.033242,-0.008634,-0.075442,-0.076242,-0.031079,-0.031463,-0.010116,-0.019682,0.002145,-0.017118,0.034094,0.046754,0.013679,-0.010289,-0.016059,-0.001146,-0.051525,0.071037,-0.055208,0.027236,-0.115201,-0.011364,-0.041341,-0.006985,-0.002378,0.018476,-0.029233,-0.05906,-0.060528,0.035874,-0.015067,0.071754,0.067342,0.02484,0.022026,-0.020118,-0.039323,-0.07698,0.03325633,0.004085,0.00725,0.00576,-0.037471,-0.006596,0.033018,0.10929,-0.067883,-0.014137,-0.016261,0.012785,-0.035506,-0.030905,-0.034955,-0.025966,-0.051804,0.035805,0.080714,-0.037328,0.019347,-0.025617,0.018011,0.074893,0.010634,-0.003897,0.003587,0.047191,-0.005397,0.038612,-0.001835,0.001973,-0.01114,-0.029869,-0.032659,0.048333,0.0062,-0.016012,-0.074827,0.016927,-0.076514,0.034602,-0.009376,0.104716,-0.032211,-0.043635,-0.098906
1,NAVETTE ASSORTIMENT X30,"[0.014975755, 0.04644253, -0.04249201, -0.0160...",0.014976,0.046443,-0.042492,-0.01602,0.060589,-0.001802,-0.007963,0.154584,-0.047432,0.0012,0.032157,-0.059554,0.000718,0.001321,0.161345,-0.091178,-0.071525,0.090844,-0.095655,0.029134,-0.081196,0.044388,-0.016175,0.02427,-0.010981,-0.07822,0.084995,0.005192,-0.034364,-0.063775,-0.033751,0.063376,0.003899,-0.017413,-0.007193,-0.06229,-0.106323,-0.081042,-0.02346,-0.035445,0.023878,0.077496,0.040673,-0.087632,-0.048149,0.029242,-0.020232,-0.013576,-0.000162,-0.003269,0.059127,0.015152,-0.114693,-0.05696078,-0.020701,-0.028144,-0.116559,0.071883,-0.00263,-0.021492,-0.057264,-0.058594,0.019816,-0.075325,-0.025922,0.038827,-0.069613,-0.000352,0.081464,-0.058647,0.071187,-0.005142,-0.045691,-0.052592,-0.041623,0.09452,0.106152,-0.012685,-0.102757,0.048036,-0.119291,0.007339,0.008281,0.128907,0.028417,-0.079641,0.045603,-0.008531,-0.040964,-0.006577,-0.094621,-0.04771,0.011104,-0.078821,0.081602,0.060157,0.022344,-0.001476,-0.122729,-0.096902
2,BRIOCHE MOUNA 400G,"[-0.0522879, 0.001154994, 0.0076559577, -0.075...",-0.052288,0.001155,0.007656,-0.075826,0.011916,0.064263,-0.018506,0.027885,0.002566,-0.022193,-0.051448,0.038506,-0.013118,-0.007563,0.048938,0.06603,0.004098,0.000231,0.013453,-0.028538,0.067185,0.081719,-0.071251,0.062017,-0.029551,-0.018661,-0.022595,-0.008173,0.016262,-0.010919,-0.005681,0.020875,-0.018665,-0.064767,-0.04966,0.037674,-0.018416,-0.010805,-0.148198,-0.020311,0.04614,0.063624,-0.055314,0.033542,-0.02462,0.047605,0.064045,0.028681,0.077954,-0.031763,-0.018746,-0.010235,-0.048626,-0.0232327,-0.143056,-0.02913,0.02699,0.028632,0.018002,0.063775,-0.027766,-0.055045,-0.087512,-0.024886,-0.017916,-0.036391,0.023741,0.018399,-0.055502,0.05611,0.060544,0.01717,-0.041984,-0.019636,-0.081729,-0.075883,0.020914,0.087724,-0.014154,0.004644,-0.041628,0.026734,0.09623,0.105805,-0.008227,-0.062245,-0.01589,0.058891,-0.071408,-0.023343,0.039281,-0.050999,-0.062892,-0.08043,-0.132587,-0.143349,-0.024183,-0.058106,0.012511,-0.110218
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009413259, -0.017262919, -0.007244551, -0....",-0.009413,-0.017263,-0.007245,-0.020935,0.027331,0.004281,-0.031993,0.004024,-0.055252,-0.096062,-0.016239,0.009742,-0.006918,-0.030301,0.017237,-0.032608,0.027304,-0.014444,-0.017272,0.008747,0.092449,0.043832,0.010388,-0.025744,-0.058279,-0.08321,0.089228,-0.024887,-0.01906,0.078222,-0.047219,0.026244,0.035602,-0.052397,-0.042218,-0.020123,-0.00578,-0.04064,-0.095895,-0.037615,0.038064,0.018639,-0.065958,0.009204,0.006499,0.03235,0.034347,-0.046257,0.007359,0.038289,0.023252,-0.023397,-0.067081,-0.041963,-0.059333,-0.029338,-0.073214,0.018323,0.033629,-0.015795,-0.054034,-0.016758,0.001131,0.053178,-0.015028,0.000778,-0.051565,-0.024575,-0.065954,0.001848,-0.001302,-0.065051,0.037626,-0.011649,0.003174,0.110847,0.094201,0.000216,0.056755,0.00675,-0.054492,-0.040869,0.086781,0.000353,-0.047047,-0.005542,-0.040545,-0.045441,-0.069074,0.003833,-0.054055,0.025256,-0.0069,-0.001449,-0.022238,0.010084,0.029685,0.014023,0.005983,-0.029698
4,BRIOCHE ST GENIX LOCAL,"[-0.01687116, -0.03279358, 0.000291368, -0.017...",-0.016871,-0.032794,0.000291,-0.017919,0.015038,0.026463,-0.052469,0.006603,-0.071688,-0.119074,-0.034064,0.000557,0.072748,-0.035513,0.004059,-0.028297,0.0538,0.01525,-0.025116,-0.001646,0.109998,0.0338,-0.033218,-0.033244,-0.094368,-0.063856,0.0691,-0.035381,-0.036636,0.092399,-0.000623,0.002687,0.039491,-0.032728,-0.03126,0.010385,0.009843,-0.054438,-0.102099,-0.009694,0.055944,-0.013697,-0.059724,0.044047,-0.003094,0.011312,0.030869,-0.047064,0.010296,0.018251,0.004799,-0.011626,-0.071569,-0.02008079,-0.046603,-0.018174,-0.058751,0.029818,0.051645,-0.004435,-0.029343,0.014425,0.066525,0.117471,-0.004681,0.016662,-0.046441,-0.015734,-0.040655,0.029139,0.005387,-0.036833,0.018789,-0.003052,-0.042523,0.089602,0.064173,-0.018503,0.03618,0.021938,-0.041944,-0.063878,0.1159,0.044965,-0.071323,-0.006916,-0.069146,-0.01205,0.000464,0.017097,-0.003941,0.069586,-0.011811,0.053294,-0.059059,-0.020572,0.008423,-0.030807,-0.025168,-0.036669
5,NAVETTE X20,"[0.026375797, -0.00427916, -0.047491483, -0.00...",0.026376,-0.004279,-0.047491,-0.003983,0.136632,-0.03408,0.076736,0.098081,0.04283,-0.046722,-0.052582,0.000506,-0.000609,-0.032093,0.017714,0.019219,-0.095653,-0.058896,-0.034221,0.067282,-0.054073,-0.036608,0.08073,-0.056929,-0.037028,-0.00708,0.031535,0.134368,-0.068124,0.002148,-0.018411,0.034251,-0.01438,-0.003181,-0.090125,-0.078457,0.002607,-0.078076,-0.0206,0.066655,0.050635,0.048887,-0.01372,-0.080355,0.070606,-0.02764,-0.144004,0.180135,-0.053933,-0.10732,0.017125,-0.074281,-0.052647,0.04238446,-0.037213,0.035008,-0.048648,-0.034572,0.029993,-0.059687,-0.061503,-0.086053,0.1387,-0.01732,0.093028,-0.012752,-0.054483,-0.026414,0.168715,-0.077599,0.026388,0.019606,0.007915,-0.043366,0.023887,0.128012,-0.002427,-0.025626,-0.027296,-0.001296,-0.03145,-0.063722,-0.048211,0.071122,0.089876,-0.057895,0.104798,0.057363,-0.11646,-0.017596,-0.032979,-0.031798,0.059383,-0.10651,0.059109,0.026094,0.059829,0.007648,0.017787,-0.08352
6,TARTE AU SUCRE 6P LOCAL,"[-0.022558345, -0.087581225, -0.011884119, 0.0...",-0.022558,-0.087581,-0.011884,0.032449,-0.027057,-0.014646,0.050587,-0.044931,-0.092235,-0.024257,0.006416,0.042916,-0.070155,0.013814,0.019952,-0.019092,0.028434,0.009004,-0.062452,-0.045626,-0.016477,0.009886,-0.000727,0.031561,-0.024817,-0.034528,0.033128,-0.041183,-0.044121,0.058677,-0.051304,0.011015,-0.052567,-0.031451,-0.04041,0.037588,0.033636,-0.084022,-0.029479,0.061447,-0.048129,0.00697,-0.002861,-0.030836,-0.02796,-0.023762,0.063677,-0.066145,-0.014294,-0.020325,-0.012455,0.042742,0.032008,0.01914604,-0.039446,-0.054061,-0.081206,0.075217,-0.008487,-0.013916,0.000837,0.010595,0.037071,0.103887,0.00633,0.025734,-0.07935,0.036712,0.007111,0.020939,-0.072725,-0.000201,-0.013502,-0.000582,0.055781,-0.009728,0.09861,0.033393,0.040582,0.027963,-0.068681,-0.009925,0.016061,0.055287,-0.004078,-0.027298,0.021851,-0.017429,-0.0065,-0.048586,-0.064322,0.026178,-0.04235,0.022127,-0.054704,0.049771,0.00982,-0.051229,0.003193,-0.026428
7,MINI SUISSE X8,"[0.09554361, 0.010526974, 0.14405009, -0.13391...",0.095544,0.010527,0.14405,-0.133913,0.065542,0.070541,0.026395,0.052987,0.022732,0.060017,-0.012144,-0.029572,-0.094068,-0.07145,0.003006,0.028519,-0.08153,-0.103607,-0.078786,-0.053476,0.015133,-0.016004,0.104116,-0.049165,-0.03481,0.009619,-0.031889,0.111253,-0.089998,0.05156,-0.082798,-0.032395,0.046196,-0.014931,-0.107602,-0.05247,-0.048247,-0.053488,0.039826,-0.021962,0.063367,0.001082,-0.042307,-0.059877,-0.02139,-0.012479,0.030794,0.058588,0.045505,0.021521,-0.042564,0.074309,-0.036796,0.08138306,-0.086861,0.021132,0.026161,-0.01165,0.088202,0.031306,0.069232,-0.067446,-0.096793,0.037922,0.033964,-0.020623,0.024035,0.070193,0.092777,0.102043,-0.139863,0.024585,0.062817,-0.163406,0.00272,-0.045962,0.062267,0.000551,0.030123,-0.03364,0.042558,-0.04939,-0.0059,-0.027347,0.076493,0.05654,-0.079625,0.020372,0.086324,-0.03913,0.026031,-0.033761,0.02648,-0.036964,-0.017316,-0.037459,0.034762,0.036744,0.014643,0.054865
8,4 P RAIS+4 P CHOC.+4 CROIS. PB,"[0.012128612, 0.0345035, -0.043019157, 0.03134...",0.012129,0.034504,-0.043019,0.031349,0.037943,-0.033716,-0.017028,0.053265,-0.011027,0.074995,0.015844,-0.019859,-0.012782,-0.013312,-0.040039,0.000855,-0.004575,0.101911,0.008681,-0.055465,-0.010427,0.00288,0.091634,0.008084,-0.003379,0.086172,0.055282,0.027551,0.005733,0.008921,-0.001698,0.040414,-0.052231,-0.07635,0.00494,-0.001557,0.12185,-0.018253,2.2e-05,-0.046328,-0.037183,0.009333,-0.022143,0.042838,-0.024325,-0.064779,0.052069,0.042633,0.00132,0.006042,-0.042953,-0.018443,-0.09006,-0.05335692,0.059462,-0.025956,0.055219,0.076949,-0.014598,-0.073576,-0.012437,0.044895,0.059667,-0.080325,0.046159,0.0414,0.03128,0.029624,-0.026657,-0.026112,0.04625,0.038576,0.044235,0.062187,-0.113027,0.011125,0.04348,0.015996,0.035488,0.033779,0.031622,0.03366,-0.023738,0.086169,0.025445,0.032409,-0.080573,0.042239,-0.049532,-0.003257,0.058517,-0.02844,0.009034,-0.060736,0.008528,-0.03473,0.09512,0.027728,0.056127,-0.046385
9,X5 POCHE TALOAK,"[0.011469854, 0.007848586, 0.007189207, -0.033...",0.01147,0.007849,0.007189,-0.033267,0.040569,-0.079482,-0.044749,-0.06828,-0.005136,0.050507,0.034945,0.052473,0.061694,0.004226,0.118597,-0.091059,-0.062447,0.076999,-0.069344,-0.066043,0.002638,-0.084529,0.036221,-0.020505,-0.054586,0.011136,-0.058893,-0.017254,-0.054525,-0.058704,-0.051798,-0.047438,-0.026418,0.032054,-0.005937,0.020134,0.019183,0.098005,-0.009964,-0.066307,0.097642,0.026228,-0.013857,0.059719,-0.038234,-0.006589,0.024641,0.088505,-0.042255,0.121902,0.005624,0.0271,-0.050557,-0.1025975,-0.07045,-0.020385,-0.092639,0.056461,-0.117415,-0.047426,0.042648,0.067869,-0.072982,-0.053574,0.065276,0.024996,0.125791,0.067204,-0.04165,0.005828,-0.03471,0.029113,0.054635,0.005524,0.033242,-0.064113,0.014816,0.071879,-0.024383,0.067243,-0.060502,0.052161,0.111784,-0.079745,-0.000571,-0.109835,0.053299,-0.01692,-0.009449,0.082346,0.060959,0.007856,0.107262,-0.018148,-0.057196,0.107396,-0.089576,-0.02774,-0.02951,-0.019647


In [None]:
fit = umap.UMAP(n_neighbors=30,min_dist=0.1,n_components=3,metric='cosine',random_state=42)
%time u = fit.fit_transform(vecs)

text_df['x']=u[:,0]
text_df['y']=u[:,1]
text_df['z']=u[:,2]

CPU times: user 5.64 s, sys: 279 ms, total: 5.92 s
Wall time: 7.26 s


In [None]:
# Shape of the UMAP output array.
u.shape

(583, 3)

## 3D Visualization

In [None]:
def plot_cluster(df,iscolored=False,name='',interactive=True):
    """Plot the rows of ``df`` as a 3D scatter in the reduced embedding space.

    Parameters
    ----------
    df : frame with 'x', 'y' and 'z' columns, plus a 'cluster' column when
        ``iscolored`` is True.
    iscolored : colour (and label) the points by ``df['cluster']`` when True,
        by ``df['x']`` otherwise.
    name : base name of the HTML file, built as
        '../data/fasttext_data/<name>.html'.
    interactive : when False the function does nothing and returns None.

    The figure is passed to ``py.iplot`` and then to ``py.plot`` with
    ``auto_open=False``, both with the same file name.
    """
    # Nothing is drawn in non-interactive mode.
    if not interactive:
        return

    # The same values are used for the marker colour and for the point text.
    color = df['cluster'].values if iscolored else df['x']

    marker_style = dict(
        size=3,
        color=color,            # set color to an array/list of desired values
        colorscale='Viridis',   # choose a colorscale
        opacity=0.3,
    )
    points = go.Scatter3d(
        x=df['x'],
        y=df['y'],
        z=df['z'],
        mode='markers',
        marker=marker_style,
        text=color,
    )
    no_margin = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
    fig = go.Figure(data=[points], layout=no_margin)

    file = '../data/fasttext_data/' + name + '.html'
    py.iplot(fig, filename=file)
    py.plot(fig, filename=file, auto_open=False)

In [None]:
# The folder ../data/fasttext_data must exist: plot_cluster writes its HTML file
# there. The static branch expects a PNG with the same base name in that
# folder; nothing in the visible notebook creates it.
is_interactive = True
filename = 'umap_embedding_description'

if not is_interactive:
    display(Image('../data/fasttext_data/' + filename + '.png'))
else:
    plot_cluster(text_df, iscolored=False, name=filename)

## Clustering with HDBSCAN

In [None]:
# HDBSCAN clusterer with min_cluster_size and min_samples both set to 5.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5,min_samples=5)

In [None]:
# Display the clusterer's repr, which lists all of its parameters.
clusterer

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=5, min_samples=5, p=None,
        prediction_data=False)

In [None]:
# Cluster the UMAP output `u` and store the predicted label of each row in
# the 'cluster' column (timed with %time).
%time text_df['cluster']= clusterer.fit_predict(u)

CPU times: user 21.3 ms, sys: 2.69 ms, total: 24 ms
Wall time: 21.7 ms


In [None]:
# Number of descriptions per cluster label, the 20 largest groups first.
text_df.groupby(['cluster'])['itemDesc'].size().sort_values(ascending=False).iloc[:20]

cluster
-1     189
 21     76
 11     27
 22     27
 0      21
 26     21
 1      20
 6      20
 7      16
 13     14
 10     14
 12     13
 14     11
 2      10
 5       9
 16      9
 3       9
 4       9
 8       9
 18      8
Name: itemDesc, dtype: int64

In [None]:
# The label -1 marks the points the algorithm considered noise;
# keep only the rows that were assigned to an actual cluster.
denoised = text_df.loc[text_df['cluster'] != -1]

In [None]:
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare `denoised` would render the entire frame.
denoised.head(10)

Unnamed: 0,itemDesc,vector,x,y,z,cluster
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02189265, 0.10787461, -0.014234907, -0.0226...",-0.399989,8.043009,0.023928,15
2,BRIOCHE MOUNA 400G,"[-0.0522879, 0.001154994, 0.0076559577, -0.075...",3.646831,8.56121,-2.679703,21
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009413259, -0.017262919, -0.007244551, -0....",3.223323,9.836374,-1.557101,22
4,BRIOCHE ST GENIX LOCAL,"[-0.01687116, -0.03279358, 0.000291368, -0.017...",3.437589,9.720731,-1.623649,22
7,MINI SUISSE X8,"[0.09554361, 0.010526974, 0.14405009, -0.13391...",0.549603,8.045481,0.598263,8
8,4 P RAIS+4 P CHOC.+4 CROIS. PB,"[0.012128612, 0.0345035, -0.043019157, 0.03134...",1.030909,7.811539,-2.037988,7
10,X4 DONUTS ASSORTIS -40%MG,"[-0.013870355, 0.11059206, -0.050391905, 0.003...",-1.07971,8.982529,0.299735,0
12,BRIOCHE NANTERRE PB 200G,"[-0.05712038, -0.08311469, 0.025220992, -0.002...",2.983588,7.975422,-2.491688,21
14,MAXI BEIGNET POMME X2,"[-0.042241365, -0.040620063, -0.015010707, -0....",0.27299,9.709663,-0.315733,11
17,MINI VIENNOISERIE X10,"[0.047347154, -0.05774416, 0.05095637, -0.0957...",0.392011,7.782959,-0.272686,14


In [None]:
# Number of distinct cluster labels left after removing the noise label.
n_clusters = denoised['cluster'].nunique(dropna=False)
print('Total number of clusters: {}'.format(n_clusters))

Total number of clusters: 27


In [None]:
# One list of item descriptions per cluster label (Series indexed by label).
clusters = denoised.groupby('cluster')['itemDesc'].apply(list)

In [None]:
# Inspect the descriptions grouped under cluster label 0.
cluster_i = clusters.loc[0]
print(len(cluster_i))
cluster_i

21


['X4 DONUTS ASSORTIS -40%MG',
 'DONUTS PARTY X4',
 'DONUTS FOURRES CHOCOLAT X4',
 'DONUTS CHOCOLAT X4',
 'DONUTS RAYES ASSORTIS X4',
 'DONUTS FOURRES FRAISE X4',
 'DONUTS -40%MG DECORS CHOCO X4',
 'DONUTS -40%MG NATURE X4',
 '76G MINI DONUTS MILKA  X4',
 'DONUTS SUCRE X4 LOCAL',
 'DONUTS CHOCOLAT X4 LOCAL',
 'MIX BOX DONUT X4',
 'DONUTS HALLOWEEN X4',
 'X4 DONUTS MILKA',
 'DONUTS BLACK & GOLD X4',
 'DONUTS FRAISE SIMPSON X4',
 'DONUTS CHOCOLAT SIMPSON X4',
 'DONUTS ASSORTIS X8',
 'DONUTS MILKA X4',
 'DONUTS ASSORTIS X4',
 'DONUTS MILKA X4']

In [None]:
# Iterate over (label, descriptions) pairs so that the printed number is the
# real cluster label and not merely the position in the Series.
for cluster_label, cluster_i in clusters.items():
    print("---------")
    print("cluster {} size:{}".format(cluster_label,len(cluster_i)))
    print(cluster_i)

---------
cluster 0 size:21
['X4 DONUTS ASSORTIS -40%MG', 'DONUTS PARTY X4', 'DONUTS FOURRES CHOCOLAT X4', 'DONUTS CHOCOLAT X4', 'DONUTS RAYES ASSORTIS X4', 'DONUTS FOURRES FRAISE X4', 'DONUTS -40%MG DECORS CHOCO X4', 'DONUTS -40%MG NATURE X4', '76G MINI DONUTS MILKA  X4', 'DONUTS SUCRE X4 LOCAL', 'DONUTS CHOCOLAT X4 LOCAL', 'MIX BOX DONUT X4', 'DONUTS HALLOWEEN X4', 'X4 DONUTS MILKA', 'DONUTS BLACK & GOLD X4', 'DONUTS FRAISE SIMPSON X4', 'DONUTS CHOCOLAT SIMPSON X4', 'DONUTS ASSORTIS X8', 'DONUTS MILKA X4', 'DONUTS ASSORTIS X4', 'DONUTS MILKA X4']
---------
cluster 1 size:20
['KIDS CRUNCHY DONUT X2', 'DONUTS DAIM BLISTER X2', 'DONUTS DAIM X2', 'DONUTS DAIM FOURRES X2', 'DONUTS OREO BLISTER X2', 'DONUT FOURRE CACAO X2', 'DONUT FOURRE FRAISE X2', 'DONUT OREO FOURRE X2', 'DONUTS CHOCOLAT X2', 'DONUTS SUCRE X2', 'DONUT WHITE X2', 'DONUTS GOLDEN WINTER X2', 'DONUTS SPECULOOS X2', 'DONUTS CONFETTIS X2', 'DONUTS CHOCOLAT CAKE X2', '75G DONUT ZEBRE FOURRE CACAO', 'DONUTS OREO X2', 'DONUT FOUR

## Most frequent n-grams in each cluster

In [None]:
def get_top_grams(cluster_j):
    """Return the most frequent trigram, bigram and token of a cluster.

    Each description is tokenized on its own, and the n-grams are built per
    description before being counted together. An n-gram therefore never
    straddles two neighbouring descriptions, which happened when all the
    descriptions were first joined into one string.

    Parameters
    ----------
    cluster_j : iterable of description strings.

    Returns
    -------
    (trigram, bigram, onegram), each a ``(text, count)`` tuple where ``text``
    is the space-joined n-gram. ``("", 0)`` is returned for an order that has
    no n-gram at all (e.g. no description has three tokens).
    """
    token_lists = [nltk.word_tokenize(description) for description in cluster_j]

    def _most_common(grams):
        # Most frequent item of `grams` as (text, count); ("", 0) when empty.
        top = nltk.FreqDist(grams).most_common(1)
        if not top:
            return ("", 0)
        gram, count = top[0]
        text = gram if isinstance(gram, str) else " ".join(gram)
        return (text, count)

    trigram = _most_common(
        gram for tokens in token_lists for gram in nltk.trigrams(tokens)
    )
    bigram = _most_common(
        gram for tokens in token_lists for gram in nltk.bigrams(tokens)
    )
    onegram = _most_common(token for tokens in token_lists for token in tokens)

    return trigram, bigram, onegram

In [None]:
# Cluster description: most frequent trigram / bigram / token of each cluster.
# Iterating over (label, descriptions) pairs prints the real cluster label.
for cluster_label, cluster_i in clusters.items():
    print("---------")
    print("cluster {} size:{}".format(cluster_label,len(cluster_i)))
    print(get_top_grams(cluster_i))

---------
cluster 0 size:21


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X4 DONUTS ASSORTIS', 3), ('X4 DONUTS', 15), ('X4', 20))
---------
cluster 1 size:20


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X2 DONUTS DAIM', 3), ('X2 DONUTS', 10), ('X2', 18))
---------
cluster 2 size:10


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('4 MICRO DONUTS', 6), ('4 MICRO', 6), ('DONUTS', 8))
---------
cluster 3 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PAINS AUX RAISINS', 4), ('AUX RAISINS', 7), ('AUX', 9))
---------
cluster 4 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('CROISSANT PUR BEURRE', 5), ('PUR BEURRE', 6), ('CROISSANT', 8))
---------
cluster 5 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('AU BEURRE AOP', 4), ('AU BEURRE', 4), ('CROISSANT', 9))
---------
cluster 6 size:20


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('AU BEURRE AOP', 3), ('PAIN CHOCOLAT', 14), ('PAIN', 18))
---------
cluster 7 size:16


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PAIN CHOC PB', 4), ('CHOC PB', 5), ('PB', 16))
---------
cluster 8 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X8 200G MINI', 6), ('X8 200G', 6), ('MINI', 9))
---------
cluster 9 size:7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X12 MINI BEIGNET', 3), ('MINI BEIGNET', 5), ('BEIGNET', 7))
---------
cluster 10 size:14


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PASTEIS DE NATA', 5), ('DE NATA', 8), ('X4', 11))
---------
cluster 11 size:27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('BEIGNET POMME X2', 3), ('MAXI BEIGNET', 5), ('BEIGNET', 14))
---------
cluster 12 size:13


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('550G BRETZELS PAR', 2), ('550 G', 4), ('BRETZELS', 11))
---------
cluster 13 size:14


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('CHAUSSONS AUX POMMES', 7), ('AUX POMMES', 12), ('AUX', 13))
---------
cluster 14 size:11


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X10 MINI VIENNOISERIE', 2), ('MINI VIENNOISERIE', 8), ('MINI', 10))
---------
cluster 15 size:5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('MINI DONUTS FOURRES', 2), ('MINI DONUTS', 3), ('MINI', 5))
---------
cluster 16 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PEPITES DE CHOCO', 3), ('BRESSANE PEPITES', 3), ('PEPITES', 8))
---------
cluster 17 size:7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('ANIMATION LOCAL BEIGNET', 4), ('LOCAL BEIGNET', 6), ('BEIGNET', 7))
---------
cluster 18 size:8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('POGNE DE ROMANS', 5), ('POGNE DE', 5), ('DE', 7))
---------
cluster 19 size:5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('500G MOUNA DELICES', 1), ('500G MOUNA', 2), ('500G', 2))
---------
cluster 20 size:6


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('220G BLISTER BUGNES', 2), ('220G BLISTER', 4), ('BLISTER', 5))
---------
cluster 21 size:76


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('400G BRIOCHE NANTERRE', 3), ('400G BRIOCHE', 17), ('BRIOCHE', 66))
---------
cluster 22 size:27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('LOCAL BRIOCHETTES PRALINES', 3), ('LOCAL BRIOCHE', 16), ('LOCAL', 24))
---------
cluster 23 size:8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('LOCAL TARTE BRESSANE', 3), ('TARTE BRESSANE', 6), ('BRESSANE', 8))
---------
cluster 24 size:6


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('BRESSANE SUCRE LOCAL', 2), ('LOCAL BRESSANE', 4), ('BRESSANE', 6))
---------
cluster 25 size:7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('LOCAL JESUITES X2', 2), ('X2 LOCAL', 6), ('LOCAL', 7))
---------
cluster 26 size:21


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X 4 LOCAL', 3), ('LOCAL COQUILLE', 6), ('LOCAL', 20))


In [None]:
# N-gram summary of the last cluster. This reuses get_top_grams instead of
# repeating its body, and selects the cluster explicitly instead of relying
# on the variable left over by the previous loop.
last_cluster = clusters.iloc[-1]
trigram, bigram, onegram = get_top_grams(last_cluster)

print(trigram, bigram, onegram)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

('X 4 LOCAL', 3) ('LOCAL COQUILLE', 6) ('LOCAL', 20)


In [None]:
#hide
# nbdev export step: presumably converts the project's notebooks into library
# modules (this notebook declares `default_exp nlp.fasttext`) — confirm
# against the nbdev version in use.
from nbdev.export import notebook2script
notebook2script()