In [None]:
# default_exp nlp.fasttext

# FastText - UMAP - HDBSCAN

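> Embed item descriptions with fastText, reduce the vectors to 3 dimensions with UMAP, cluster them with HDBSCAN and describe each cluster by its most frequent n-grams.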

In [None]:
import datetime
import inspect
import os
import sys
import time

import numpy as np
import pandas as pd

try:
    from fasttext import train_unsupervised
    import fasttext
except:
    from fastText import train_unsupervised
    import fastText

import hdbscan
import nltk
import umap.umap_ as umap

import plotly.graph_objs as go
import plotly.offline as py
from IPython.display import Image,display

In [None]:
import warnings
# Silence every warning for the rest of the session (global filter).
warnings.filterwarnings('ignore')

# Lift pandas' display limits: None removes the cap on displayed columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Get Data

In [None]:
# Folder holding the fastText input file and the saved model binaries.
fasttext_data_folder = "../data/fasttext_data"
# Text file used both as the pandas input and as the fastText training corpus.
train_data = os.path.join(fasttext_data_folder,"text_file.text")

In [None]:
# Load the training file into a single-column frame named "itemDesc".
# NOTE(review): the file is parsed as CSV, so a description containing a comma would
# presumably be split across fields — confirm the file never contains commas.
text_df = pd.read_csv(train_data, names=["itemDesc"])
text_df.head()

Unnamed: 0,itemDesc
0,MINI DONUTS FOURRES ASSORTI X6
1,NAVETTE ASSORTIMENT X30
2,BRIOCHE MOUNA 400G
3,BRIOCHETTE ST GENIX X4 LOCAL
4,BRIOCHE ST GENIX LOCAL


## FastText training

In [None]:
# Paths where the trained CBOW and skip-gram models are saved (same folder as the data).
cbow_model = os.path.join(fasttext_data_folder,"model_cbow.bin")
skipgram_model = os.path.join(fasttext_data_folder,"model_skipgram.bin")


**train_unsupervised parameters** (defaults in brackets are those of `train_unsupervised`; entries marked "train_supervised only" are listed for reference with their supervised defaults)

    input             # training file path (required)
    model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
    lr                # learning rate [0.05]
    dim               # size of word vectors [100]
    ws                # size of the context window [5]
    epoch             # number of epochs [5]
    minCount          # minimal number of word occurrences [5]
    minCountLabel     # minimal number of label occurrences [1] (train_supervised only)
    minn              # min length of char ngram [3]
    maxn              # max length of char ngram [6]
    neg               # number of negatives sampled [5]
    wordNgrams        # max length of word ngram [1]
    loss              # loss function {ns, hs, softmax, ova} [ns]
    bucket            # number of buckets [2000000]
    thread            # number of threads [number of cpus]
    lrUpdateRate      # change the rate of updates for the learning rate [100]
    t                 # sampling threshold [0.0001]
    label             # label prefix ['__label__'] (train_supervised only)
    verbose           # verbose [2]
    pretrainedVectors # pretrained word vectors (.vec file) for supervised learning [] (train_supervised only)

In [None]:
model_to_use = "skipgram"

# Train only the selected architecture on the text file, then save it to its own path.
if model_to_use != "skipgram":
    # CBOW model
    model_cbow = train_unsupervised(train_data, model='cbow')
    model_cbow.save_model(cbow_model)

else:
    # Skip-gram model
    model_skipgram = train_unsupervised(train_data, model='skipgram')
    model_skipgram.save_model(skipgram_model)

In [None]:
# Load the fastText model saved by the training cell for the selected architecture.
# Previously the skip-gram file was always loaded, even when `model_to_use` selected CBOW.
# The variable keeps the name `model_skipgram` because the following cells refer to it.
model_path = skipgram_model if model_to_use == "skipgram" else cbow_model
model_skipgram=fasttext.load_model(model_path)




In [None]:
# Dimension of the vectors produced by the loaded model.
model_skipgram.get_dimension()

100

In [None]:
def query_to_vector(col_query, model_fastText):
    """Turn a column of texts into a column of fastText sentence vectors.

    ``get_sentence_vector`` takes the vectors of all the words in the text,
    divides each of them by its norm and then averages them together.

    Parameters
    ----------
    col_query : pandas.Series
        One text per row. Newlines are replaced by spaces before embedding.
        Missing values (None / NaN) are embedded as the empty string instead
        of raising ``AttributeError``; other non-string values go through ``str()``.
    model_fastText : object
        Any object exposing ``get_sentence_vector(text)``.

    Returns
    -------
    pandas.Series
        The result of ``get_sentence_vector`` for each row, same index as ``col_query``.
    """
    def _clean(text):
        # Only strings have .replace(); normalise everything else first.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        return text.replace('\n', ' ')

    return col_query.apply(lambda x: model_fastText.get_sentence_vector(_clean(x)))

In [None]:
%timeit text_df['vector'] = query_to_vector(text_df['itemDesc'], model_skipgram)
del model_skipgram # We do not need the pretrained-vector in memory

21.1 ms ± 6.23 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Preview the first rows together with their new 'vector' column.
text_df.head(10)

Unnamed: 0,itemDesc,vector
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02158853, 0.10800936, -0.014428361, -0.0226..."
1,NAVETTE ASSORTIMENT X30,"[0.01491683, 0.0463899, -0.042345867, -0.01604..."
2,BRIOCHE MOUNA 400G,"[-0.052592173, 0.0015068303, 0.0072030267, -0...."
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009621026, -0.016895643, -0.0072085024, -0..."
4,BRIOCHE ST GENIX LOCAL,"[-0.017043956, -0.03252602, 0.0002465509, -0.0..."
5,NAVETTE X20,"[0.026373366, -0.0044601816, -0.047210522, -0...."
6,TARTE AU SUCRE 6P LOCAL,"[-0.022768188, -0.087489955, -0.011846682, 0.0..."
7,MINI SUISSE X8,"[0.09550478, 0.01049311, 0.14422202, -0.133910..."
8,4 P RAIS+4 P CHOC.+4 CROIS. PB,"[0.011772488, 0.03474198, -0.043323327, 0.0313..."
9,X5 POCHE TALOAK,"[0.0113559365, 0.007889184, 0.0071355393, -0.0..."


## Dimension reduction with UMAP

In [None]:
# Stack the per-row vectors into a single numpy array (one row per description)
vecs = np.stack(text_df['vector'].values, axis=0)
vecs.shape

(583, 100)

In [None]:
# One "FT_<i>" column per embedding dimension. The count is read from the array itself
# instead of a hard-coded 100, so it stays correct if the model dimension changes.
col_names = ["FT_"+str(x) for x in range(vecs.shape[1])]
print(len(col_names))
test = pd.DataFrame(vecs, columns=col_names)

100


In [None]:
# Show the descriptions side by side with the expanded FT_* columns (displayed only, not stored).
pd.concat([text_df,test], axis=1)

Unnamed: 0,itemDesc,vector,FT_0,FT_1,FT_2,FT_3,FT_4,FT_5,FT_6,FT_7,FT_8,FT_9,FT_10,FT_11,FT_12,FT_13,FT_14,FT_15,FT_16,FT_17,FT_18,FT_19,FT_20,FT_21,FT_22,FT_23,FT_24,FT_25,FT_26,FT_27,FT_28,FT_29,FT_30,FT_31,FT_32,FT_33,FT_34,FT_35,FT_36,FT_37,FT_38,FT_39,FT_40,FT_41,FT_42,FT_43,FT_44,FT_45,FT_46,FT_47,FT_48,FT_49,FT_50,FT_51,FT_52,FT_53,FT_54,FT_55,FT_56,FT_57,FT_58,FT_59,FT_60,FT_61,FT_62,FT_63,FT_64,FT_65,FT_66,FT_67,FT_68,FT_69,FT_70,FT_71,FT_72,FT_73,FT_74,FT_75,FT_76,FT_77,FT_78,FT_79,FT_80,FT_81,FT_82,FT_83,FT_84,FT_85,FT_86,FT_87,FT_88,FT_89,FT_90,FT_91,FT_92,FT_93,FT_94,FT_95,FT_96,FT_97,FT_98,FT_99
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02158853, 0.10800936, -0.014428361, -0.0226...",0.021589,0.108009,-0.014428,-0.022685,0.02828,0.087975,-0.041295,0.035083,-0.038354,0.059446,-0.016814,-0.067185,-0.00977,0.041939,0.018721,-0.032885,-0.008577,-0.075262,-0.076178,-0.03112,-0.031405,-0.009856,-0.019507,0.001981,-0.017272,0.034201,0.04696,0.013545,-0.010608,-0.01541,-0.001328,-0.051276,0.070788,-0.055409,0.026858,-0.114945,-0.011401,-0.041776,-0.007304,-0.002424,0.018402,-0.028861,-0.059076,-0.060262,0.035896,-0.014707,0.072236,0.066961,0.024299,0.021932,-0.020327,-0.039282,-0.077175,0.033308,0.003746,0.007083,0.005596,-0.03753,-0.006734,0.032848,0.109271,-0.067665,-0.014027,-0.016231,0.012769,-0.035605,-0.031053,-0.034607,-0.025912,-0.051481,0.035385,0.080805,-0.037025,0.019314,-0.025771,0.018008,0.075179,0.010719,-0.003863,0.003526,0.04726,-0.005215,0.038773,-0.001793,0.001866,-0.01152315,-0.029741,-0.032821,0.048355,0.006067,-0.015916,-0.074651,0.016909,-0.076681,0.034382,-0.009295,0.105053,-0.032185,-0.043723,-0.099093
1,NAVETTE ASSORTIMENT X30,"[0.01491683, 0.0463899, -0.042345867, -0.01604...",0.014917,0.04639,-0.042346,-0.016042,0.060447,-0.001797,-0.008045,0.154593,-0.047424,0.001132,0.032217,-0.059574,0.000717,0.00123,0.161351,-0.091212,-0.071427,0.090881,-0.095705,0.029092,-0.081243,0.044408,-0.016194,0.024331,-0.010957,-0.078223,0.084997,0.005291,-0.034417,-0.063888,-0.033734,0.063391,0.003938,-0.017342,-0.007127,-0.062274,-0.106297,-0.080983,-0.02347,-0.035458,0.023864,0.077621,0.040681,-0.087621,-0.048138,0.029328,-0.020172,-0.013593,-4.1e-05,-0.003258,0.059128,0.015102,-0.114664,-0.056938,-0.020718,-0.02813,-0.116602,0.071858,-0.002723,-0.021435,-0.057342,-0.058563,0.01987,-0.075311,-0.025875,0.038814,-0.069627,-0.000348,0.081428,-0.058699,0.071183,-0.005207,-0.045708,-0.052555,-0.041583,0.094484,0.106114,-0.012816,-0.10272,0.048042,-0.119289,0.007316,0.008341,0.129051,0.028428,-0.07965221,0.045516,-0.008542,-0.040991,-0.006551,-0.094721,-0.047693,0.011115,-0.078789,0.081529,0.060254,0.02235,-0.001472,-0.1227,-0.096947
2,BRIOCHE MOUNA 400G,"[-0.052592173, 0.0015068303, 0.0072030267, -0....",-0.052592,0.001507,0.007203,-0.07568,0.012186,0.064446,-0.018036,0.027879,0.003005,-0.02149,-0.05138,0.038776,-0.012552,-0.00717,0.048757,0.066186,0.004172,0.000296,0.013349,-0.029195,0.067025,0.082093,-0.071041,0.061429,-0.029163,-0.01911,-0.02245,-0.008152,0.015607,-0.009915,-0.005514,0.020968,-0.018824,-0.065464,-0.050438,0.037773,-0.01794,-0.011465,-0.148411,-0.020821,0.046366,0.064355,-0.05489,0.033843,-0.025245,0.047857,0.064039,0.028168,0.077153,-0.032028,-0.019151,-0.00938,-0.048796,-0.02281,-0.142978,-0.029365,0.026998,0.028427,0.018528,0.06395,-0.028166,-0.054642,-0.087238,-0.024618,-0.01825,-0.036454,0.023793,0.018883,-0.055154,0.056277,0.059927,0.017362,-0.042239,-0.01966,-0.082078,-0.076041,0.021497,0.087942,-0.01411,0.004494,-0.041924,0.027201,0.096872,0.105393,-0.008537,-0.06275854,-0.015903,0.058681,-0.071392,-0.024214,0.039547,-0.05093,-0.062387,-0.081011,-0.132281,-0.143439,-0.023694,-0.057964,0.012188,-0.110044
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009621026, -0.016895643, -0.0072085024, -0...",-0.009621,-0.016896,-0.007209,-0.020957,0.027357,0.004364,-0.031856,0.004262,-0.055088,-0.095801,-0.016329,0.009801,-0.00662,-0.030099,0.016967,-0.03248,0.027324,-0.014296,-0.017306,0.008483,0.092372,0.04401,0.01045,-0.026137,-0.058371,-0.083154,0.089349,-0.024888,-0.019418,0.078616,-0.047237,0.026298,0.035347,-0.05276,-0.042598,-0.020195,-0.005494,-0.0411,-0.09617,-0.037849,0.037929,0.018922,-0.066139,0.009404,0.006308,0.032553,0.034506,-0.046652,0.006822,0.038092,0.023245,-0.022983,-0.067152,-0.041628,-0.059555,-0.029489,-0.073096,0.018267,0.033898,-0.015798,-0.054062,-0.016792,0.00092,0.053209,-0.015172,0.000645,-0.051357,-0.024229,-0.065708,0.001955,-0.001535,-0.065037,0.037564,-0.011814,0.003169,0.110819,0.094337,0.000234,0.056564,0.006655,-0.054713,-0.040696,0.087064,0.000148,-0.047299,-0.00572983,-0.040439,-0.045602,-0.069049,0.003415,-0.053852,0.025098,-0.006903,-0.00186,-0.022255,0.01026,0.030097,0.013966,0.005794,-0.029853
4,BRIOCHE ST GENIX LOCAL,"[-0.017043956, -0.03252602, 0.0002465509, -0.0...",-0.017044,-0.032526,0.000247,-0.018057,0.015052,0.026441,-0.052488,0.006981,-0.071462,-0.118717,-0.034114,0.00077,0.073269,-0.035295,0.003912,-0.028325,0.053939,0.015395,-0.025127,-0.001886,0.109929,0.034045,-0.033123,-0.033706,-0.094361,-0.063712,0.069389,-0.035298,-0.036871,0.092882,-0.000819,0.002794,0.039104,-0.03312,-0.031653,0.010485,0.010365,-0.055,-0.102433,-0.00981,0.055596,-0.013228,-0.059973,0.044242,-0.003224,0.011762,0.03102,-0.04752,0.009492,0.018104,0.004663,-0.011203,-0.07176,-0.01981,-0.046685,-0.018393,-0.058645,0.029507,0.051854,-0.004461,-0.029406,0.014498,0.066554,0.117318,-0.004641,0.016679,-0.046308,-0.015066,-0.040346,0.029158,0.005021,-0.036723,0.018906,-0.003323,-0.042709,0.089683,0.064288,-0.01859,0.036137,0.021723,-0.042132,-0.063719,0.116375,0.044792,-0.071725,-0.007117828,-0.06906,-0.012224,0.000405,0.016538,-0.003618,0.06947,-0.011848,0.052702,-0.058942,-0.020378,0.009045,-0.031049,-0.025424,-0.036888
5,NAVETTE X20,"[0.026373366, -0.0044601816, -0.047210522, -0....",0.026373,-0.00446,-0.047211,-0.003986,0.13641,-0.034128,0.076796,0.098087,0.042687,-0.046791,-0.052565,0.000333,-0.000769,-0.032209,0.017774,0.019159,-0.095643,-0.058929,-0.03433,0.067317,-0.054158,-0.036627,0.08058,-0.056811,-0.037066,-0.007115,0.031365,0.134445,-0.06829,0.001844,-0.018295,0.034209,-0.014284,-0.003102,-0.090001,-0.078513,0.002591,-0.078003,-0.02061,0.066645,0.050632,0.048764,-0.013538,-0.08026,0.070648,-0.027712,-0.144002,0.180187,-0.053777,-0.107354,0.017199,-0.074376,-0.052435,0.042345,-0.03725,0.035002,-0.048585,-0.03457,0.029972,-0.059649,-0.061567,-0.08608,0.138632,-0.017341,0.09296,-0.012815,-0.05442,-0.026605,0.168651,-0.077669,0.026518,0.019559,0.00789,-0.043322,0.024021,0.128048,-0.002613,-0.025747,-0.027386,-0.001176,-0.031338,-0.063706,-0.048356,0.071228,0.089964,-0.05784636,0.10474,0.057235,-0.116384,-0.017537,-0.033049,-0.031784,0.059364,-0.10628,0.059081,0.026256,0.059706,0.007656,0.01778,-0.08346
6,TARTE AU SUCRE 6P LOCAL,"[-0.022768188, -0.087489955, -0.011846682, 0.0...",-0.022768,-0.08749,-0.011847,0.03241,-0.02706,-0.014671,0.050644,-0.044816,-0.092003,-0.024154,0.006444,0.043038,-0.069915,0.013889,0.019765,-0.019192,0.028457,0.00915,-0.062381,-0.045677,-0.016574,0.009984,-0.000732,0.031447,-0.024754,-0.034556,0.033296,-0.041203,-0.044363,0.058866,-0.051534,0.011089,-0.05267,-0.031516,-0.040577,0.037617,0.033684,-0.084216,-0.029658,0.061474,-0.048175,0.007165,-0.00289,-0.030739,-0.02797,-0.023548,0.063721,-0.066304,-0.014463,-0.020269,-0.012556,0.042882,0.031751,0.01936,-0.039577,-0.054091,-0.081174,0.075169,-0.008417,-0.014039,0.000773,0.010862,0.037163,0.103904,0.006253,0.025801,-0.079379,0.036897,0.007254,0.020989,-0.07285,-0.000144,-0.013581,-0.000533,0.055765,-0.009756,0.098686,0.033447,0.040477,0.027928,-0.068837,-0.009896,0.016342,0.055276,-0.004113,-0.02757353,0.021815,-0.017437,-0.006534,-0.048651,-0.064126,0.02618,-0.042213,0.022005,-0.054726,0.049645,0.009985,-0.051206,0.003074,-0.026498
7,MINI SUISSE X8,"[0.09550478, 0.01049311, 0.14422202, -0.133910...",0.095505,0.010493,0.144222,-0.133911,0.065381,0.070608,0.026402,0.052915,0.022738,0.060115,-0.012193,-0.029717,-0.09412,-0.071478,0.002774,0.02869,-0.081551,-0.103555,-0.078878,-0.053568,0.015113,-0.016059,0.104229,-0.049161,-0.034806,0.009399,-0.031948,0.111138,-0.090147,0.051556,-0.082641,-0.032442,0.046291,-0.014736,-0.1075,-0.052519,-0.048485,-0.053416,0.039807,-0.021999,0.063315,0.000929,-0.04207,-0.059837,-0.021428,-0.012739,0.030817,0.058511,0.045577,0.021391,-0.042455,0.07426,-0.03677,0.081526,-0.087132,0.021244,0.02625,-0.011575,0.088286,0.03138,0.069226,-0.06727,-0.097002,0.038095,0.033742,-0.020776,0.024199,0.070046,0.09276,0.102065,-0.139784,0.024447,0.062611,-0.163363,0.002858,-0.046088,0.06222,0.000641,0.030015,-0.033453,0.042568,-0.049416,-0.006047,-0.027346,0.076679,0.05639534,-0.079668,0.020379,0.08644,-0.039014,0.026124,-0.033861,0.026547,-0.036755,-0.017359,-0.037483,0.034584,0.036827,0.014651,0.054873
8,4 P RAIS+4 P CHOC.+4 CROIS. PB,"[0.011772488, 0.03474198, -0.043323327, 0.0313...",0.011772,0.034742,-0.043323,0.031303,0.038077,-0.033529,-0.016927,0.053415,-0.010683,0.075392,0.015769,-0.019399,-0.012199,-0.012943,-0.039994,0.000866,-0.004267,0.102078,0.008711,-0.05578,-0.010417,0.003119,0.091861,0.007804,-0.003262,0.086163,0.055333,0.027552,0.0055,0.009491,-0.002099,0.040433,-0.052547,-0.076567,0.004448,-0.001484,0.12193,-0.018835,-0.000192,-0.046602,-0.037107,0.009934,-0.022254,0.042935,-0.024407,-0.064557,0.052079,0.042172,0.000865,0.006061,-0.043188,-0.018083,-0.09042,-0.053211,0.05928,-0.026122,0.055086,0.076846,-0.014737,-0.073626,-0.012573,0.045278,0.060027,-0.080178,0.04616,0.04137,0.031036,0.030114,-0.026533,-0.025932,0.045734,0.038719,0.044239,0.062152,-0.113015,0.01121,0.04393,0.016127,0.035625,0.033566,0.031361,0.033886,-0.023253,0.085987,0.02524,0.03220323,-0.080469,0.042208,-0.049841,-0.003472,0.05866,-0.028167,0.009111,-0.061025,0.008465,-0.034792,0.095592,0.027904,0.055908,-0.046442
9,X5 POCHE TALOAK,"[0.0113559365, 0.007889184, 0.0071355393, -0.0...",0.011356,0.007889,0.007136,-0.03333,0.04058,-0.079546,-0.044715,-0.068227,-0.005109,0.050623,0.035,0.052434,0.061816,0.004308,0.118599,-0.090994,-0.062457,0.077021,-0.069325,-0.066126,0.00262,-0.084458,0.036272,-0.020614,-0.054565,0.011081,-0.058807,-0.017243,-0.054609,-0.05852,-0.051808,-0.047441,-0.026566,0.031869,-0.006074,0.020196,0.019345,0.09785,-0.010075,-0.066354,0.097638,0.026346,-0.013761,0.059785,-0.038295,-0.006504,0.024678,0.088418,-0.042483,0.121871,0.005538,0.027227,-0.050656,-0.102577,-0.070431,-0.020475,-0.092597,0.056381,-0.117321,-0.047393,0.042564,0.067874,-0.072918,-0.053618,0.065229,0.024985,0.125751,0.06733,-0.04156,0.005831,-0.034771,0.029158,0.05465,0.005492,0.033147,-0.064128,0.014936,0.071913,-0.024378,0.067204,-0.060488,0.052196,0.111866,-0.079808,-0.000653,-0.109955,0.053258,-0.016964,-0.009451,0.082196,0.061094,0.007824,0.107326,-0.018288,-0.057123,0.107361,-0.08945,-0.027807,-0.029604,-0.019643


In [None]:
# Reduce the vectors to 3 components with UMAP (cosine metric, fixed random_state).
fit = umap.UMAP(n_neighbors=30,min_dist=0.1,n_components=3,metric='cosine',random_state=42)
%time u = fit.fit_transform(vecs)

# Keep the three UMAP coordinates next to each description.
text_df['x']=u[:,0]
text_df['y']=u[:,1]
text_df['z']=u[:,2]

CPU times: user 5.83 s, sys: 107 ms, total: 5.94 s
Wall time: 6.27 s


In [None]:
# Sanity check on the shape of the UMAP output.
u.shape

(583, 3)

## 3D Visualization

In [None]:
def plot_cluster(df,iscolored=False,name='',interactive=True,output_dir='../data/fasttext_data'):
    """Plot the rows of ``df`` as a 3-D scatter and save the figure as an HTML file.

    The points are positioned with the ``x``, ``y`` and ``z`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x``, ``y``, ``z`` and, when ``iscolored`` is True, ``cluster``.
    iscolored : bool
        Colour (and hover text) comes from ``df['cluster']`` when True, from ``df['x']`` otherwise.
    name : str
        Base name, without extension, of the HTML file.
    interactive : bool
        When False nothing is drawn or written and the function returns immediately.
    output_dir : str
        Folder in which ``<name>.html`` is written. Defaults to the folder that
        was previously hard-coded.

    Returns
    -------
    None
    """
    if not interactive:
        return

    color = df['cluster'].values if iscolored else df['x']

    trace1 = go.Scatter3d(
        x=df['x'],
        y=df['y'],
        z=df['z'],
        mode='markers',
        marker=dict(
            size=3,
            color=color,            # one value per point
            colorscale='Viridis',
            opacity=0.3
        ),
        text=color
    )

    # Zero margins so the scene uses the whole figure area.
    layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
    fig = go.Figure(data=[trace1], layout=layout)

    file = os.path.join(output_dir, name + '.html')
    # Display inline, then write the same figure to disk without opening a browser.
    py.iplot(fig, filename=file)
    py.plot(fig, filename=file,auto_open=False)

In [None]:
# The 3-D plot is only produced when this switch is turned on.
filename = 'umap_embedding_description'
is_interactive = False

if is_interactive:
    plot_cluster(text_df, iscolored=False, name=filename)

## Clustering with HDBSCAN

In [None]:
# HDBSCAN clusterer; both the minimum cluster size and min_samples are set to 5.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5,min_samples=5)

In [None]:
# Display the clusterer to see its full parameter set, defaults included.
clusterer

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=5, min_samples=5, p=None,
        prediction_data=False)

In [None]:
# Cluster the 3-D UMAP coordinates and store one label per row.
%time text_df['cluster']= clusterer.fit_predict(u)

CPU times: user 28.7 ms, sys: 3.83 ms, total: 32.5 ms
Wall time: 30.9 ms


In [None]:
# Number of items per cluster label (noise label -1 included), largest first, top 20 only
items_per_cluster = text_df.groupby(['cluster'])['itemDesc'].apply(list)
items_per_cluster.apply(len).sort_values(ascending=False).iloc[:20]

cluster
-1     161
 12     71
 8      33
 13     28
 18     27
 9      24
 20     22
 16     22
 11     19
 6      17
 10     15
 2      13
 14     12
 22     11
 23     11
 15     11
 5       9
 4       9
 3       9
 1       9
Name: itemDesc, dtype: int64

In [None]:
# Label -1 is how the algorithm marks the points it considers noise,
# so only the rows assigned to a real cluster are kept.
is_clustered = text_df['cluster'] != -1
denoised = text_df[is_clustered]

In [None]:
# Display the rows that remain after removing the noise label.
denoised

Unnamed: 0,itemDesc,vector,x,y,z,cluster
0,MINI DONUTS FOURRES ASSORTI X6,"[0.02158853, 0.10800936, -0.014428361, -0.0226...",0.645584,-2.659455,-3.38308,9
2,BRIOCHE MOUNA 400G,"[-0.052592173, 0.0015068303, 0.0072030267, -0....",-3.299386,-4.585773,-2.454369,12
3,BRIOCHETTE ST GENIX X4 LOCAL,"[-0.009621026, -0.016895643, -0.0072085024, -0...",-2.899542,-4.460345,-4.154674,18
4,BRIOCHE ST GENIX LOCAL,"[-0.017043956, -0.03252602, 0.0002465509, -0.0...",-3.165269,-4.390998,-4.015838,18
7,MINI SUISSE X8,"[0.09550478, 0.01049311, 0.14422202, -0.133910...",-0.511436,-2.663689,-2.772304,0
12,BRIOCHE NANTERRE PB 200G,"[-0.057377636, -0.082840875, 0.024681043, -0.0...",-2.361974,-4.934412,-2.334155,12
14,MAXI BEIGNET POMME X2,"[-0.042263057, -0.040797967, -0.015237959, -0....",-0.207774,-3.361046,-4.676132,8
15,BOULE BRIOCHE CHOCOLAT X4,"[-0.009851376, 0.0010039834, -0.041003454, -0....",-1.273335,-5.33517,-2.631642,4
16,MINI VIENNOISERIE BIO X15,"[0.06694245, -0.072235204, -0.016080922, -0.05...",0.112263,-3.454064,-2.806441,9
17,MINI VIENNOISERIE X10,"[0.04730966, -0.057482123, 0.0504705, -0.09562...",-0.039886,-3.381804,-2.8741,9


In [None]:
# Count the distinct cluster labels left once the noise is removed
n_clusters = len(denoised['cluster'].unique())
print('Total number of clusters: ' + str(n_clusters))

Total number of clusters: 26


In [None]:
# One entry per cluster label: the list of item descriptions assigned to it.
clusters=denoised.groupby(['cluster'])['itemDesc'].apply(list)

In [None]:
# Look at one cluster in detail: the one labelled 0
cluster_i = clusters.loc[0]
print(len(cluster_i))
cluster_i

9


['MINI SUISSE X8',
 'MINI GOURMANDISES X8',
 '200G MINI BEIGNET ABRICOT X8',
 'MINI PANIER SALE X8',
 '200G MINI BEIGNET CARAMEL X8',
 '200G MINI BEIGNET CHOCOLAT X8',
 '200G MINI BEIGNET POMME X8',
 '200G MINI BEIGNET FRAMBOISE X8',
 '200G MINI BEIGNET CHOC BLAN X8']

In [None]:
# Print every cluster with its actual label (the Series index) rather than its
# position in the Series, so the printed id always matches the 'cluster' column.
for i, cluster_i in clusters.items():
    print("---------")
    print("cluster {} size:{}".format(i,len(cluster_i)))
    print(cluster_i)

---------
cluster 0 size:9
['MINI SUISSE X8', 'MINI GOURMANDISES X8', '200G MINI BEIGNET ABRICOT X8', 'MINI PANIER SALE X8', '200G MINI BEIGNET CARAMEL X8', '200G MINI BEIGNET CHOCOLAT X8', '200G MINI BEIGNET POMME X8', '200G MINI BEIGNET FRAMBOISE X8', '200G MINI BEIGNET CHOC BLAN X8']
---------
cluster 1 size:9
['500G GACHE PEPITES CHOCOLAT', 'SUISSE CHOCOLATX2', 'BRESSANE PEPITES CHOCO 12P LOC', 'BRESSANE PEPITES DE CHOCO LOC', 'CHINOIS PEPITES CHOCOLAT', 'TORSADE CHOCO X 2', 'AGNEAU PASCAL PEPITES', 'SUISSE PEPITES CHOCO X2', 'BRIOCHE PEPITES DE CHOCO. LOC']
---------
cluster 2 size:13
['POGNE DE ROMANS 500 CF', 'SAINT-GENIX 500 CF', '500G POGNE DE ROMANS', '500G MOUNA DELICES DAUPHINOIS', '500G GATEAU DE PAQUES AUX FRTS', 'MOUNA 500 CF', '500G POGNE DE ROMANS', 'CHINOIS CREME PATISSIERE 500G', '500G POGNE DE ROMANS', '500G POGNE DE ROMANS CUIT SURG', '3F.ABRICOT+3P.RAISIN+3P.CHOCO', '500G MOUNA CUIT SURGELE', 'AGNEAU DE PAQUES 170G']
---------
cluster 3 size:9
['220G OREILLETTE CR

## Most frequent n-grams in each cluster

In [None]:
def _most_frequent(grams):
    """Return ``(gram, count)`` for the most frequent item of ``grams``, or ``(None, 0)`` if there is none."""
    top = nltk.FreqDist(grams).most_common(1)
    return top[0] if top else (None, 0)


def get_top_grams(cluster_j):
    """Describe a cluster by its most frequent trigram, bigram and single token.

    Parameters
    ----------
    cluster_j : iterable of str
        The texts of one cluster. They are joined with spaces and tokenized
        with ``nltk.word_tokenize``, so n-grams may span two consecutive texts.

    Returns
    -------
    tuple
        ``(trigram, bigram, onegram)`` where each element is ``(text, count)``.
        Trigram and bigram tokens are joined with a space. When there are not
        enough tokens to build an n-gram, its entry is ``("", 0)`` instead of
        raising ``IndexError``.
    """
    tokens = nltk.word_tokenize(" ".join(cluster_j))

    # Most frequent n-gram of each size (n = 3, 2, 1) with its number of occurrences
    top_tri, tri_count = _most_frequent(nltk.trigrams(tokens))
    top_bi, bi_count = _most_frequent(nltk.bigrams(tokens))
    top_one, one_count = _most_frequent(tokens)

    trigram = " ".join(top_tri or ()), tri_count
    bigram = " ".join(top_bi or ()), bi_count
    onegram = (top_one or ""), one_count

    return trigram, bigram, onegram

In [None]:
# Cluster description: most frequent trigram / bigram / token of every cluster.
# The id printed is the actual cluster label (the Series index), not the position.
for i, cluster_i in clusters.items():
    print("---------")
    print("cluster {} size:{}".format(i,len(cluster_i)))
    print(get_top_grams(cluster_i))

---------
cluster 0 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X8 200G MINI', 6), ('X8 200G', 6), ('MINI', 9))
---------
cluster 1 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PEPITES DE CHOCO', 2), ('PEPITES CHOCOLAT', 2), ('PEPITES', 7))
---------
cluster 2 size:13


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('POGNE DE ROMANS', 5), ('POGNE DE', 5), ('500G', 8))
---------
cluster 3 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('+10 % BLISTER', 2), ('BLISTER BUGNES', 3), ('220G', 6))
---------
cluster 4 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PB BOULE BRIOCHE', 4), ('BOULE BRIOCHE', 7), ('BOULE', 7))
---------
cluster 5 size:9


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PAINS AUX RAISINS', 4), ('AUX RAISINS', 7), ('AUX', 8))
---------
cluster 6 size:17


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('CHAUSSONS AUX POMMES', 7), ('AUX POMMES', 12), ('AUX', 13))
---------
cluster 7 size:8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X12 MINI BEIGNET', 2), ('MINI BEIGNET', 5), ('BEIGNET', 7))
---------
cluster 8 size:33


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('BEIGNET POMME X2', 3), ('MAXI BEIGNET', 5), ('X2', 17))
---------
cluster 9 size:24


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('MINI DONUTS FOURRES', 2), ('MINI VIENNOISERIE', 9), ('MINI', 20))
---------
cluster 10 size:15


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('4 + 1', 3), ('550 G', 4), ('BRETZELS', 11))
---------
cluster 11 size:19


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X2 DONUT FOURRE', 4), ('X2 DONUTS', 10), ('X2', 18))
---------
cluster 12 size:71


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('400G BRIOCHE NANTERRE', 3), ('400G BRIOCHE', 17), ('BRIOCHE', 63))
---------
cluster 13 size:28


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('CROISSANT PUR BEURRE', 6), ('PUR BEURRE', 12), ('CROISSANT', 20))
---------
cluster 14 size:12


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PASTEIS DE NATA', 5), ('DE NATA', 8), ('DE', 10))
---------
cluster 15 size:11


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('4 MICRO DONUTS', 6), ('4 MICRO', 6), ('DONUTS', 10))
---------
cluster 16 size:22


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('DONUT CHUPA CHUPS', 4), ('X4 DONUTS', 11), ('X4', 18))
---------
cluster 17 size:7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('ANIMATION LOCAL BEIGNET', 4), ('LOCAL BEIGNET', 6), ('BEIGNET', 7))
---------
cluster 18 size:27


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('LOCAL BRIOCHE COURONNE', 3), ('LOCAL BRIOCHE', 17), ('LOCAL', 23))
---------
cluster 19 size:7


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('PAIN CHOC PB', 3), ('PAIN CHOC', 3), ('PB', 7))
---------
cluster 20 size:22


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X2 PAIN CHOCOLAT', 3), ('PAIN CHOCOLAT', 15), ('PAIN', 20))
---------
cluster 21 size:6


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X 4 LOCAL', 5), ('X 4', 6), ('X', 6))
---------
cluster 22 size:11


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('LOCAL TARTE BRESSANE', 4), ('TARTE BRESSANE', 6), ('BRESSANE', 11))
---------
cluster 23 size:11


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X10 LOCAL COQUILLETTE', 3), ('LOCAL COQUILLE', 6), ('LOCAL', 11))
---------
cluster 24 size:8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('X2 LOCAL JESUITES', 2), ('X2 LOCAL', 6), ('LOCAL', 8))
---------
cluster 25 size:5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(('MOUNA LOCAL GRIAUDE', 1), ('LOCAL GACHE', 2), ('LOCAL', 5))


In [None]:
# Description of a single cluster, reusing get_top_grams instead of a copy of its body.
# The last cluster is selected explicitly rather than through the variable left behind
# by the previous loop.
trigram, bigram, onegram = get_top_grams(clusters.iloc[-1])

print(trigram, bigram, onegram)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

('MOUNA LOCAL GRIAUDE', 1) ('LOCAL GACHE', 2) ('LOCAL', 5)


In [None]:
#hide
# Run nbdev's notebook-to-script export.
from nbdev.export import notebook2script
notebook2script()

Converted 00_connectors.gcp.ipynb.
Converted 01_nlp.fasttext.ipynb.
Converted 02_forecasting.dataprep.ipynb.
Converted 03_models.catboost.ipynb.
Converted 04_nlp.nbsvm.ipynb.
Converted index.ipynb.
