### LDA- sklearn:

In [1]:
import logging
import config

from IPython import display
from dataset.TPL import *
import dataset.preprocessing as pre

import numpy as np
import pandas as pd
import time
import pickle
import ftn
# from pprint import pprint
import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.sklearn

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline
%reload_ext autoreload
%autoreload 2
logging.basicConfig(level=logging.INFO)


ModuleNotFoundError: No module named 'dataset'

Antes de comenzar, destaquemos la metodología que vamos a utilizar para llevar a cabo el modelo:
- 1) Colección de la data
- 2) Prepocesamiento de la información
- 3) Implementación del modelo LDA-sktl
- 4) Visualización de los datos

Comencemos:

### 1) Colección de la data: 

In [2]:
#Importamos la data:
gravity1=TplGravity(config=config.ds_small)

In [3]:
# Some rows are small (<5 sec) or unbounded (>30m), this criteria will restrict to only the good ones
gravity1.index = gravity1.index[gravity1.index["SECONDS"] > 5]
gravity1.index = gravity1.index[gravity1.index["SECONDS"] < 1800]

print(len(gravity1.index))

34


### 2) Preprocesamiento de la información:

La función generate_serie_SES nos arrojará el dataset prepocesado con el cual vamos a trabajar, su utilidad radica en:
- permite trabajar cada evento como un token.
- limpia la información que no aporta en el análisis a través colorize2
- estandariza el formato para el resto de los dataset.

(para más información de esta función revisar "../The Notebook 0/ 1-. Aboutdata.ipynb" y "../The Notebook 0/ 2-. Countvectorizer().ipynb"

In [8]:
class NumbersAndTelescopes(pre.Numbers, pre.Telescopes):
    pass

colorize1 = pre.Numbers()
# colorize2 = NumbersAndTelescopes()
colorize2 = pre.TplGravityColor()

In [12]:
def generate_serie_SES(obs): 
    chunks=[]
    for i in obs.index.index.values:
        T = obs.load_trace(i)
        T['color'] = T['event'].apply( lambda x: colorize2.color(x).replace(' ','_')  )
        chunks.append( pd.Series([ list(T['color'].values) ],index=[i]) )

    return pd.concat(chunks)

In [13]:
%%time
#Aplicamos la función:
gravity_SES=generate_serie_SES(gravity1)

CPU times: user 45.7 s, sys: 1.97 s, total: 47.7 s
Wall time: 51.5 s


In [14]:
gravity_SES

0     [GRAVITY_{}-{}_(yellow), Start{}_{}), COU_AG_A...
1     [GRAVITY_{}-{}_(yellow), Start{}_{}), DET1_DIT...
2     [GRAVITY_{}-{}_(yellow), Start{}_{}), COU_GS_F...
3     [Start{}_{}), GRAVITY_{}-{}_(yellow), COU_AG_P...
4     [Start{}_{}), COU_AG_OBJCNT_=_'F', COU_AG_DELT...
5     [Start{}_{}), COU_AG_GSSOURCE_=_'SCIENCE', COU...
6     [Start{}_{}), GRAVITY_{}-{}_(yellow), COU_AG_O...
7     [Start{}_{}), COU_AG_PMA_=_'{}', COU_AG_START_...
8     [Start{}_{}), DET2_NDIT_SKY_=_'{}', DET1_DIT_=...
9     [GRAVITY_{}-{}_(yellow), Start{}_{}), COU_GS_M...
10    [GRAVITY_{}-{}_(yellow), Start{}_{}), INS.LAMP...
12    [Start{}_{}), GRAVITY_gen_cal_{}_(yellow), INS...
15    [Start{}_{}), DET1_DIT_=_'{}', DET1_NDIT_=_'{}...
16    [Start{}_{}), SEQ_SWITCHLASER_=_'F', SEQ_STAND...
18    [GRAVITY_gen_cal_{}_(yellow), Start{}_{}), SEQ...
19    [Start{}_{}), GRAVITY_gen_cal_{}_(yellow), SEQ...
20    [Start{}_{}), SEQ_FDDL_MIN_=_'{}', SEQ_FDDL_TO...
21    [GRAVITY_gen_tec_{}_(yellow), Start{}_{}),

In [7]:
#Para el caso de querer re-ejecutar el modelo con una configuración distinta, guardamos nuestra data completa prepocesada:
file1=open("gravity_SES","wb")
pickle.dump(gravity_SES,file1)
file1.close()

In [8]:
#Checkpoint 1: importamos la data gravity_SES
gravity_SES=pickle.load(open("gravity_SES","rb"))

Una vez que ya tenemos la información prepocesada con la cual vamos a llevar a cabo el modelo, corresponde separarla en "información de entrenamiento" e "información de testeo" a través de la función "traintestdata" que implementa la herramienta **train_test_split()** de sklearn para este propósito: 

In [9]:
def traintestdata(obs_serie,num):
    
    #obs_serie: dataset prepocesada.
    #num: rate: represent the proportion of the dataset to include in the test split
    X_train, X_test = train_test_split(obs_serie,  test_size=num, random_state=42)
    
    #X_train: info for training 
    #Y_test: info for testing
    return X_train,X_test

In [10]:
#Separamos la train_data de la test_data
gravity_train,gravity_test=ftn.traintestdata(gravity_SES,0.2)

In [11]:
# guardamos nuestra data en un pickle para no tener que re-ejecutar todo lo anterior:
#file2=open("gravity_train","wb")
#pickle.dump(gravity_train,file2)
#file2.close()

In [12]:
#file3=open("gravity_test","wb")
#pickle.dump(gravity_test,file3)
#file3.close()

In [13]:
#Checkpoint 2: importamos la data gravity_train
gravity_train=pickle.load(open("gravity_train","rb"))

Ahora corresponde crear nuestro diccionario, para ello utilizaremos la función CountVectorizer() de SKlearn y un tokenizador personalizado con la instrucción de separar las palabras cuando encuentre un espacio y respete los caracteres especiales, de esta manera nos aseguramos que la tokenización respete la premisa principal del prepocesamiento que es considerar a cada **evento como un token**:

In [14]:
import re
#Implementamos un tokenizador que separa las palabras unicamente cuando encuentre espacios.
def my_tokenizer(text):
    # split based on whitespace
    return re.split("\\s+",text)

In [15]:
def vectorizer(info_train):
    
    #Info_train: data prepocessed for training 
    
    #Clase del CountVectorizer() con el tokenizador implementado
    vect=CountVectorizer(tokenizer=my_tokenizer,lowercase=False)
    
    #Dejamos todo como str (la info contiene números)
    info_train=info_train.apply(str)
    
    #Aplicamos el C_V y obtenemos la matrix token-frequency
    info_train = vect.fit_transform(info_train)
    
    #return de matrix token-frequency in info_train and the class CountVectorizer()
    return info_train, vect

In [16]:
#Creamos la matriz toke-frequency para nuestra data de entrenamiento:
info_train_v, v= vectorizer(gravity_train)

In [17]:
#file4=open("v_class","wb")
#pickle.dump(v,file4)
#file4.close()

In [18]:
#Checkpoint 3: importamos la clase CountVectorizer()
v=pickle.load(open("v_class","rb"))

In [19]:
print('Numéro de traces: ',info_train_v.shape[0])
print('Número de tokens: ',info_train_v.shape[1])

Numéro de traces:  793
Número de tokens:  10582


In [20]:
#Mostramos el diccionario (le estamos asignando un identificador numérico único a cada token del dataset_train)
list(v.vocabulary_)[:10]

["['Started_at_{}_(underlined)',",
 "'GRAVITY_single_acq_--_GRAVITY_Single_Mode_Acquisition_(yellow)',",
 '"COU_AG_ALPHA_=_\'{}\'",',
 '"COU_AG_DELTA_=_\'{}\'",',
 '"COU_AG_OPTIMIZE_=_\'F\'",',
 '"COU_AG_PMA_=_\'{}\'",',
 '"AOS_AO_MODE_=_\'FULL_AO\'",',
 '"COU_AG_GSSOURCE_=_\'SCIENCE\'",',
 '"COU_AG_PMD_=_\'{}\'",',
 '"COU_AG_START_=_\'F\'",']

In [21]:
#Mostramos en orden alfabético el vocabulario:
v.get_feature_names()[:10]

['"##Exposure_\'{}\'_by_\'NGCIR2\'_aborted.",',
 '"\'finally\'_block_executed_(brown)",',
 '">>Application_\'gvoControl\'_started_(r320701).",',
 '">>Application_\'gvoControl\'_starting_(r320701)_...",',
 '"AOS_AO_MODE_=_\'FULL_AO\'",',
 '"Application_\'gvoControl\'_terminated_(r320701).",',
 '"Archiver_last_reply_(Buf:\'{}_(mymerge_10.arf)\')",',
 '"Archiver_last_reply_(Buf:\'{}_(mymerge_100.arf)\')",',
 '"Archiver_last_reply_(Buf:\'{}_(mymerge_101.arf)\')",',
 '"Archiver_last_reply_(Buf:\'{}_(mymerge_103.arf)\')",']

### **3) Implementación del modelo**: 

Importamos el modelo LatentDirichletAllocation directamente de sklearn y configuramos los siguientes parámetros:
- n_components: número de tópicos:
- max_iter: the maximum number of passes over the training data.
- learning_method: method used to update_component. Only used in fit method. In general, if the data size is large, the online update will be much faster than the batch update.
- learning_offset: A (positive) parameter that downweights early iterations in online learning
- random_state: Pass an int for reproducible results across multiple function calls.

Para más información de los parámetros con los que vamos a trabajar y con los que dejamos de lado (el objetivo es hacer andar el modelo, más adelante profundizaremos en la optimización) revisar la website oficial de [LDA-SKTL](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

In [22]:
%%time
#número de tópicos
n_topics = 10

#configuramos el modelo
ldamodel = LatentDirichletAllocation(n_components = n_topics, max_iter=10, learning_method='online',learning_offset=50., random_state=0)

#le entregamos nuestra matriz token-frequency creada anteriormente
ldamodel.fit(info_train_v)

# making LDA TOP MATRIX USING CORPUS TF
lda_topic_modelling = ldamodel.fit_transform(info_train_v)

CPU times: user 18 s, sys: 2.06 s, total: 20.1 s
Wall time: 18.5 s


Con el modelo ya funcionando procedemos a visualizar la información, para ello crearemos las keys (vector que contiene el número del tópico predominante para cada trace) a través de la función **get_keys()** y luego procedemos a contar y categorizar estas keys a través de **key_to_count()**

In [23]:
#return an integer list of predicted topic catergories for a given topic matrix
def get_keys(topic_matrix):
    #topic_matrix is the top matrix created with  ldamodel.fit_transform(info_train_v)
    
    # print(topic_matrix.argmax(axis = 1)) # axis = 1, will return maximum index in that array 
    keys = topic_matrix.argmax(axis = 1).tolist()
    print("length of the keys is: ",len(keys))
    return keys

In [24]:
#Return a tuple of topic categories and their accompanying magnitude for a given list of keys
from collections import Counter
def key_to_count(keys):
    count_pairs = Counter(keys).items()
    # print("Count_pairs",count_pairs)
    categories = [pair[0] for pair in count_pairs]
    # print("categories",categories)
    counts = [pair[1] for pair in count_pairs]
    # print("Counts: ",counts)
    return (categories, counts)

In [25]:
lda_keys = get_keys(lda_topic_modelling)
print("keys: ",lda_keys)

lda_categories, lda_count = key_to_count(lda_keys)

length of the keys is:  793
keys:  [5, 1, 5, 5, 7, 5, 5, 1, 1, 5, 7, 1, 5, 5, 5, 5, 5, 5, 6, 6, 1, 1, 3, 1, 1, 5, 5, 1, 5, 1, 5, 5, 5, 1, 5, 1, 6, 6, 1, 6, 6, 6, 1, 6, 5, 1, 6, 5, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 7, 9, 6, 5, 1, 5, 1, 5, 7, 1, 6, 6, 6, 5, 1, 1, 7, 7, 5, 6, 5, 1, 1, 7, 5, 5, 6, 1, 5, 5, 1, 1, 6, 6, 5, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 6, 1, 5, 1, 1, 7, 1, 5, 5, 5, 5, 5, 6, 5, 5, 1, 5, 5, 6, 1, 1, 1, 6, 5, 7, 0, 5, 1, 1, 6, 5, 5, 0, 5, 1, 5, 5, 1, 1, 1, 5, 1, 5, 1, 5, 5, 5, 5, 6, 5, 1, 5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 5, 1, 1, 6, 6, 5, 7, 5, 1, 7, 1, 1, 5, 5, 7, 1, 1, 6, 6, 1, 1, 6, 1, 1, 5, 5, 6, 6, 5, 1, 5, 1, 5, 6, 9, 6, 1, 5, 7, 1, 1, 6, 9, 6, 7, 5, 1, 5, 1, 1, 1, 5, 6, 6, 1, 1, 6, 5, 5, 1, 6, 1, 1, 5, 1, 6, 5, 5, 5, 5, 0, 1, 6, 6, 6, 6, 6, 6, 5, 7, 1, 5, 6, 1, 5, 5, 1, 0, 5, 6, 6, 1, 1, 6, 5, 5, 1, 5, 5, 1, 1, 1, 1, 6, 6, 5, 5, 1, 1, 6, 5, 5, 5, 5, 6, 1, 1, 1, 1, 1, 5, 5, 5, 5, 1, 1, 1, 5, 1, 1, 5, 1, 1, 1, 1, 1, 5, 5, 6, 5, 5, 5, 6, 1, 1, 1, 1, 7, 0, 6, 1, 6, 6, 1, 0, 5, 5, 6

In [26]:
for i in range(0,len(lda_count)):
    print('Topic: {} frequency: {}'.format(lda_categories[i], lda_count[i]))

Topic: 5 frequency: 320
Topic: 1 frequency: 246
Topic: 7 frequency: 37
Topic: 6 frequency: 139
Topic: 3 frequency: 2
Topic: 9 frequency: 28
Topic: 0 frequency: 21


**4) Visualización de la información:** Con el fin de mostrar de una manera ordenada los resultados arrojados por el modelo mostramos lo siguiente:
- 4.1) El tópico predominante para cada documento perteneciente a la información de entrenamiento
- 4.2) la contribución de cada tópico para cada documento
- 4.3) Número de documentos asociados a cada tópico
- 4.4) Tabla que contenga la contribución de cada palabra a su respectivo tópico 
- 4.5) Las palabras más importantes asociadas a cada tópico
- 4.6) [pyLDAvis](https://pyldavis.readthedocs.io/en/latest/index.html): 

4.1) Topico predominante:

In [27]:
#Mostramos el tópico predominante para cada trace_traindata

def topic_dom(lda_model,info_vec):
    #lda_model: modelo LDA ya creado
    #info_vec: matriz token-frequency
    
    lda_output = lda_model.transform(info_vec)
    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # index names
    docnames = ["Doc" + str(i) for i in range(info_vec.shape[0])]
    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    
    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    
    return df_document_topic['dominant_topic']

In [28]:
topic_dominant= topic_dom(ldamodel,info_train_v)
topic_dominant

Doc0      5
Doc1      1
Doc2      5
Doc3      5
Doc4      7
         ..
Doc788    1
Doc789    1
Doc790    5
Doc791    1
Doc792    6
Name: dominant_topic, Length: 793, dtype: int64

4.2) topic-document matrix:

In [29]:
def topic_document(lda_model, info_vec):
    #lda_model: modelo LDA ya creado
    #info_vec: matriz token-frequency
    
    doc_topic = lda_model.transform(info_vec)
    doc_topic_df = pd.DataFrame(data=doc_topic)
    return doc_topic_df

In [30]:
topic_doc_infotrain= topic_document(ldamodel,info_train_v)
topic_doc_infotrain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.000030,0.000030,0.000030,0.000030,0.000030,0.905831,0.000030,0.000030,0.000030,0.093926
1,0.000223,0.997995,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223
2,0.000086,0.000086,0.000086,0.000086,0.000086,0.999225,0.000086,0.000086,0.000086,0.000086
3,0.000029,0.000029,0.000029,0.000029,0.000029,0.921973,0.000029,0.000029,0.000029,0.077796
4,0.000042,0.000042,0.000042,0.000042,0.000042,0.227124,0.244085,0.528496,0.000042,0.000042
...,...,...,...,...,...,...,...,...,...,...
788,0.000045,0.999594,0.000045,0.000045,0.000045,0.000045,0.000045,0.000045,0.000045,0.000045
789,0.000420,0.667454,0.000420,0.000420,0.000420,0.000420,0.000420,0.000420,0.000420,0.329184
790,0.000017,0.000017,0.000017,0.000017,0.000017,0.914703,0.085162,0.000017,0.000017,0.000017
791,0.000040,0.999639,0.000040,0.000040,0.000040,0.000040,0.000040,0.000040,0.000040,0.000040


4.3) Número de documentos asociados a cada tópico:

In [31]:
def topic_distribution(lda_model, info_vec):
    
    #lda_model: modelo LDA ya creado
    #info_vec: matriz token-frequency
    
    lda_output = lda_model.transform(info_vec)
    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # index names
    docnames = ["Doc" + str(i) for i in range(info_vec.shape[0])]
    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    
    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    

    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    
    return  df_topic_distribution
    

In [32]:
topic_num_doc= topic_distribution(ldamodel, info_train_v)
topic_num_doc

Unnamed: 0,Topic Num,Num Documents
0,5,323
1,1,246
2,6,137
3,7,37
4,9,27
5,0,21
6,3,2


4.4) Tabla que contenga la contribución de cada palabra a su respectivo tópico 

In [33]:
def topic_keyword(lda_model, vect_class):
    #lda_model: modelo previamente creado
    #vect_class: nombre de la clase countvectorizer() usada antes
    
    df_topic_keywords = pd.DataFrame(lda_model.components_)

    # Assign Column and Index
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    df_topic_keywords.columns = vect_class.get_feature_names()
    df_topic_keywords.index = topicnames
    
    return df_topic_keywords

In [34]:
topic_keyword(ldamodel, v)

Unnamed: 0,"""##Exposure_'{}'_by_'NGCIR2'_aborted."",","""'finally'_block_executed_(brown)"",",""">>Application_'gvoControl'_started_(r320701)."",",""">>Application_'gvoControl'_starting_(r320701)_..."",","""AOS_AO_MODE_=_'FULL_AO'"",","""Application_'gvoControl'_terminated_(r320701)."",","""Archiver_last_reply_(Buf:'{}_(mymerge_10.arf)')"",","""Archiver_last_reply_(Buf:'{}_(mymerge_100.arf)')"",","""Archiver_last_reply_(Buf:'{}_(mymerge_101.arf)')"",","""Archiver_last_reply_(Buf:'{}_(mymerge_103.arf)')"",",...,"['GRAVITY_gen_tec_AcqFibB_--_Calibrate_AcqCam_FiberPosB_(yellow)',","['GRAVITY_gen_tec_PTref_--_Calibrate_Pupil_Tracker_Reference_(yellow)',","['GRAVITY_gen_tec_SaveDb_--_Save_Instrument_Data_Bases_(yellow)',","['GRAVITY_gen_tec_WFSref_--_Calibrate_WFS_Reference_(yellow)',","['GRAVITY_gen_tec_checkFddl_--_Check_FDDL_non-linearity_calibration_(yellow)',","['GRAVITY_gen_tec_checkMetZero_--_Check_Metrology_Zero_(yellow)',","['GRAVITY_single_acq_--_GRAVITY_Single_Mode_Acquisition_(yellow)',","['GRAVITY_single_obs_calibrator_--_Single_Field_Calibrator_(yellow)',","['GRAVITY_single_obs_exp_--_Single_Field_Science_(yellow)',","['Started_at_{}_(underlined)',"
Topic0,0.139884,0.135311,0.144261,0.132099,0.133401,0.139131,0.13588,0.13346,0.134667,0.134251,...,0.130348,0.135242,0.137779,0.136349,0.137971,0.135854,0.140051,0.132555,0.130345,10.942819
Topic1,0.148454,0.135685,0.140562,0.148221,0.136572,0.154534,0.773581,0.140216,0.140268,0.134465,...,8.302863,0.201686,0.150107,4.868499,6.825574,0.89631,0.143987,0.133637,0.136759,153.162167
Topic2,0.131379,0.136277,0.880963,0.875353,0.132627,0.860686,0.130763,0.135259,0.139862,0.136983,...,0.139339,0.135363,0.139007,0.132936,0.135819,0.131674,0.136923,0.130324,0.13466,0.195142
Topic3,0.131552,0.126395,0.134839,0.128747,0.134264,0.139891,0.131248,0.137758,0.136072,0.131576,...,0.139018,0.132179,17.943842,0.137217,0.130883,0.131501,0.133255,0.131409,0.127076,12.304869
Topic4,0.138641,0.136179,0.135287,0.133041,0.136903,0.126423,0.133781,0.128078,0.132612,0.134194,...,0.13055,0.136915,0.126952,0.133234,0.131621,0.138093,0.138489,0.132603,0.13241,0.147071
Topic5,0.145632,305.167168,0.142012,0.130255,124.293644,0.135281,1.335944,8.486132,0.574248,12.437459,...,0.145024,0.145299,0.401149,0.124148,0.16575,0.996447,57.319751,30.259037,14.402708,178.125908
Topic6,0.874118,0.147723,0.134303,0.142352,0.133023,0.129058,10.834906,0.153376,1.368016,0.141527,...,0.134708,6.684099,0.133914,0.138704,0.12924,8.464859,0.129288,0.15583,0.137225,64.894018
Topic7,0.137981,3.770153,0.132164,0.134479,2.691303,0.13301,0.305966,0.132625,0.131319,0.127382,...,0.138412,0.15401,0.183571,1.179305,1.090047,0.323285,2.229616,0.383625,0.980372,6.855296
Topic8,0.134713,0.132669,0.133001,0.135499,0.140328,0.137286,0.139404,0.139026,0.125121,0.132628,...,0.126785,0.140648,0.126864,0.135126,0.126648,0.142504,0.130301,0.132766,0.124826,0.136988
Topic9,0.13404,32.202491,0.129664,0.137568,39.100703,0.131985,0.132461,0.1431,0.131494,0.133419,...,0.135979,0.142357,0.135603,0.13377,0.131559,0.129388,9.673428,0.412902,0.141964,19.045992


4.5) Las palabras más importantes asociadas a cada tópico

In [35]:
# Show top n keywords for each topic
def show_topics(vect_class, lda_model, n_words):
    
    df_topic_keywords=topic_keyword(lda_model, vect_class)
    keywords = np.array(vect_class.get_feature_names())
    topic_keywords = []
    
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
     
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    
    return df_topic_keywords

In [36]:
df_words_topic=show_topics(v, ldamodel, 5)
df_words_topic

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4
Topic 0,"'detach_to_NGCIR1',","'Forward(b)_STANDBY_to_NGCIR1',","'DB_EVENT_SAFETY_CHECK_DISABLED_...',","'Forward(b)_STANDBY_to_NGCIR3',","'Send_reply_to_command_STANDBY',"
Topic 1,"'Executing_SETUP_command_...',","'SETUP_command_done.',","'{}_(SpringGreen4)',",'SETUP_-expoId_{}_-noExposure_-function_INS.FA...,'SETUP_-expoId_{}_-noExposure_-function_INS.FA...
Topic 2,"'Executing_SETUP_command_...',","'SETUP_command_done.',","'{}_-_ic0fbBase.C:{}:_Usage_of_""*.INS*.DICT*""_...","'{}_(SpringGreen4)',","'Exit_:_application_received_a_KILL_command',"
Topic 3,"'Done',",'(errCheckStackSize)_Error_Stack_TOO_BIG_(Size...,"'{}_{}_{}_?_???????????????',","'{}_{}_{}_?_??????????[?#{}?#{}?',","'{}_{}_{}_?_??????????????????????????I',"
Topic 4,'qsemuLocal.c_{}_{}_{}_W_remote_environment_re...,'ccsERR_REMOTE_LINK_:_Error_in_the_remote_conn...,"'seqERR_DB_READ_SYMBOLIC_:_cannot_read_""@lgvme...",'ccsERR_DB_LCU_:_Fail_of_(_DBREAD_)_in_env._(_...,'(errAdd)_parameter_error_is_a_NULL_pointer_(e...
Topic 5,"'INS.LAMP15.START_Lamp_turned_on.',","'INS.LAMP17.STOP_Lamp_turned_off.',","'INS.LAMP15.STOP_Lamp_turned_off.',","'INS.LAMP19.START_Lamp_turned_on.',","'INS.LAMP13.STOP_Lamp_turned_off.',"
Topic 6,'WARNING:_new_header_block_is_added_to_the_FIT...,"'Time_{}:_{}',","'{}_(SpringGreen4)',","'Executing_SETUP_command_...',","'SETUP_command_done.',"
Topic 7,'lccERR_ATTR_TYPE_:_invalid_attribute_type_(to...,"'(errPutInStack)_Error_Stack_{}_is_FULL,_modul...",'lccERR_DE_TYPE_:_invalid_data_type_(totFlagTe...,'caiERR_WRITE_:_database_direct_write_attribut...,'caiERR_WRITE_:_database_direct_write_attribut...
Topic 8,'caiERR_PARAMETER_:_invalid_parameter_unexpect...,'tacERR_DBWRITE_:_Failed_to_write_database_({}...,'tacBgTask.c:{}_called_with_too_many_arguments...,"""caiERR_INTERNAL_:_internal_function_call_'cai...","""caiERR_INTERNAL_:_internal_function_call_'cai..."
Topic 9,"'INS.LAMP19.STOP_Lamp_turned_off.',","'INS.LAMP13.START_Lamp_turned_on.',","'INS.LAMP17.START_Lamp_turned_on.',","'INS.LAMP13.STOP_Lamp_turned_off.',","'Move_X,Y_{}_to_{},{}',"


4.6) pyLDAvis

In [37]:
pyLDAvis.enable_notebook()
# 1 parameter: LDA model
# 2 parameter: matrix topic-frequency
# 3 parameter: class CountVectorizer()
panel = pyLDAvis.sklearn.prepare(ldamodel, info_train_v, v, mds='tsne')
panel

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


Ahora que ya pudimos ver toda la información que arrojó nuestro modelo, procederemos a testearlo en el siguiente notebook.