In [26]:
import codecs
import re
import warnings

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

In [27]:
stop_words = set(['de','la','que','el','en','y','a','los','del','se','las','por','un','para','con','no','una','su','al','lo','como','mas','pero','sus',
                     'le','ya','o','este','si','porque','esta','entre','cuando','muy','sin','sobre','tambien','me','hasta','hay','donde','quien','desde',
                     'todo','nos','durante','todos','uno','les','ni','contra','otros','ese','eso','ante','ellos','e','esto','mi','antes','algunos','que',
                     'unos','yo','otro','otras','otra','el','tanto','esa','estos','mucho','quienes','nada','muchos','cual','poco','ella','estar','estas',
                     'algunas','algo','nosotros','mi','mis','tu','te','ti','tu','tus','ellas','nosotras','vosotros','vosotras','os','mio','mia','mios',
                     'mias','tuyo','tuya','tuyos','tuyas','suyo','suya','suyos','suyas','nuestro','nuestra','nuestros','nuestras','vuestro','vuestra',
                     'vuestros','vuestras','esos','esas','estoy','estas','esta','estamos','estais','estan','este','estes','estemos','esteis','esten','estare',
                     'estaras','estara','estaremos','estareis','estaran','estaria','estarias','estariamos','estariais','estarian','estaba','estabas','estabamos',
                     'estabais','estaban','estuve','estuviste','estuvo','estuvimos','estuvisteis','estuvieron','estuviera','estuvieras','estuvieramos',
                     'estuvierais','estuvieran','estuviese','estuvieses','estuviesemos','estuvieseis','estuviesen','estando','estado','estada','estados','estadas',
                     'estad','he','has','ha','hemos','habeis','han','haya','hayas','hayamos','hayais','hayan','habre','habras','habra','habremos','habreis',
                     'habran','habria','habrias','habriamos','habriais','habrian','habia','habias','habiamos','habiais','habian','hube','hubiste','hubo','hubimos',
                     'hubisteis','hubieron','hubiera','hubieras','hubieramos','hubierais','hubieran','hubiese','hubieses','hubiesemos','hubieseis','hubiesen',
                     'habiendo','habido','habida','habidos','habidas','soy','eres','es','somos','sois','son','sea','seas','seamos','seais','sean','sere','seras',
                     'sera','seremos','sereis','seran','seria','serias','seriamos','seriais','serian','era','eras','eramos','erais','eran','fui','fuiste','fue',
                     'fuimos','fuisteis','fueron','fuera','fueras','fueramos','fuerais','fueran','fuese','fueses','fuesemos','fueseis','fuesen','sintiendo','sentido',
                     'sentida','sentidos','sentidas','siente','sentid','tengo','tienes','tiene','tenemos','teneis','tienen','tenga','tengas','tengamos','tengais',
                     'tengan','tendre','tendras','tendra','tendremos','tendreis','tendran','tendria','tendrias','tendriamos','tendriais','tendrian','tenia','tenias',
                     'teniamos','teniais','tenian','tuve','tuviste','tuvo','tuvimos','tuvisteis','tuvieron','tuviera','tuvieras','tuvieramos','tuvierais',
                     'tuvieran','tuviese','tuvieses','tuviesemos','tuvieseis','tuviesen','teniendo','tenido','tenida','tenidos','tenidas','tened', 'iii',
                     'iv', 'v', 'vi', 'vii', 'viii'])

# Obtiene el texto
def getText(file):
    text =[]
    with codecs.open('D:/Documentos/maestria_mcdi/3er semestre/NLP/docs/texto/' + file , 'r', encoding='utf8') as f:
        for line in f.readlines():
            clean_line = remove_esp_characters(line)
            if len(clean_line) > 0:
                text.append(clean_line)
    return text

#elimina caracteres especiales
def remove_esp_characters(txt):     
    text = re.sub(r'\W+',' ', txt)    
    text = re.sub(r'[ ]{2,}',' ', text)  
    return text.strip()

#clean texto
def clean(str):     
    txt = re.sub(r'[^a-zñ]', ' ', str)    
    txt = re.sub(r'[ ]{2,}',' ', txt)    
    text = re.sub(r'\n',' ', txt)       
    return text

# Remueve stop words
def remove_stop_words(str):      
    word = ""        
    text = str.split()        
    for _text in text:        
        if _text not in stop_words:
            #Agrega al diccionario palabras que tenga 2 caracteres en adelante
            if(len(clean(_text)) > 2):                
                word += _text  + " " 
            
    return word 

In [28]:
"""
Archivos creados en el proceso de creación de words embeddings se eliminó puntuación y acentos 

text_vf sin pre-procesamiento
text_lemma_vf  Se aplico proceso de lematización
text_stemming_vf Se aplico proceso de stemming

"""
files = ["text_vf.txt" , "text_lemma_vf.txt", "text_stemming_vf.txt"]

list_text = []
for f in files:
    list_text.append(" ".join(getText(f)))


# LDA con proceso de lematización

In [29]:
def get_vectorizer(_text):
    # Vectorizar TF
    vectorizer = CountVectorizer(stop_words=list(stop_words))
    X_vec_TF = vectorizer.fit_transform(_text)

    # Vectorizar TF-IDF
    tfidfvec = TfidfVectorizer(stop_words=list(stop_words))
    X_vec_TFIDF = tfidfvec.fit_transform(_text)
    
    X_vec_dense_TF = X_vec_TF.todense()
    X_vec_dense_TFIDF = X_vec_TFIDF.todense()
    
    print("Sparsicity TF: ", ((X_vec_dense_TF > 0).sum()/X_vec_dense_TF.size)*100, "%")
    print("Sparsicity TFIDF: ", ((X_vec_dense_TFIDF > 0).sum()/X_vec_dense_TFIDF.size)*100, "%")
    
    return X_vec_TF, vectorizer, X_vec_TFIDF, tfidfvec

def get_LDA(X_vec_TF, X_vec_TFIDF, _components):        
    lda_model = LatentDirichletAllocation(n_components=_components,
                                      max_iter=10,
                                      random_state=100,
                                      batch_size=128,
                                      evaluate_every = -1,
                                      n_jobs = -1
                                     )
    lda_tf = lda_model.fit_transform(X_vec_TF)
    lda_tfidf = lda_model.fit_transform(X_vec_TFIDF)
    
    print("Log Likelihood TF: ", lda_model.score(X_vec_TF))
    print("Log Likelihood TFIDF: ", lda_model.score(X_vec_TFIDF))

    print("Perplexity TF: ", lda_model.perplexity(X_vec_TF))
    print("Perplexity TFIDF: ", lda_model.perplexity(X_vec_TF))
    
    return lda_model
        
    

## Tópicos con counter Vectorizer

In [30]:
text_ssp_lemma = remove_stop_words(list_text[1])

In [31]:
X_vec_TF, vectorizer, X_vec_TFIDF, tfidfvec = get_vectorizer([text_ssp_lemma])
lda_model = get_LDA(X_vec_TF, X_vec_TFIDF, 30)

Sparsicity TF:  100.0 %
Sparsicity TFIDF:  100.0 %
Log Likelihood TF:  -68626668.98206554
Log Likelihood TFIDF:  -608.9552899085048
Perplexity TF:  85233107075.57246
Perplexity TFIDF:  85233107075.57246


In [33]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TF, vectorizer, mds='tsne')
panel

## Tópicos con counter TFIDF Vectorizer

In [35]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TFIDF, tfidfvec, mds='tsne')
panel

# LDA con proceso de Stemming

## Tópicos con counter Vectorizer

In [36]:
text_ssp_stemm = remove_stop_words(list_text[2])

In [16]:
X_vec_TF, vectorizer, X_vec_TFIDF, tfidfvec = get_vectorizer([text_ssp_stemm])
lda_model = get_LDA(X_vec_TF, X_vec_TFIDF, 30)

Sparsicity TF:  100.0 %
Sparsicity TFIDF:  100.0 %
Log Likelihood TF:  -65720194.867064625
Log Likelihood TFIDF:  -573.4606574998336
Perplexity TF:  24493193796.518078
Perplexity TFIDF:  24493193796.518078


In [37]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TF, vectorizer, mds='tsne')
panel

## Tópicos con counter TFIDF Vectorizer

In [38]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TFIDF, tfidfvec, mds='tsne')
panel

# LDA sin pre-procesamientos

## Tópicos con counter Vectorizer

In [19]:
X_vec_TF, vectorizer, X_vec_TFIDF, tfidfvec = get_vectorizer([list_text[0]])
lda_model = get_LDA(X_vec_TF, X_vec_TFIDF, 30)

Sparsicity TF:  100.0 %
Sparsicity TFIDF:  100.0 %
Log Likelihood TF:  -77280918.21004155
Log Likelihood TFIDF:  -856.7168282222799
Perplexity TF:  818936935440.143
Perplexity TFIDF:  818936935440.143


In [39]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TF, vectorizer, mds='tsne')
panel

## Tópicos con counter TFIDF Vectorizer

In [40]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, X_vec_TFIDF, tfidfvec, mds='tsne')
panel