# Paso 0: Dependencias

In [None]:
# Google Colab setup: mount Google Drive at /content/gdrive (no Spark/PySpark is configured in this cell)
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#!pip install smart-open
#ref: https://radimrehurek.com/gensim/
!pip install gensim
!pip install nltk
!pip install pyLDAvis
# descargar datos 'metadata.csv' de: https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge
# https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge?select=metadata.csv

In [None]:
import pandas as pd
import numpy as np
import nltk
import re

In [None]:
# NLTK settings: tokenizer models and stop words
from nltk.corpus import stopwords
nltk.download('punkt')
# Newer NLTK releases load the word tokenizer from the 'punkt_tab' resource; without it
# nltk.word_tokenize raises LookupError. Fetching both keeps old and new releases working.
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words_nltk = set(stopwords.words('english'))

In [None]:
# Load the data into pandas
#df = pd.read_csv('metadata.csv')
# Use the absolute path under the Drive mount point ('/content/gdrive') so the read does
# not depend on the notebook's current working directory.
df = pd.read_csv('/content/gdrive/MyDrive/st1800-241/datasets/metadata.csv')

In [None]:
# Análisis descriptivo de los datos

In [None]:
# Preview the first rows of the loaded metadata
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Number of non-null values in each column
df.count()

In [None]:
# Start of the LDA process
# Keep only the columns of interest. The explicit .copy() makes df an independent frame,
# so adding columns to it later cannot raise pandas' SettingWithCopyWarning.
df = df[['cord_uid','title','abstract']].copy()
df.head()

In [None]:
def textprep(line):
    """Turn one raw text value into a list of cleaned, lower-case tokens.

    Steps: NLTK word tokenization, lower-casing, removal of every character that is
    not an ASCII letter or digit, and removal of English stop words. Stemming and
    lemmatization are not applied.

    Args:
        line: The text to process. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing cell in pandas) yield ``[]``.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # A missing value would otherwise be stringified and emitted as the bogus token 'nan' / 'none'.
    if line is None or (isinstance(line, float) and line != line):
        return []

    tokens = []
    for raw in nltk.word_tokenize(str(line)):
        cleaned = re.sub(r'[^A-Za-z0-9]+', '', raw.lower())
        # The length filter runs after stripping: punctuation-only tokens such as '--'
        # collapse to '' and contractions such as "'s" to a single letter.
        if len(cleaned) > 1 and cleaned not in stop_words_nltk:
            tokens.append(cleaned)
    return tokens

In [None]:
# Create a column with the tokenized form of the column of interest.
# Series.apply hands each title straight to textprep instead of building a row object per record.
df['tokens_title'] = df['title'].apply(textprep)
df.head()

### Construir el BoW (diccionario) de términos

In [None]:
# Build the bag-of-words vocabulary (gensim calls it a Dictionary) from the tokenized titles
from gensim.corpora import Dictionary
dictionary = Dictionary(documents=df['tokens_title'])

In [None]:
# Convert every tokenized title into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, df['tokens_title']))

In [None]:
# Show the bag-of-words vectors of the first five documents
print(corpus[:5])

In [None]:
# Show the Dictionary's printed summary
print(dictionary)

### Construir matriz de documentos vs términos

In [None]:
# Library used to parallelize the bag-of-words conversion
import multiprocessing as mp
import time

t0 = time.time()
# The context manager releases the worker processes even when map() raises,
# which the bare Pool()/close() pair did not guarantee.
with mp.Pool(mp.cpu_count()) as pool:
    doc_term_matrix = pool.map(dictionary.doc2bow, list(df.tokens_title))
print(time.time()-t0)  # elapsed seconds

### Construir modelo LDA

In [None]:
from gensim.models.ldamulticore import LdaMulticore

t0 = time.time()
# workers is derived from the machine instead of a fixed 10: asking for more processes
# than cores (e.g. on a small Colab VM) only adds overhead. One core is left for the parent.
# random_state seeds the model's random initialisation so re-runs start from the same state.
lda_model = LdaMulticore(
    doc_term_matrix,
    num_topics=20,
    id2word=dictionary,
    passes=10,
    workers=max(1, mp.cpu_count() - 1),
    random_state=42,
)
print(time.time()-t0)  # elapsed seconds

In [None]:
def assigntopic(doc):
    """Return the topics ``lda_model`` assigns to a tokenized document, heaviest first.

    Args:
        doc: List of tokens; it is converted to bag-of-words with ``dictionary``.

    Returns:
        list: ``(topic_id, weight)`` pairs sorted by descending weight.
    """
    bow = dictionary.doc2bow(doc)
    topic_weights = lda_model[bow]
    # Option 1 (in use): every returned topic, ranked from highest to lowest weight.
    # A top-N variant would slice the ranked list, e.g. ranked[:5].
    ranked = sorted(topic_weights, key=lambda pair: pair[1], reverse=True)
    # Option 2 (not used): keep only the single heaviest topic with
    # max(topic_weights, key=lambda item: item[1]).
    return ranked

In [None]:
# Attach the ranked topic list of each title. Series.apply passes every token list
# directly to assigntopic instead of building a row object per record.
df['topics'] = df['tokens_title'].apply(assigntopic)
df.head()

### Ejemplos de tópicos del modelo

In [None]:
# Show what the model returns for the first document: its topics and their weights
print(list(lda_model[doc_term_matrix[0]]))

# Show up to 10 topics, each with its 3 highest-weight terms and their weights
print(lda_model.print_topics(num_topics=10, num_words=3))

In [None]:
# For every document in the corpus keep only the (topic_id, weight) pair with the largest weight.
lda_topic_assignment = []
for doc_topics in lda_model[corpus]:
    lda_topic_assignment.append(max(doc_topics, key=lambda pair: pair[1]))

## Visualización de todos los tópicos

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

t0 = time.time()
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization from the trained model, the bag-of-words corpus and the vocabulary.
# NOTE(review): sort_topics=False presumably keeps the model's own topic order — confirm in the pyLDAvis docs.
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary, sort_topics = False)
print(time.time()-t0)  # elapsed seconds
# Last expression of the cell, so the notebook displays the visualization.
vis

### Guardar la visualización en un archivo HTML

In [None]:
# Write the prepared visualization to 'lda_visualization.html'
pyLDAvis.save_html(vis, 'lda_visualization.html')

In [None]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
# LdaModel is defined in gensim.models.ldamodel; importing it through ldamulticore
# relied on that module's own import of the class.
from gensim.models.ldamodel import LdaModel

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
# Train the model on the corpus.
# id2word gives the model the id -> word mapping; random_state makes the training run repeatable.
lda = LdaModel(common_corpus, num_topics=10, id2word=common_dictionary, random_state=42)

In [None]:
# Number of documents in the sample corpus
print(len(common_texts))

In [None]:
# Print every sample document (each one is a list of tokens)
for t in common_texts:
    print(t)

In [None]:
# Print the bag-of-words vector of every sample document
for text in common_texts:
    print(common_dictionary.doc2bow(text))

In [None]:
# Show the printed summary of the demo model
print(lda)

In [None]:
# Create a new corpus, made of previously unseen documents.
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps','trees'],
    ['human', 'system', 'computer']
]
# Convert each token list to bag-of-words with the dictionary built from common_texts.
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
# Use the second document as the example to score.
unseen_doc = other_corpus[1]
vector = lda[unseen_doc]  # get topic probability distribution for a document

In [None]:
# Show the topic distribution computed for unseen_doc
print(vector)

In [None]:
# Print every unseen document (each one is a list of tokens)
for text in other_texts:
    print(text)

In [None]:
# Print the bag-of-words vector of every unseen document
for text in other_texts:
    print(common_dictionary.doc2bow(text))
# NOTE(review): this loop prints one blank line per text and nothing else; it looks unfinished — confirm intent.
for text in other_texts:
    print()

In [None]:
def assigntopic(doc):
    """Return the result of indexing the demo model ``lda`` with ``doc``.

    NOTE(review): this rebinds the name ``assigntopic`` that an earlier cell of this
    notebook defines with different behaviour (that version converts tokens with
    ``dictionary.doc2bow``, queries ``lda_model`` and sorts the result). Here ``doc``
    is passed to ``lda`` unchanged, so it presumably must already be a bag-of-words
    vector — confirm against the intended caller.
    """
    return lda[doc]

In [None]:
# NOTE(review): this cell repeats, statement for statement, an earlier cell of this notebook
# (same other_texts, other_corpus, unseen_doc and vector assignments).
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps','trees'],
    ['human', 'system', 'computer']
]
# Convert each token list to bag-of-words with the dictionary built from common_texts.
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
# Use the second document as the example to score.
unseen_doc = other_corpus[1]
vector = lda[unseen_doc]  # get topic probability distribution for a document