In [170]:
import os
import pandas as pd
import gensim
import nltk

from gensim.utils import simple_preprocess 
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator with a fixed value for this session.
np.random.seed(2018)
# Shared Porter stemmer instance used by lemmatize_stemming();
# PorterStemmer comes from the star import of nltk.stem.porter above.
stemmer = PorterStemmer()

In [171]:
# Resolve the training shard under ~/Desktop/large_file/processed_wiki_sub.
# Every intermediate directory keeps its own name because later cells reuse them.
desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
large_file = os.path.join(desktop, 'large_file')
processed_wiki_sub = os.path.join(desktop, 'large_file', 'processed_wiki_sub')
text_path = os.path.join(
    desktop, 'large_file', 'processed_wiki_sub', 'processed_wiki.textaa'
)

In [172]:
# Read the whole training shard and treat each newline-separated line as one article.
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding (assumes the shard is UTF-8 -- TODO confirm).
with open(text_path, 'r', encoding='utf-8') as file:
    text = file.read()
# NOTE: if the file ends with a newline, split('\n') yields a trailing empty
# string, which becomes an empty final row in the frame.
line = text.split('\n')
df = pd.DataFrame(line, columns=['article'])

In [173]:
# Count of non-null entries per column, i.e. how many article lines were loaded.
df.count()

article    102100
dtype: int64

In [174]:
# Built once and reused: preprocess() calls lemmatize_stemming() for every
# surviving token of every article, so a per-call instance is wasted work.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    '''
    Reduce a single token to its stemmed verb lemma.

    The token is lemmatized with WordNet using pos='v', which tells the
    lemmatizer to treat it as a verb (without a pos tag it would treat every
    word as a noun). The lemma is then passed through the shared `stemmer`.

    Parameters:
        text: one token (a single word), not a whole document.

    Returns:
        The value of `stemmer.stem` applied to the verb lemma.
    '''
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    '''
    Turn one raw document into a list of normalized tokens.

    simple_preprocess converts the document into lowercase tokens. Tokens that
    are in gensim's STOPWORDS or have three or fewer characters are dropped;
    each remaining token is run through lemmatize_stemming.
    '''
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [175]:
# First article of the training frame.
# Note: this rebinds `text`, which until now held the entire raw file contents.
text = df.iloc[0].values[0]
# A fixed random_state makes the 50000-row sample identical on every re-run of
# this cell, instead of depending on how far the global NumPy RNG has advanced.
sample = df.sample(50000, random_state=2018)

In [None]:
# Run preprocess() over every article; each element of the result is a list of tokens.
processed_docs = df['article'].map(preprocess)

In [None]:
# Peek at the first ten preprocessed documents.
processed_docs[:10]

In [None]:
'''
Build dictionary from processed docs. 
Dictionary example attributes: 
- token2id
- id2token
- dfs (token document frequency)
'''
# Build the vocabulary from the token lists produced by preprocess().
# NOTE(review): id2token is presumably populated lazily by gensim -- confirm
# before indexing it directly.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# for i in range(10):
#     print("{} appears {} times".format(dictionary.id2token[i], dictionary.dfs[i]))
# dictionary.num_pos

In [None]:
'''
Filter dictionary. 
- Remove tokens with dfs<15, more than 0.5 fraction of total corpus size, keep_n most frequent tokens
'''
# Prunes `dictionary` itself (the return value is not used):
# - no_below=15: drop tokens whose document frequency is below 15
# - no_above=0.5: drop tokens that occur in more than half of the documents
# - keep_n=100000: of what remains, keep only the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
'''
Gensim doc2bow
For each document, create a bagofwords representation
'''
# One bag-of-words representation per article, in the same order as processed_docs.
# This uses `dictionary` after filter_extremes has pruned it.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
from gensim import corpora, models
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# TF-IDF-transformed view of the same corpus.
# NOTE(review): the LDA cell visible below trains on bow_corpus, not on
# corpus_tfidf -- confirm whether that is intentional.
corpus_tfidf = tfidf[bow_corpus]

In [None]:
from gensim.models import LdaMulticore
# Train a 100-topic LDA model on the bag-of-words corpus (2 passes, 8 workers).
# random_state pins the model's own RNG, so the result no longer depends on the
# global NumPy seed having been set, and not yet consumed, by earlier cells.
# NOTE(review): multi-worker training may still vary slightly between runs -- confirm.
lda_model = LdaMulticore(
    bow_corpus,
    num_topics=100,
    id2word=dictionary,
    passes=2,
    workers=8,
    random_state=2018,
)

In [None]:
# Print every topic id together with gensim's formatted string of its top words.
all_topics = lda_model.print_topics(-1)
for idx, topic in all_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Inspect the terms the model associates with topic 0.
lda_model.show_topic(0)

In [None]:
# Load the second shard (processed_wiki.textab), one article per line.
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding (assumes the shard is UTF-8 -- TODO confirm).
# NOTE: this rebinds `text` and `line` from earlier cells.
test_set = os.path.join(processed_wiki_sub, 'processed_wiki.textab')
with open(test_set, 'r', encoding='utf-8') as file2:
    text = file2.read()
line = text.split('\n')
df2 = pd.DataFrame(line, columns=['article'])
df2.head()

In [None]:
# Take the first article of the second shard and display it.
test_doc = df2['article'].iloc[0]
test_doc

In [None]:
# Push the article through the same preprocessing and dictionary as the
# training data, then look it up in the trained model.
test_tokens = preprocess(test_doc)
bow_vector = dictionary.doc2bow(test_tokens)
lda_model[bow_vector]