In [1]:
import os
import pandas as pd
import gensim
import nltk
import tempfile

from gensim.utils import simple_preprocess 
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pickle
from gensim.models import LdaMulticore
# Module-level Porter stemmer instance; LDAModelDriver.lemmatize_stemming calls
# stemmer.stem() on it. PorterStemmer is presumably supplied by the wildcard
# import from nltk.stem.porter above -- TODO confirm / import it explicitly.
stemmer = PorterStemmer()

In [2]:
# Filesystem layout: everything lives under <home>/<repo_path>.
repo_path = 'Documents/repos/mystuff/lda101' #change to path of repo
home = os.path.expanduser('~')
repo = os.path.join(home, repo_path)
# Sub-directories of the repo holding the pickled corpus chunks and the saved models.
pickled_data, models = (
    os.path.join(repo, subdir) for subdir in ('pickled_data', 'models')
)

In [3]:
# One letter per chunk of the split pickle: _assemble_w2v_file appends each letter
# to 'processed_docs.pkl.a' to build a chunk file name, in this order.
postfix = 'abcdefghijklmno'
def _assemble_w2v_file(postfix):
    processed_docs = tempfile.NamedTemporaryFile(mode='ab', delete=True)

    for letter in postfix:
        fname = os.path.join(pickled_data,'processed_docs.pkl.a'+letter)
        with open(fname, 'rb') as infile:
            processed_docs.write(infile.read())

    return processed_docs

def preprocess(text):
    '''
    Tokenise ``text`` and return its lemmatized, stemmed tokens.

    Tokens that are in STOPWORDS or have 3 or fewer characters are dropped.
    Each remaining token is lemmatized as a verb (pos='v') and then stemmed
    with the module-level ``stemmer``, mirroring
    LDAModelDriver.lemmatize_stemming.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        The processed tokens, in document order.
    '''
    lemmatizer = WordNetLemmatizer()
    result = []
    # convert document into list of lowercase tokens, filter based on token length
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            # Inlined: the free function lemmatize_stemming() is not defined at
            # module level in this notebook (only as a method on LDAModelDriver),
            # so calling it here raised NameError.
            word = lemmatizer.lemmatize(token, pos='v')
            result.append(stemmer.stem(word))
    return result
    

In [4]:
# Reassemble the split pickle of pre-processed documents and load it.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project / trusted sources.
with _assemble_w2v_file(postfix) as processed_docs_file:
    # The temp file is written through a buffered handle; flush so the
    # separate read handle opened by name below sees every byte.
    processed_docs_file.flush()
    # Context manager guarantees the read handle is closed even if unpickling fails.
    with open(processed_docs_file.name, 'rb') as f:
        processed_docs = pickle.load(f)
# load models and data
with open(os.path.join(repo,'dictionary.pkl'), 'rb') as file:
    dictionary = pickle.load(file)
with open(os.path.join(repo,'bow_corpus.pkl'), 'rb') as file:
    bow_corpus = pickle.load(file)
#bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs] # bow rep for each article
lda_model = LdaMulticore.load(os.path.join(models, 'lda.model'))


In [5]:
class LDAModelDriver:
    '''
    Bundles a dictionary, a bag-of-words corpus and an LDA model, and prints
    the topics the model assigns to a piece of raw text.
    '''

    def __init__(self, dictionary, bow_corpus, lda_model):
        '''Store the three collaborators unchanged on the instance.'''
        self.lda_model = lda_model
        self.bow_corpus = bow_corpus
        self.dictionary = dictionary

    def lemmatize_stemming(self, text):
        '''
        Lemmatize a single token as a verb, then stem the result.

        Without a pos tag the lemmatizer treats every word as a noun;
        pos='v' makes it treat the word as a verb. Stemming uses the
        module-level ``stemmer``.
        '''
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def preprocess(self, text):
        '''
        Turn a document into a list of lemmatized, stemmed tokens.

        The document is converted into lowercase tokens; tokens found in
        STOPWORDS or with 3 or fewer characters are discarded.
        '''
        return [
            self.lemmatize_stemming(tok)
            for tok in simple_preprocess(text)
            if tok not in STOPWORDS and len(tok) > 3
        ]

    def model_topics(self, text):
        '''
        Print one line per topic the model yields for ``text``, highest
        score first, each with the topic's 5-word description.
        '''
        tokens = self.preprocess(text)
        bow_vector = self.dictionary.doc2bow(tokens)
        topic_scores = self.lda_model[bow_vector]
        # Sort on the negated score so the highest-scoring topic comes first.
        ranked = sorted(topic_scores, key=lambda pair: -pair[1])
        for topic_id, topic_score in ranked:
            description = self.lda_model.print_topic(topic_id, 5)
            print("Score: {}\t Topic: {}".format(topic_score, description))

In [13]:
from scrapers.cna_scraper import CNAScraper

# Article to classify.
url = 'https://www.channelnewsasia.com/news/asia/taiwan-train-crash-yilan-derail-9-year-old-youngest-10850902'
# Driver wrapping the dictionary, corpus and LDA model loaded in the earlier cell.
lda_model_driver = LDAModelDriver(dictionary, bow_corpus, lda_model)
# Fetch the article text, then print the topics the model assigns to it.
article_text = CNAScraper.get_text(url)
lda_model_driver.model_topics(article_text)

Prasing article..
