In [1]:
import os
import pandas as pd
import gensim
import nltk
import tempfile

from gensim.utils import simple_preprocess 
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pickle
from gensim.models import LdaMulticore
stemmer = PorterStemmer()

In [2]:
# Filesystem layout: everything lives under a repository checkout inside the user's home directory.
repo_path = 'Documents/repos/mystuff/lda101'  # change to path of repo
home = os.path.expanduser('~')
repo = os.path.join(home, repo_path)
# Sub-directories of the checkout: the split pickle parts and the saved model files.
pickled_data, models = (os.path.join(repo, subdir) for subdir in ('pickled_data', 'models'))

In [3]:
postfix = 'abcdefghijklmno'  # suffix letters of the split parts: processed_docs.pkl.aa ... processed_docs.pkl.ao


def _assemble_w2v_file(postfix):
    """Concatenate the split pickle parts into a single temporary file.

    For every character in ``postfix`` the part ``processed_docs.pkl.a<char>``
    inside ``pickled_data`` is appended, in order, to a fresh named temporary file.

    Args:
        postfix: iterable of single-character part suffixes, in concatenation order.

    Returns:
        The still-open ``tempfile.NamedTemporaryFile`` (append-binary mode). It is
        created with ``delete=True``, so the file disappears when it is closed; use
        it as a context manager and re-open it by ``.name`` to read the data back.

    Raises:
        OSError: if a part file cannot be opened or read (e.g. FileNotFoundError).
            The temporary file is closed before the error propagates.
    """
    import shutil  # local import so this cell does not depend on an edit to the import cell

    processed_docs = tempfile.NamedTemporaryFile(mode='ab', delete=True)
    try:
        for letter in postfix:
            fname = os.path.join(pickled_data, 'processed_docs.pkl.a' + letter)
            with open(fname, 'rb') as infile:
                # Stream the part across instead of loading it into memory in one piece.
                shutil.copyfileobj(infile, processed_docs)
        # Callers read the data through a second handle opened by name, so buffered
        # bytes must reach the file first or the reassembled pickle would be truncated.
        processed_docs.flush()
    except BaseException:
        # Do not leak the temporary file when a part is missing or unreadable.
        processed_docs.close()
        raise
    return processed_docs

In [4]:
# Rebuild the full pickle from its parts and load the preprocessed documents.
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
with _assemble_w2v_file(postfix) as processed_docs_file:
    # Re-open by name for reading (the temp handle is opened in append mode); the
    # `with` guarantees the reader is closed even if unpickling fails.
    with open(processed_docs_file.name, 'rb') as f:
        processed_docs = pickle.load(f)
# load models and data
with open(os.path.join(repo, 'dictionary.pkl'), 'rb') as file:
    dictionary = pickle.load(file)
with open(os.path.join(repo, 'bow_corpus.pkl'), 'rb') as file:
    bow_corpus = pickle.load(file)
# Alternative to loading the pickle: rebuild the bag-of-words corpus from the documents.
#bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs] # bow rep for each article
lda_model = LdaMulticore.load(os.path.join(models, 'lda.model'))


In [5]:
class LDAModelDriver:
    """Bundle a dictionary, a bag-of-words corpus and an LDA model to score raw text."""

    def __init__(self, dictionary, bow_corpus, lda_model):
        """Keep references to the three objects supplied by the caller."""
        self.dictionary, self.bow_corpus, self.lda_model = dictionary, bow_corpus, lda_model

    def lemmatize_stemming(self, text):
        '''
        Lemmatize a single token and then stem it.

        Without a pos tag the lemmatizer treats every word as a noun; pos='v' tells it to
        treat the word as a verb. The lemma is then passed to the module-level `stemmer`.
        '''
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def preprocess(self, text):
        '''
        Turn a document into a list of normalised tokens.

        Tokens come from simple_preprocess; stopwords and tokens of three characters or
        fewer are dropped, and every remaining token is lemmatized and stemmed.
        '''
        return [
            self.lemmatize_stemming(tok)
            for tok in simple_preprocess(text)
            if len(tok) > 3 and tok not in STOPWORDS
        ]

    def model_topics(self, text):
        '''
        Print the topics the model assigns to `text`, highest score first.

        Each printed line shows the score and the topic rendered with its top 5 terms.
        '''
        bow_vector = self.dictionary.doc2bow(self.preprocess(text))
        ranked = sorted(self.lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
        for topic_id, weight in ranked:
            topic_repr = self.lda_model.print_topic(topic_id, 5)
            print("Score: {}\t Topic: {}".format(weight, topic_repr))

In [15]:
# Demo: fetch a live BBC article and print the topics the trained model assigns to it.
from scrapers.bbc_scraper import BBCScraper

url = 'https://www.bbc.com/news/science-environment-45918770'
scraper = BBCScraper  # bound to the class itself; get_text is called on it without instantiating
lda_model_driver = LDAModelDriver(dictionary, bow_corpus, lda_model)
article_text = scraper.get_text(url)
lda_model_driver.model_topics(article_text)

Prasing article..
200
Score: 0.5117155313491821	 Topic: 0.017*"island" + 0.005*"park" + 0.005*"nation" + 0.005*"citi" + 0.005*"north"
Score: 0.29152533411979675	 Topic: 0.008*"speci" + 0.006*"sexual" + 0.005*"anim" + 0.004*"human" + 0.004*"male"
Score: 0.12298126518726349	 Topic: 0.012*"seri" + 0.011*"episod" + 0.007*"film" + 0.007*"star" + 0.006*"anim"
Score: 0.0701688900589943	 Topic: 0.023*"game" + 0.014*"team" + 0.013*"player" + 0.010*"season" + 0.008*"play"


In [17]:
# Second demo input: a pasted news article (about an NTU career fair), scored directly
# from the string below rather than fetched through the scraper.
text = """Nanyang Technological University (NTU) said yesterday (Oct 8) that it “regrets” holding a career fair last month mostly for top-performing students only, adding that it will open the event to the entire graduating cohort next year.
From 2019, regardless of grades, students will be allowed to submit their resumes in advance by posting them on a website for employers to consider. The employers taking part in the NTUtopia career fair will then draw up a shortlist of students to be invited to the event.
NTU said this would be a change from the format of the event last month, in which the university itself shortlisted the invitees based on their academic results and co-curricular activity (CCA) records. About 3,600 students or 60 per cent of the graduating cohort were eventually invited.
Some students told The Sunday Times, which broke the story, that the event had exclusively targeted those from scholar programmes or with a minimum grade point average of 3.75 out of 5.
The criteria sparked an outcry among students, including those invited to the event, with many saying that it was unfair to limit participation based on grades.
Others also criticised the NTU move.
Chairperson of Government Parliamentary Committee for Education Denise Phua said: “Academic results is only one proxy indicator to determine the suitability of an applicant.
“These days, there are jobs that require skills, experiences and attributes not related to one’s academic scores.”
Managing partner of executive headhunting firm Leadership Advisory Inc Daniel Soh felt that career fairs were meant for employers to meet a diverse slate of candidates regardless of academic results.
Mr Paul Heng, managing director of NeXT Career Consulting Group, said: “To have a targeted career fair spells elitism, which is what we’re supposed to be moving away from.
“From a human resource perspective, to be able to perform a job well, you need many more qualities than academic performance - what about soft skills or communication skills?”
A spokesman from the Ministry of Education (MOE), which was among 45 participating employers in NTUtopia, told ST earlier that organising exclusive career fairs could “send the wrong message, and our education institutions should avoid that”.
On Sept 28, MOE announced a slew of changes to exams and assessments at the primary and secondary school level. Education Minister Ong Ye Kung said the motivation was to move away from a narrow focus on grades and help students discover the joy of learning.
In a statement yesterday, Professor Tan Ooi Kiang, NTU’s associate provost (undergraduate education), expressed regret for the controversy. 
He said: “We acknowledge that by making NTUtopia selective on the basis of academic achievements and CCA records, it may have sent an unintended message about employability factors.”
“Moving forward, NTU will encourage employers to look at a broader set of criteria, such as CCA involvement and leadership, internship experience and overseas exposure,” he added.
By the end of this month, all NTU graduating students will be invited to upload their resumes on a website that major employers, including the 45 that participated in NTUtopia, can access.
This means students who missed out on last month’s NTUtopia can still be considered by the participating employers.
Final-year NTU School of Social Sciences student Hee Yu Quan, 24, who was invited to the fair, praised the “prompt and appropriate” action by NTU. 
“I feel it treats the feedback loop quite seriously,” he told The Straits Times."""
# Prints one "Score ... Topic ..." line for each topic the model assigns to the article.
lda_model_driver.model_topics(text)

Score: 0.38810989260673523	 Topic: 0.007*"airlin" + 0.007*"univers" + 0.006*"compani" + 0.005*"engin" + 0.005*"aircraft"
Score: 0.2543071508407593	 Topic: 0.011*"parti" + 0.008*"govern" + 0.008*"elect" + 0.006*"polit" + 0.005*"minist"
Score: 0.13664384186267853	 Topic: 0.026*"school" + 0.009*"colleg" + 0.005*"univers" + 0.004*"student" + 0.004*"educ"
Score: 0.08387739956378937	 Topic: 0.009*"econom" + 0.007*"social" + 0.006*"develop" + 0.006*"countri" + 0.005*"world"
Score: 0.0505591444671154	 Topic: 0.009*"file" + 0.007*"window" + 0.006*"version" + 0.006*"user" + 0.006*"chess"
Score: 0.029956869781017303	 Topic: 0.017*"album" + 0.014*"record" + 0.014*"music" + 0.012*"band" + 0.010*"song"
Score: 0.02955508604645729	 Topic: 0.011*"chines" + 0.008*"dynasti" + 0.008*"roman" + 0.006*"china" + 0.005*"emperor"
Score: 0.023050619289278984	 Topic: 0.008*"stori" + 0.008*"book" + 0.007*"write" + 0.007*"novel" + 0.006*"publish"
