# Resources followed in this notebook
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [1]:
# Basics
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

# Plotting
import plotly.express as px

# Database
from JobsDb import JobsDb

# Data Processing
from DataProcessor import data_processor

# Build corpus
from gensim import corpora

# Latent Dirichlet Allocation
from gensim import models


## Loading the Data

In [2]:
# Load the 'jobs' table, making sure the database handle is closed even when
# the load itself raises.
db = JobsDb()
try:
    df = db.load_table_as_df('jobs')
finally:
    db.close()

# Keep only the rows from position 9680 onward.
df = df.iloc[9680:]

# Work on an independent copy with a fresh 0-based index; the 'index' column
# produced by reset_index() and the 'id' column are both dropped.
data = df.copy()
data = data.reset_index().drop(['id', 'index'], axis=1)

# NOTE: this reports the shape of `df` (before the columns are dropped),
# not the shape of `data`.
print(df.shape)
data.head()

(9485, 4)


Unnamed: 0,title,url,description
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


## Extracting Documents

In [3]:
# Collect every job description into a plain list and keep the first one as
# the sample document used by the step-by-step cells that follow.
docs = [*data['description']]
doc = docs[0]
doc

'\\nCompany Overview  Come join a Winning Team! Since 1970, Plastic Express has been leading the bulk trucking, bulk terminal, packaging, and warehousing needs of the plastics industry. Our strategic locations, modern systems, and dedicated employees allow us to provide custom tailored logistical solutions to fulfill the most challenging needs of our customers. Plastic Express operates from 15 warehouse locations and 37 rail terminals across the US. At many of the Plastic Express sites, we also handle some non-plastic commodities, which include; paper rolls, steel, building materials and other dry bulk materials. Plastic Express owns and operates roughly 130 trucks, with approximately 200 trailers performing full bulk truck distribution business. Plastic Express is headquartered in City of Industry, CA and has over 300 employees nationwide. Our goal has always been to exceed our customers’ expectations, and our “Can Do” attitude is what differentiates us from the competition.  Position

## Processing Data

In [None]:
# Run the imported `data_processor` helper over the full list of descriptions.
# NOTE(review): `processed_data` is not referenced again in the cells visible
# in this notebook, which build `processed_docs` instead - confirm which one
# is meant to feed the model.
processed_data = data_processor(docs)

## Tokenizing Documents

In [None]:
def doc_tokenizer(doc):
    """Split a raw document into sentences of lowercase word tokens.

    Args:
        doc: Raw document text. Literal backslash-n sequences (two characters,
            as seen in the stored job descriptions) are treated as whitespace.

    Returns:
        A list with one entry per sentence, each entry being the list of word
        tokens produced by ``word_tokenize`` for that sentence.
    """
    # Substitute a space rather than an empty string: deleting the marker
    # would glue together the words on either side of it.
    doc = doc.replace('\\n', ' ').lower()
    return [word_tokenize(sentence) for sentence in sent_tokenize(doc)]

In [None]:
# Time the tokenizer on the sample document, then display the nested
# sentence/token lists.
%time doc_tokens = doc_tokenizer(doc)
doc_tokens

## Parts of Speech Tagging

In [None]:
# Inspired from https://stackoverflow.com/a/15590384
def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching wordnet POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields an empty string.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, mirroring the branch-by-branch
            # attribute access of an if/elif chain.
            return getattr(wordnet, constant_name)
    return ''

def sentence_pos_tagger(sentence):
    """Tag a tokenized sentence and return (token, wordnet POS) pairs.

    The sentence is tagged with ``pos_tag`` and each resulting treebank tag
    is converted through ``get_wordnet_pos``.
    """
    tagged = []
    for token, treebank in pos_tag(sentence):
        tagged.append((token, get_wordnet_pos(treebank)))
    return tagged

def doc_pos_tagger(doc_tokens):
    """POS-tag every sentence of a tokenized document.

    Returns a list with one entry per sentence, each entry being the output
    of ``sentence_pos_tagger`` for that sentence.
    """
    tagged_sentences = []
    for sentence in doc_tokens:
        tagged_sentences.append(sentence_pos_tagger(sentence))
    return tagged_sentences

In [None]:
# Time POS tagging of the sample document's sentences, then display the
# tagged tokens.
%time doc_tags = doc_pos_tagger(doc_tokens)
doc_tags

## Lemmatization

In [None]:
# Single lemmatizer instance shared by tag_lemmatizer().
lemmatizer = WordNetLemmatizer()

def tag_lemmatizer(pos_tag):
    """Lemmatize one POS-tagged word.

    ``pos_tag`` is a (word, wordnet POS) pair. When the POS is the empty
    string the word is returned unchanged; otherwise it is passed, together
    with its POS, to the shared ``lemmatizer``.
    """
    tag = pos_tag[1]
    word = pos_tag[0]
    if tag == '':
        return word
    return lemmatizer.lemmatize(word, tag)
    
def sentence_lemmatizer(sentence_tags):
    """Lemmatize every POS-tagged word of one tagged sentence.

    Returns the lemmatized words in their original order.
    """
    lemmas = []
    for tagged_word in sentence_tags:
        lemmas.append(tag_lemmatizer(tagged_word))
    return lemmas

def doc_lemmatizer(doc_tags):
    """Lemmatize the tagged words of a job doc and flatten sentence nesting.

    Each tagged sentence is lemmatized with ``sentence_lemmatizer`` and the
    results are concatenated into a single flat list of words.
    """
    return [
        lemma
        for sentence_tags in doc_tags
        for lemma in sentence_lemmatizer(sentence_tags)
    ]

In [None]:
# Time lemmatization of the tagged sample document, then display the flat
# list of lemmatized words.
%time lemmatized_doc = doc_lemmatizer(doc_tags)
lemmatized_doc

## Removing Stopwords and Punctuation

In [None]:
def clean_doc(lemmatized_doc):
    """Drop stopwords, non-alphabetic tokens and single-character tokens.

    Args:
        lemmatized_doc: Flat iterable of word strings.

    Returns:
        A list of the words that are purely alphabetic, are not in the
        English stopword list, and are longer than one character, in their
        original order.
    """
    # A set makes each membership test constant-time; testing against the
    # list returned by stopwords.words() would rescan it for every token.
    my_stopwords = frozenset(stopwords.words('english'))
    cleaned_doc = [
        word for word in lemmatized_doc
        if word.isalpha() and word not in my_stopwords
        and len(word) > 1
    ]
    return cleaned_doc

In [None]:
# Time stopword/punctuation removal on the lemmatized sample document, then
# display the surviving words.
%time cleaned_doc = clean_doc(lemmatized_doc)
cleaned_doc

## Make Bigrams and Trigrams

In [None]:
def combine_grams(cleaned_doc):
    """Merge word pairs/triples of one document into phrase tokens.

    Args:
        cleaned_doc: Flat list of token strings for a single document.

    Returns:
        Flat list of tokens for the same document after applying a bigram
        ``Phrases`` model followed by a second ``Phrases`` model trained on
        the bigram output (with ``min_count=1``).
    """
    # Phrases is trained on an iterable of sentences, each a list of tokens.
    # Handing it the flat token list would make every word look like a
    # "sentence" of single characters, so no word-level phrase could ever be
    # learned. Wrap the document as a one-sentence corpus for training.
    # NOTE(review): the models are still fitted per document; confirm whether
    # training on the whole corpus was intended.
    sentences = [cleaned_doc]
    bigram_model = Phrases(sentences)
    trigram_model = Phrases(bigram_model[sentences], min_count=1)
    # Applying the models to the flat list treats it as a single sentence,
    # which keeps the returned value a flat list of tokens.
    processed_doc = list(trigram_model[bigram_model[cleaned_doc]])
    return processed_doc

In [None]:
# Time the bigram/trigram merge on the cleaned sample document, then display
# the resulting tokens.
%time processed_doc = combine_grams(cleaned_doc)
processed_doc

## Data Processing

In [None]:
def doc_processor(doc):
    """Run one raw document through the whole preprocessing pipeline.

    Stages, in order: tokenization, POS tagging, lemmatization, stopword and
    punctuation removal, and bigram/trigram merging. Returns the output of
    the final stage.
    """
    stages = (
        doc_tokenizer,
        doc_pos_tagger,
        doc_lemmatizer,
        clean_doc,
        combine_grams,
    )
    result = doc
    for stage in stages:
        # Each stage consumes the previous stage's output.
        result = stage(result)
    return result

In [None]:
# Run the full pipeline on the sample document and check that it reproduces
# the result obtained from the step-by-step cells.
%time processed_doc_2 = doc_processor(doc)
assert processed_doc == processed_doc_2, "Should match"

In [None]:
# Process every job description; `processed_docs` holds one token list per
# document.
%time processed_docs = [doc_processor(doc) for doc in docs]

## Make token dictionary and corpus

In [None]:
# Build the token dictionary from all processed documents, prune it with
# filter_extremes(no_below=3), then convert each document's token list into
# its bag-of-words representation.
dictionary_LDA = corpora.Dictionary(processed_docs)
dictionary_LDA.filter_extremes(no_below=3)
corpus = list(map(dictionary_LDA.doc2bow, processed_docs))

## Fit Model

In [None]:
# Re-seed numpy's global random state right before training.
np.random.seed(123456)

num_topics = 2

# Constant priors: one 0.01 entry per topic (alpha) and one per dictionary
# token (eta).
alpha = num_topics * [0.01]
eta = len(dictionary_LDA) * [0.01]

lda_model = models.LdaModel(
    corpus,
    id2word=dictionary_LDA,
    num_topics=num_topics,
    alpha=alpha,
    eta=eta,
    passes=4,
)

# Persist the trained model; the topic count is part of the file name.
lda_model.save(f'../model/lda-{num_topics}topics')

## Inspecting Topics

In [None]:
# Print every topic as "<id>: <formatted words>" (20 words per topic),
# followed by a blank line.
topics = lda_model.show_topics(
    num_topics=num_topics,
    num_words=20,
    formatted=True,
)
for topic_id, topic_words in topics:
    print(f"{topic_id}: {topic_words}", end="\n\n")

In [None]:
# Query the trained model with the bag-of-words of the first document and
# display the result.
lda_model[corpus[0]]

In [None]:
# Interactive topic visualisation with pyLDAvis.
# Background: https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Short legend for the visualisation:
# - size of bubble: proportional to each topic's share of the N total tokens
#   in the corpus
# - red bars: estimated number of times a given term was generated by a given topic
# - blue bars: overall frequency of each term in the corpus
# - relevance of words is computed with a parameter lambda; the optimal
#   value is reported as ~0.6
#   (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
# NOTE(review): newer pyLDAvis releases may expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, 
    corpus=corpus, 
    dictionary=dictionary_LDA
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)