# Following this resource
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [18]:
# Basics
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

# Timing code execution
from tqdm import tqdm

# Flatten nested objects quickly
from itertools import chain

# Plotting
import plotly.express as px

# Database
from JobsDb import JobsDb

# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# Stopwords
from nltk.corpus import stopwords

# Parts of Speach Tagging
from nltk import pos_tag

# Lemmatization
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

# n-grams
from gensim.models import Phrases

from gensim import corpora

# Latent Dirichlet Allocation
from gensim import models


## Loading the Data

In [2]:
db = JobsDb()
df = db.load_table_as_df('jobs')
tqdm.pandas()
db.close()
df = df.iloc[9680:]
data = df.copy()
data = data.reset_index().drop(['id', 'index'], axis=1)
print(df.shape)
data.head()

(9485, 4)


Unnamed: 0,title,url,description
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


## Extracting Documents

In [3]:
docs = list(data['description'])
doc = docs[0]
doc

'\\nCompany Overview  Come join a Winning Team! Since 1970, Plastic Express has been leading the bulk trucking, bulk terminal, packaging, and warehousing needs of the plastics industry. Our strategic locations, modern systems, and dedicated employees allow us to provide custom tailored logistical solutions to fulfill the most challenging needs of our customers. Plastic Express operates from 15 warehouse locations and 37 rail terminals across the US. At many of the Plastic Express sites, we also handle some non-plastic commodities, which include; paper rolls, steel, building materials and other dry bulk materials. Plastic Express owns and operates roughly 130 trucks, with approximately 200 trailers performing full bulk truck distribution business. Plastic Express is headquartered in City of Industry, CA and has over 300 employees nationwide. Our goal has always been to exceed our customers’ expectations, and our “Can Do” attitude is what differentiates us from the competition.  Position

## Tokenizing Documents

In [4]:
def doc_tokenizer(doc):
    doc = doc.replace('\\n','').lower()
    sentences = sent_tokenize(doc)
    doc_tokens = [word_tokenize(sentence) for sentence in sentences]
    return doc_tokens

In [5]:
%time doc_tokens = doc_tokenizer(doc)
doc_tokens

CPU times: user 40.6 ms, sys: 7.59 ms, total: 48.1 ms
Wall time: 49.1 ms


[['company', 'overview', 'come', 'join', 'a', 'winning', 'team', '!'],
 ['since',
  '1970',
  ',',
  'plastic',
  'express',
  'has',
  'been',
  'leading',
  'the',
  'bulk',
  'trucking',
  ',',
  'bulk',
  'terminal',
  ',',
  'packaging',
  ',',
  'and',
  'warehousing',
  'needs',
  'of',
  'the',
  'plastics',
  'industry',
  '.'],
 ['our',
  'strategic',
  'locations',
  ',',
  'modern',
  'systems',
  ',',
  'and',
  'dedicated',
  'employees',
  'allow',
  'us',
  'to',
  'provide',
  'custom',
  'tailored',
  'logistical',
  'solutions',
  'to',
  'fulfill',
  'the',
  'most',
  'challenging',
  'needs',
  'of',
  'our',
  'customers',
  '.'],
 ['plastic',
  'express',
  'operates',
  'from',
  '15',
  'warehouse',
  'locations',
  'and',
  '37',
  'rail',
  'terminals',
  'across',
  'the',
  'us',
  '.'],
 ['at',
  'many',
  'of',
  'the',
  'plastic',
  'express',
  'sites',
  ',',
  'we',
  'also',
  'handle',
  'some',
  'non-plastic',
  'commodities',
  ',',
  'which',


## Parts of Speech Tagging

In [6]:
# Inspired from https://stackoverflow.com/a/15590384
def get_wordnet_pos(treebank_tag):
    """Converts a treebank POS tag to a wordnet POS tag."""
    if treebank_tag.startswith('J'):
        tag = wordnet.ADJ
    elif treebank_tag.startswith('V'):
        tag = wordnet.VERB
    elif treebank_tag.startswith('N'):
        tag = wordnet.NOUN
    elif treebank_tag.startswith('R'):
        tag = wordnet.ADV
    else:
        tag = ''
    return tag

def sentence_pos_tagger(sentence):
    """Takes a sentence as a list of tokens and returns a list of wordnet POS tagged tokens"""
    treebank_tags = pos_tag(sentence)
    wordnet_tags = [ 
        (treebank_tag[0], get_wordnet_pos(treebank_tag[1])) for treebank_tag in treebank_tags
    ]
    return wordnet_tags

def doc_pos_tagger(doc_tokens):
    pos_tags = [
        sentence_pos_tagger(sentence) for sentence in doc_tokens
    ]
    return pos_tags

In [7]:
%time doc_tags = doc_pos_tagger(doc_tokens)
doc_tags

CPU times: user 2.57 s, sys: 120 ms, total: 2.69 s
Wall time: 2.74 s


[[('company', 'n'),
  ('overview', 'v'),
  ('come', 'v'),
  ('join', 'n'),
  ('a', ''),
  ('winning', 'v'),
  ('team', 'n'),
  ('!', '')],
 [('since', ''),
  ('1970', ''),
  (',', ''),
  ('plastic', 'n'),
  ('express', 'n'),
  ('has', 'v'),
  ('been', 'v'),
  ('leading', 'v'),
  ('the', ''),
  ('bulk', 'n'),
  ('trucking', 'n'),
  (',', ''),
  ('bulk', 'a'),
  ('terminal', 'n'),
  (',', ''),
  ('packaging', 'n'),
  (',', ''),
  ('and', ''),
  ('warehousing', 'v'),
  ('needs', 'n'),
  ('of', ''),
  ('the', ''),
  ('plastics', 'n'),
  ('industry', 'n'),
  ('.', '')],
 [('our', ''),
  ('strategic', 'a'),
  ('locations', 'n'),
  (',', ''),
  ('modern', 'a'),
  ('systems', 'n'),
  (',', ''),
  ('and', ''),
  ('dedicated', 'v'),
  ('employees', 'n'),
  ('allow', 'v'),
  ('us', ''),
  ('to', ''),
  ('provide', 'v'),
  ('custom', 'n'),
  ('tailored', 'a'),
  ('logistical', 'a'),
  ('solutions', 'n'),
  ('to', ''),
  ('fulfill', 'v'),
  ('the', ''),
  ('most', 'r'),
  ('challenging', 'a'),
  ('

## Lemmatization

In [8]:
lemmatizer = WordNetLemmatizer()

def tag_lemmatizer(pos_tag):
    """Lemmatized a POS tagged word."""
    if pos_tag[1] != '':
        lemmatized_word = lemmatizer.lemmatize(pos_tag[0], pos_tag[1])
    else:
        lemmatized_word = pos_tag[0]
    return lemmatized_word
    
def sentence_lemmatizer(sentence_tags):
    """Lemmatize POS tagged words from a tagged sentence."""
    lemmatized_sentence = [
        tag_lemmatizer(pos_tag) for pos_tag in sentence_tags
    ]
    return lemmatized_sentence

def doc_lemmatizer(doc_tags):
    """Lemmetize tagged words from a job doc and flatten sentence nesting."""
    lemmatized_doc = []
    for sentence_tags in doc_tags:
        lemmatized_sentence = sentence_lemmatizer(sentence_tags) 
        lemmatized_doc.extend(lemmatized_sentence)
    return lemmatized_doc    

In [9]:
%time lemmatized_doc = doc_lemmatizer(doc_tags)
lemmatized_doc

CPU times: user 4.18 ms, sys: 17 µs, total: 4.19 ms
Wall time: 4.21 ms


['company',
 'overview',
 'come',
 'join',
 'a',
 'win',
 'team',
 '!',
 'since',
 '1970',
 ',',
 'plastic',
 'express',
 'have',
 'be',
 'lead',
 'the',
 'bulk',
 'trucking',
 ',',
 'bulk',
 'terminal',
 ',',
 'packaging',
 ',',
 'and',
 'warehouse',
 'need',
 'of',
 'the',
 'plastic',
 'industry',
 '.',
 'our',
 'strategic',
 'location',
 ',',
 'modern',
 'system',
 ',',
 'and',
 'dedicate',
 'employee',
 'allow',
 'us',
 'to',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'to',
 'fulfill',
 'the',
 'most',
 'challenging',
 'need',
 'of',
 'our',
 'customer',
 '.',
 'plastic',
 'express',
 'operate',
 'from',
 '15',
 'warehouse',
 'location',
 'and',
 '37',
 'rail',
 'terminal',
 'across',
 'the',
 'us',
 '.',
 'at',
 'many',
 'of',
 'the',
 'plastic',
 'express',
 'site',
 ',',
 'we',
 'also',
 'handle',
 'some',
 'non-plastic',
 'commodity',
 ',',
 'which',
 'include',
 ';',
 'paper',
 'roll',
 ',',
 'steel',
 ',',
 'building',
 'material',
 'and',
 'other',
 'dr

## Removing Stopwords and Punctuation

In [10]:
def clean_doc(lemmatized_doc):
    my_stopwords = stopwords.words('english')
    cleaned_doc = [
        word for word in lemmatized_doc
        if word.isalpha() and word not in my_stopwords
        and len(word)>1
    ]
    return cleaned_doc

In [11]:
%time cleaned_doc = clean_doc(lemmatized_doc)
cleaned_doc

CPU times: user 3.66 ms, sys: 0 ns, total: 3.66 ms
Wall time: 3.22 ms


['company',
 'overview',
 'come',
 'join',
 'win',
 'team',
 'since',
 'plastic',
 'express',
 'lead',
 'bulk',
 'trucking',
 'bulk',
 'terminal',
 'packaging',
 'warehouse',
 'need',
 'plastic',
 'industry',
 'strategic',
 'location',
 'modern',
 'system',
 'dedicate',
 'employee',
 'allow',
 'us',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'fulfill',
 'challenging',
 'need',
 'customer',
 'plastic',
 'express',
 'operate',
 'warehouse',
 'location',
 'rail',
 'terminal',
 'across',
 'us',
 'many',
 'plastic',
 'express',
 'site',
 'also',
 'handle',
 'commodity',
 'include',
 'paper',
 'roll',
 'steel',
 'building',
 'material',
 'dry',
 'bulk',
 'material',
 'plastic',
 'express',
 'operate',
 'roughly',
 'truck',
 'approximately',
 'trailer',
 'perform',
 'full',
 'bulk',
 'truck',
 'distribution',
 'business',
 'plastic',
 'express',
 'headquarter',
 'city',
 'industry',
 'ca',
 'employee',
 'nationwide',
 'goal',
 'always',
 'exceed',
 'customer',
 'expectat

## Make Bigrams and Trigrams

In [12]:
def combine_grams(cleaned_doc):
    bigram_model = Phrases(cleaned_doc)
    trigram_model = Phrases(bigram_model[cleaned_doc], min_count=1)
    processed_doc = list(trigram_model[bigram_model[cleaned_doc]])
    return processed_doc

In [13]:
%time processed_doc = combine_grams(cleaned_doc)
processed_doc

CPU times: user 21.9 ms, sys: 0 ns, total: 21.9 ms
Wall time: 23 ms


['company',
 'overview',
 'come',
 'join',
 'win',
 'team',
 'since',
 'plastic',
 'express',
 'lead',
 'bulk',
 'trucking',
 'bulk',
 'terminal',
 'packaging',
 'warehouse',
 'need',
 'plastic',
 'industry',
 'strategic',
 'location',
 'modern',
 'system',
 'dedicate',
 'employee',
 'allow',
 'us',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'fulfill',
 'challenging',
 'need',
 'customer',
 'plastic',
 'express',
 'operate',
 'warehouse',
 'location',
 'rail',
 'terminal',
 'across',
 'us',
 'many',
 'plastic',
 'express',
 'site',
 'also',
 'handle',
 'commodity',
 'include',
 'paper',
 'roll',
 'steel',
 'building',
 'material',
 'dry',
 'bulk',
 'material',
 'plastic',
 'express',
 'operate',
 'roughly',
 'truck',
 'approximately',
 'trailer',
 'perform',
 'full',
 'bulk',
 'truck',
 'distribution',
 'business',
 'plastic',
 'express',
 'headquarter',
 'city',
 'industry',
 'ca',
 'employee',
 'nationwide',
 'goal',
 'always',
 'exceed',
 'customer',
 'expectat

## Data Processing

In [14]:
def doc_processor(doc):
    doc_tokens = doc_tokenizer(doc)
    doc_tags = doc_pos_tagger(doc_tokens)
    lemmatized_doc = doc_lemmatizer(doc_tags)
    cleaned_doc = clean_doc(lemmatized_doc)
    processed_doc = combine_grams(cleaned_doc)
    return processed_doc

In [15]:
%time processed_doc_2 = doc_processor(doc)
assert processed_doc == processed_doc_2, "Should match"

CPU times: user 57.8 ms, sys: 8.14 ms, total: 65.9 ms
Wall time: 63.6 ms


In [16]:
%time processed_docs = [doc_processor(doc) for doc in docs]

CPU times: user 10min 9s, sys: 2.35 s, total: 10min 11s
Wall time: 10min 12s
