# Convert LDA Topics to d2v Vectors
- Richard Kuzma, 8SEP2020

## Load LDA models

In [1]:
### Imports

# basic
from pprint import pprint
import pickle

# data science
import pandas as pd

# NLP
import gensim
from gensim.models import CoherenceModel, LdaModel

# plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [2]:
### use LDA probability distribution to find the center vector of each topic

def LDA_to_words(LDA, W2V, words_per_topic=100, show_top_words=20):
    """Map each LDA topic to its nearest words in a word-vector space.

    For every topic, the top ``words_per_topic`` (word, probability) pairs are
    taken from ``LDA.show_topic``; each word's vector is looked up in ``W2V``
    and multiplied by the word's topic probability. The resulting weighted
    vectors are passed together as the ``positive`` query to ``most_similar``.
    Words missing from the vector vocabulary (``KeyError``) are skipped and
    counted. A side-by-side report is printed for every topic.

    Parameters
    ----------
    LDA : object exposing ``num_topics`` and ``show_topic(topic_id, topn=...)``
        (e.g. a gensim ``LdaModel``).
    W2V : word vectors supporting ``vectors[word]`` and
        ``most_similar(positive=..., topn=...)``. Either a bare vector store
        (e.g. gensim ``KeyedVectors``) or a full model that keeps its vectors
        under a ``.wv`` attribute.
    words_per_topic : int
        How many top topic words are used to build the query for each topic.
    show_top_words : int
        How many nearest words to retrieve, and how many topic words to print.

    Returns
    -------
    list
        One entry per topic, in topic order: the ``most_similar`` result for
        that topic, or an empty list when none of the topic's words had a
        vector (``most_similar`` cannot be queried with no input).
    """
    # Resolve the vector store once so lookup and search go through the same
    # object: full models keep vectors under `.wv`; bare KeyedVectors are used
    # as-is (their `.wv` alias is deprecated and absent in newer gensim).
    vectors = getattr(W2V, 'wv', W2V)

    ### use LDA probability distribution to build weighted word vectors per topic
    pos_all = []
    total_missed_words = 0
    total_words_searched = 0

    for i in range(LDA.num_topics):  # for each LDA topic
        # Fetch the topic's words once instead of re-querying per word.
        topic_terms = LDA.show_topic(i, topn=words_per_topic)
        # A topic may return fewer than `words_per_topic` words; count what
        # was actually searched.
        total_words_searched += len(topic_terms)

        pos_topic = []
        for word, weight in topic_terms:
            try:
                # multiply word vector by the word's topic probability
                pos_topic.append(vectors[word] * float(weight))
            except KeyError:
                # word is not in the vector vocabulary
                total_missed_words += 1

        pos_all.append(pos_topic)

    print('Missed words: {}. Total words searched: {}\n\n'.format(total_missed_words, total_words_searched))

    ### use weighted word vectors to find the most similar words for each topic
    similar_to_LDA_topics = []
    for pos_topic in pos_all:
        if pos_topic:
            similar_to_LDA_topics.append(vectors.most_similar(positive=pos_topic, topn=show_top_words))
        else:
            # Every word of this topic was missing: nothing to query with.
            similar_to_LDA_topics.append([])

    for i in range(0, len(similar_to_LDA_topics)):
        print('LDA Topic #{} word probability distribution \n'.format(i))
        pprint(LDA.show_topic(i, topn=show_top_words))
        print('\nTopic #{} Nearest word vectors \n'.format(i))
        pprint(similar_to_LDA_topics[i])
        print('\n\n' +'*'*50 + '\n\n')

    print('type similar_to_LDA_topics: {}'.format(type(similar_to_LDA_topics)))
    print('len similar_to_LDA_topics: {}'.format(len(similar_to_LDA_topics)))
    if similar_to_LDA_topics:  # guard: a model with zero topics has no first entry
        print('similar_to_LDA_topics[0]: {}'.format(similar_to_LDA_topics[0]))
    return similar_to_LDA_topics
    
   

### Load LDA Models

In [3]:
# Directory that holds the pickled LDA models.
path = '/Users/richardkuzma/coding/analysis/monster/models/'

# The 90-topic model is the preferred one; the 40-topic model is loaded as
# well and is used by the 40-topic sections further down.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
filename = 'monster_jobs_LDA_40_topics_cv_zero476.pkl'
with open(path + filename, mode='rb') as f:
    LDA_40 = pickle.load(f)

filename = 'monster_jobs_LDA_90_topics_cv_zero461.pkl'
with open(path + filename, mode='rb') as f:
    LDA_90 = pickle.load(f)


### Load pre-trained word vectors from Google 
- Gensim uses 'KeyedVectors' to hold just word vectors, not full model. Saves RAM but can't be trained
- Google's word vectors https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
    - More info from Google: https://code.google.com/archive/p/word2vec/
    - Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases

In [56]:
# Number of word vectors read from the Google News file.
# Earlier runs tried 500k and 100k; the current setting is 50k.
GOOG_VOCAB_LIMIT = 50000

goog50k = gensim.models.KeyedVectors.load_word2vec_format(
    '/Users/richardkuzma/coding/analysis/utils/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=GOOG_VOCAB_LIMIT,
)
# Report the size/dimension actually used so the message cannot drift from the code.
print('loaded google news word vectors  of size {:,} with dimension {} vectors'.format(GOOG_VOCAB_LIMIT, goog50k.vector_size))
type(goog50k)

loaded google news word vectors  of size 50,000 with dimension 300 vectors


gensim.models.keyedvectors.Word2VecKeyedVectors

### Load pre-trained word vectors from Facebook FastText
- FastText vectors available for download here: https://fasttext.cc/docs/en/english-vectors.html
    - I chose #2: wiki-news-300d-1M-subword.vec.zip: 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- Converting to binary from a vec
    - https://github.com/facebookresearch/fastText/issues/171#issuecomment-294295302

In [5]:
# Load the FastText vectors (converted from .vec to word2vec binary format)
# as gensim KeyedVectors.
FT = gensim.models.KeyedVectors.load_word2vec_format(
    '/Users/richardkuzma/coding/analysis/utils/wiki-news-300d-1M-subword.vec.bin',
    binary=True,
)
print('loaded FastText w2v vectors with dimension 300 vectors')
type(FT)

loaded FastText w2v vectors with dimension 300 vectors


gensim.models.keyedvectors.Word2VecKeyedVectors

## LDA (90 topics, 100 words per topic) <> Google News 

In [57]:
# 90-topic LDA, top 100 words per topic as the weighted query,
# Google News vectors; report the 25 nearest words for each topic.
goog_90_100 = LDA_to_words(
    LDA_90,
    goog50k,
    words_per_topic=100,
    show_top_words=25,
)

Missed words: 1239. Total words searched: 9000


LDA Topic #0 word probability distribution 

[('customer', 0.035272606),
 ('business', 0.021751547),
 ('new', 0.020133903),
 ('sale', 0.014756303),
 ('ability', 0.014664193),
 ('level', 0.013416621),
 ('mop', 0.011995135),
 ('manager', 0.011379488),
 ('custodial', 0.010694284),
 ('branch', 0.01012451),
 ('product', 0.009958617),
 ('save', 0.00943404),
 ('industry', 0.009375976),
 ('exist', 0.009347329),
 ('grade', 0.009292667),
 ('referral', 0.00799078),
 ('smile', 0.007824443),
 ('ensure', 0.007772093),
 ('knowledge', 0.0077156965),
 ('follow', 0.0076437294),
 ('without_reasonable_accommodation', 0.0075819716),
 ('though', 0.007306947),
 ('present', 0.0071192477),
 ('department', 0.0070806458),
 ('company', 0.006879281)]

Topic #0 Nearest word vectors 

[('customer', 0.5932914018630981),
 ('business', 0.5720488429069519),
 ('service', 0.49301502108573914),
 ('customers', 0.4751899242401123),
 ('services', 0.4721561670303345),
 ('product

[('do', 0.5258733630180359),
 ('time', 0.486773818731308),
 ('want', 0.4791724681854248),
 ('enjoy', 0.4715310037136078),
 ('just', 0.4669871926307678),
 ('for', 0.4656944274902344),
 ('need', 0.46426522731781006),
 ('get', 0.462766170501709),
 ('not', 0.45888635516166687),
 ('work', 0.45645421743392944),
 ('all', 0.45420926809310913),
 ('always', 0.449642539024353),
 ('really', 0.44460082054138184),
 ('individual', 0.44231516122817993),
 ('vice_versa', 0.44032806158065796),
 ('full', 0.43796294927597046),
 ('that', 0.4362155497074127),
 ('so', 0.43506836891174316),
 ('For_Restrictions', 0.43439698219299316),
 ('now', 0.43404650688171387),
 ('kind', 0.43030309677124023),
 ('know', 0.42914044857025146),
 ('they', 0.4289049506187439),
 ('them', 0.42788761854171753),
 ('good', 0.42719539999961853)]


**************************************************


LDA Topic #24 word probability distribution 

[('employee', 0.061832186),
 ('benefit', 0.03651849),
 ('program', 0.02657301),
 ('hr', 0.02

 ('nonjob_googleplus_break_case', 0.0151635865),
 ('client_link_homepage', 0.015160896),
 ('break_case_linkedin', 0.015155869),
 ('attr_href_var', 0.015148218),
 ('break_case_http', 0.015138054),
 ('facebook_test_href', 0.015124846),
 ('span_trackingjobbody_function', 0.011701889)]

Topic #41 Nearest word vectors 

[('data', 0.4951430857181549),
 ('track', 0.49284300208091736),
 ('For_Restrictions', 0.473052442073822),
 ('database', 0.46870166063308716),
 ('services', 0.45683348178863525),
 ('programs', 0.44040578603744507),
 ('information', 0.4388931095600128),
 ('behavioral', 0.4377724528312683),
 ('databases', 0.43755456805229187),
 ('need', 0.43653011322021484),
 ('employment', 0.4308532476425171),
 ('help', 0.43003028631210327),
 ('manage', 0.42149293422698975),
 ('business', 0.41924798488616943),
 ('do', 0.41841310262680054),
 ('network', 0.4179403781890869),
 ('program', 0.4154964089393616),
 ('work', 0.41546010971069336),
 ('service', 0.410115122795105),
 ('that', 0.40840399265

 ('equipment', 0.014060352),
 ('able', 0.013934739),
 ('pass_physical_exam', 0.013628537),
 ('assignment_may_depend', 0.013427274),
 ('see_nationalguard_com', 0.013424745),
 ('qualifications_actual_mos', 0.01341005),
 ('candidates_earn_per', 0.013340294),
 ('mos_availability_benefits', 0.013337529),
 ('servicelow_cost_life', 0.0133270705),
 ('savings_planstudent_loan_repayment', 0.01331821),
 ('standardsmust_meet_citizenship', 0.013311985),
 ('school_diploma_gedmust_ages', 0.01329195),
 ('meet_legal_moral', 0.013257946),
 ('earning_regular_paycheck_qualifying', 0.013229477),
 ('program_existing_loans', 0.013190593)]

Topic #59 Nearest word vectors 

[('training', 0.6831969618797302),
 ('required', 0.5155486464500427),
 ('need', 0.492228627204895),
 ('work', 0.4904162287712097),
 ('benefit', 0.46891337633132935),
 ('trainings', 0.46844756603240967),
 ('retraining', 0.4627118706703186),
 ('needs', 0.46210741996765137),
 ('needed', 0.4596165716648102),
 ('learning', 0.45463675260543823),


 ('brand_ambassador', 0.02621447),
 ('missouri', 0.021533709),
 ('casual_dining', 0.018089997),
 ('visa', 0.016229),
 ('small_medium', 0.014724146),
 ('fine_dining', 0.01417891),
 ('immigration', 0.012596701),
 ('polished', 0.011571904),
 ('sure', 0.011138321),
 ('excitement', 0.010995129),
 ('management', 0.009614191),
 ('storage', 0.00957932),
 ('bi_lingual', 0.009314081),
 ('scott', 0.008208757),
 ('reaction', 0.008150455),
 ('serious', 0.008090585),
 ('upbeat', 0.007906355),
 ('constant', 0.0077892556),
 ('genuine', 0.0074098334),
 ('ability', 0.0068898415),
 ('grievance', 0.006722029),
 ('sw', 0.0066339336),
 ('customer', 0.006386027),
 ('dispatcher', 0.0061132554)]

Topic #77 Nearest word vectors 

[('For_Restrictions', 0.4887862801551819),
 ('business', 0.44200247526168823),
 ('kind', 0.4316900670528412),
 ('vice_versa', 0.426033079624176),
 ('dynamic', 0.4230712652206421),
 ('experience', 0.4175475239753723),
 ('genuine', 0.4166359007358551),
 ('really', 0.41470566391944885),
 

## LDA (90 topics, 15 words per topic) <> Google News

In [58]:
# 90-topic LDA, top 15 words per topic as the weighted query,
# Google News vectors; report the 25 nearest words for each topic.
goog_90_15 = LDA_to_words(
    LDA_90,
    goog50k,
    words_per_topic=15,
    show_top_words=25,
)

Missed words: 133. Total words searched: 1350


LDA Topic #0 word probability distribution 

[('customer', 0.035272606),
 ('business', 0.021751547),
 ('new', 0.020133903),
 ('sale', 0.014756303),
 ('ability', 0.014664193),
 ('level', 0.013416621),
 ('mop', 0.011995135),
 ('manager', 0.011379488),
 ('custodial', 0.010694284),
 ('branch', 0.01012451),
 ('product', 0.009958617),
 ('save', 0.00943404),
 ('industry', 0.009375976),
 ('exist', 0.009347329),
 ('grade', 0.009292667),
 ('referral', 0.00799078),
 ('smile', 0.007824443),
 ('ensure', 0.007772093),
 ('knowledge', 0.0077156965),
 ('follow', 0.0076437294),
 ('without_reasonable_accommodation', 0.0075819716),
 ('though', 0.007306947),
 ('present', 0.0071192477),
 ('department', 0.0070806458),
 ('company', 0.006879281)]

Topic #0 Nearest word vectors 

[('customer', 0.7500631809234619),
 ('business', 0.6532907485961914),
 ('customers', 0.6078958511352539),
 ('product', 0.5377312898635864),
 ('Customer', 0.5319662094116211),
 ('company',

 ('contractor', 0.662155032157898),
 ('projects', 0.6387174129486084),
 ('subcontractor', 0.5952187180519104),
 ('renovation', 0.5930166244506836),
 ('contractors', 0.5737736225128174),
 ('subcontractors', 0.5735054016113281),
 ('excavation', 0.5465205907821655),
 ('constructing', 0.5353783369064331),
 ('feasibility_study', 0.5314325094223022),
 ('renovations', 0.5194272398948669),
 ('building', 0.5168908834457397),
 ('remodeling', 0.5144917964935303),
 ('nearing_completion', 0.50848388671875),
 ('construct', 0.5084463357925415),
 ('installation', 0.5008558630943298),
 ('repaving', 0.4968126714229584),
 ('engineering', 0.4961633086204529),
 ('design', 0.4954822063446045),
 ('refurbishment', 0.4933806359767914),
 ('Contractor', 0.49303919076919556),
 ('development', 0.4923073649406433),
 ('maintenance', 0.49217522144317627)]


**************************************************


LDA Topic #23 word probability distribution 

[('time', 0.04120199),
 ('associate', 0.029576808),
 ('full', 0

 ('professional', 0.0047515314),
 ('policy', 0.00465993),
 ('duty', 0.0046251565)]

Topic #40 Nearest word vectors 

[('program', 0.6265138387680054),
 ('programs', 0.6186926364898682),
 ('outreach', 0.5367212891578674),
 ('services', 0.5121293067932129),
 ('expertise', 0.5099163055419922),
 ('management', 0.5078393220901489),
 ('community', 0.5023363828659058),
 ('educational', 0.49775493144989014),
 ('ability', 0.49545997381210327),
 ('knowledge', 0.4921566843986511),
 ('activities', 0.4886898994445801),
 ('support', 0.48738276958465576),
 ('development', 0.4830302894115448),
 ('resources', 0.47672533988952637),
 ('organization', 0.474355548620224),
 ('initiatives', 0.47364306449890137),
 ('efforts', 0.4697362184524536),
 ('mentoring', 0.46718135476112366),
 ('education', 0.4658092260360718),
 ('help', 0.4626426100730896),
 ('involvement', 0.4606112837791443),
 ('leadership', 0.4590502977371216),
 ('capability', 0.45842674374580383),
 ('communication', 0.452644020318985),
 ('experien

 ('time', 0.009367534),
 ('apply', 0.009191451),
 ('instructor', 0.008883664),
 ('state', 0.008824025),
 ('customer', 0.008618692),
 ('professional', 0.0086028855),
 ('university', 0.008449189),
 ('general', 0.008340292),
 ('associate', 0.008281076),
 ('degree', 0.00826939),
 ('course', 0.008225849),
 ('duty', 0.0079329)]

Topic #58 Nearest word vectors 

[('student', 0.7781370878219604),
 ('school', 0.7320085763931274),
 ('students', 0.700234055519104),
 ('college', 0.6843160390853882),
 ('education', 0.6822032928466797),
 ('teacher', 0.6490963697433472),
 ('academic', 0.6327674388885498),
 ('university', 0.627380907535553),
 ('teaching', 0.620577335357666),
 ('undergraduate', 0.6133927702903748),
 ('classroom', 0.6081781983375549),
 ('graduate', 0.5932046175003052),
 ('elementary', 0.5920031070709229),
 ('teachers', 0.5797984004020691),
 ('curriculum', 0.5778907537460327),
 ('schools', 0.5724810361862183),
 ('coursework', 0.5718144178390503),
 ('semester', 0.5641592741012573),
 ('col

 ('product', 0.011984631),
 ('environment', 0.011090435),
 ('disability', 0.009450952),
 ('perform', 0.00933513),
 ('small', 0.008796202),
 ('requirement', 0.008577828),
 ('ability', 0.008504891),
 ('part', 0.008395665),
 ('wellness', 0.0082272645),
 ('large', 0.007386085),
 ('follow', 0.0073282034),
 ('value', 0.0071493913),
 ('regional', 0.0069509526),
 ('enterprise', 0.006645073),
 ('equipment', 0.0066361316),
 ('must', 0.0066209375),
 ('safety', 0.0064606993),
 ('manager', 0.0064243283),
 ('duty', 0.0062459265),
 ('appropriate', 0.006161379)]

Topic #76 Nearest word vectors 

[('program', 0.5393452048301697),
 ('product', 0.49983447790145874),
 ('programs', 0.49547824263572693),
 ('material', 0.4892370104789734),
 ('products', 0.4749208092689514),
 ('company', 0.4660184383392334),
 ('environment', 0.46120068430900574),
 ('business', 0.44417235255241394),
 ('technology', 0.44014638662338257),
 ('ability', 0.4376339316368103),
 ('component', 0.4368082284927368),
 ('services', 0.43168

## LDA (40 topics, 100 words per topic) <> Google News w2v

In [59]:
# 40-topic LDA, top 100 words per topic as the weighted query,
# Google News vectors; report the 25 nearest words for each topic.
goog_40_100 = LDA_to_words(
    LDA_40,
    goog50k,
    words_per_topic=100,
    show_top_words=25,
)

Missed words: 417. Total words searched: 4000


LDA Topic #0 word probability distribution 

[('customer', 0.041318946),
 ('ability', 0.020201558),
 ('company', 0.01742402),
 ('safety', 0.013874218),
 ('equipment', 0.01379462),
 ('perform', 0.013377259),
 ('product', 0.013211826),
 ('vehicle', 0.011846664),
 ('education', 0.011182732),
 ('duty', 0.01114483),
 ('high', 0.00994427),
 ('time', 0.009842404),
 ('order', 0.009658955),
 ('requirement', 0.009385821),
 ('need', 0.009314751),
 ('associate', 0.009056977),
 ('general', 0.00772105),
 ('training', 0.0075121247),
 ('production', 0.0074061863),
 ('maintain', 0.0073220823),
 ('month', 0.007314947),
 ('assist', 0.007207732),
 ('school_diploma', 0.007024344),
 ('one', 0.006514933),
 ('local', 0.0063718017)]

Topic #0 Nearest word vectors 

[('customer', 0.5765080451965332),
 ('service', 0.4922577142715454),
 ('need', 0.4822095036506653),
 ('business', 0.48045986890792847),
 ('required', 0.474545419216156),
 ('necessary', 0.46458578109741

 ('customer_satisfaction', 0.4770751893520355),
 ('necessary', 0.46625062823295593),
 ('provide', 0.4656580090522766),
 ('clients', 0.46381378173828125),
 ('help', 0.4545920491218567),
 ('requirements', 0.4540462791919708),
 ('operational_efficiency', 0.4511488080024719),
 ('needs', 0.45059090852737427),
 ('required', 0.4503343105316162),
 ('facilitate', 0.44739383459091187),
 ('product', 0.4466116428375244),
 ('Customer', 0.4446732699871063),
 ('maintain', 0.44402265548706055),
 ('client', 0.44314831495285034),
 ('onsite', 0.442190021276474),
 ('proactively', 0.4420904517173767),
 ('services', 0.4414784610271454),
 ('user', 0.44061994552612305),
 ('responsiveness', 0.4381534159183502)]


**************************************************


LDA Topic #22 word probability distribution 

[('system', 0.028908757),
 ('test', 0.020633584),
 ('engineering', 0.018416764),
 ('design', 0.014038266),
 ('engineer', 0.013283822),
 ('product', 0.008862588),
 ('security', 0.008832565),
 ('must', 0.0


Topic #39 Nearest word vectors 

[('store', 0.6988464593887329),
 ('manager', 0.6330937147140503),
 ('shop', 0.5288415551185608),
 ('customer', 0.5237778425216675),
 ('Store', 0.5145851969718933),
 ('warehouse', 0.5109283328056335),
 ('stores', 0.5101717710494995),
 ('supervisor', 0.4972480237483978),
 ('department', 0.495333731174469),
 ('employee', 0.4932955801486969),
 ('cashier', 0.4930356740951538),
 ('salesperson', 0.49075576663017273),
 ('retail', 0.48519831895828247),
 ('clerk', 0.47798824310302734),
 ('assistant', 0.47723081707954407),
 ('dealership', 0.4725117087364197),
 ('merchandising', 0.46731507778167725),
 ('shopper', 0.46515923738479614),
 ('mall', 0.4649219214916229),
 ('manger', 0.46105992794036865),
 ('customers', 0.459277480840683),
 ('business', 0.45588418841362),
 ('merchandise', 0.45303961634635925),
 ('retailer', 0.4516163766384125),
 ('vice_president', 0.4445596933364868)]


**************************************************


type similar_to_LDA_topics: <cla

- Lots of '%_#F##*' or 'BY_*_*' tokens show up among the nearest words; they go away when we decrease words_per_topic (the number of weighted word vectors used to search w2v)
- Potentially these '%_#F##*' or 'BY_*_*' tokens are long vectors that end up 'close' to our topic vectors when we add many topic words together

## LDA (40 topics, 15 words per topic) <> Google News w2v

In [None]:
# 40-topic LDA, top 15 words per topic as the weighted query,
# Google News vectors; report the 20 nearest words for each topic.
# a lot fewer of the '&_F##' words with 15 words per topic versus 50 or 100...
goog_40_15 = LDA_to_words(
    LDA_40,
    goog50k,
    words_per_topic=15,
    show_top_words=20,
)

## LDA (90 topics, 100 words per topic) <> FB FastText vectors

In [10]:
# 90-topic LDA, top 100 words per topic as the weighted query,
# FastText vectors; report the 25 nearest words for each topic.
ft_90_100 = LDA_to_words(
    LDA_90,
    FT,
    words_per_topic=100,
    show_top_words=25,
)

Missed words: 1002. Total words searched: 9000


LDA Topic #0 word probability distribution 

[('customer', 0.035272606),
 ('business', 0.021751547),
 ('new', 0.020133903),
 ('sale', 0.014756303),
 ('ability', 0.014664193),
 ('level', 0.013416621),
 ('mop', 0.011995135),
 ('manager', 0.011379488),
 ('custodial', 0.010694284),
 ('branch', 0.01012451),
 ('product', 0.009958617),
 ('save', 0.00943404),
 ('industry', 0.009375976),
 ('exist', 0.009347329),
 ('grade', 0.009292667),
 ('referral', 0.00799078),
 ('smile', 0.007824443),
 ('ensure', 0.007772093),
 ('knowledge', 0.0077156965),
 ('follow', 0.0076437294),
 ('without_reasonable_accommodation', 0.0075819716),
 ('though', 0.007306947),
 ('present', 0.0071192477),
 ('department', 0.0070806458),
 ('company', 0.006879281)]

Topic #0 Nearest word vectors 

[('commnunity', 0.783383846282959),
 ('deline', 0.7768153548240662),
 ('positiion', 0.7755110859870911),
 ('make-good', 0.7753884792327881),
 ('permant', 0.7733047008514404),
 ('non-line

 ('commercial', 0.012396035),
 ('manager', 0.0115667395),
 ('schedule', 0.010173596),
 ('site', 0.009113058),
 ('subcontractor', 0.008617329),
 ('company', 0.008528492),
 ('management', 0.008135961),
 ('superintendent', 0.00806793),
 ('field', 0.0072071888),
 ('contract', 0.0071467054),
 ('design', 0.0070092026),
 ('report', 0.007004853),
 ('cost', 0.006902913),
 ('text_null_caption_null', 0.00663927),
 ('building', 0.006302674),
 ('manage', 0.006232562),
 ('general', 0.0059919115),
 ('client', 0.005767047),
 ('please', 0.0057143155),
 ('bid', 0.0055330163),
 ('knowledge', 0.005425793)]

Topic #22 Nearest word vectors 

[('project', 0.787899374961853),
 ('plannning', 0.7800086736679077),
 ('work', 0.7771751284599304),
 ('commnunity', 0.7734687328338623),
 ('design', 0.7709337472915649),
 ('proect', 0.7600258588790894),
 ('deline', 0.7586471438407898),
 ('development', 0.7554073929786682),
 ('pland', 0.7456986904144287),
 ('make-good', 0.7445080876350403),
 ('construction', 0.7429660558

[('co', 0.05300691),
 ('bakery', 0.03981269),
 ('san_francisco', 0.03583798),
 ('ajilon_professional', 0.03507498),
 ('district', 0.032654036),
 ('semi', 0.025754001),
 ('garnishment', 0.0194411),
 ('br', 0.019007303),
 ('attach', 0.018424112),
 ('california', 0.0149387885),
 ('drug_alcohol', 0.014586807),
 ('auto_req_id', 0.013223959),
 ('loading_unloading', 0.012637739),
 ('dallas_fort_worth', 0.012383269),
 ('angular_js', 0.011933311),
 ('arizona', 0.011745963),
 ('mt', 0.011458677),
 ('website_www_ajilon', 0.011292911),
 ('nashville', 0.011056415),
 ('san_francisco_ca', 0.010788448),
 ('free_parking', 0.010367809),
 ('golf_course', 0.009997033),
 ('beach', 0.009364332),
 ('az', 0.009298317),
 ('plano', 0.009184952)]

Topic #31 Nearest word vectors 

[('co', 0.7442973256111145),
 ('fass', 0.6884491443634033),
 ('camra', 0.687393844127655),
 ('corprate', 0.6779894828796387),
 ('colgate', 0.6767540574073792),
 ('cooool', 0.676693320274353),
 ('saft', 0.6715396046638489),
 ('papper', 0

[('automotive', 0.11351022),
 ('auto', 0.05031122),
 ('boston', 0.03169816),
 ('wage', 0.030576374),
 ('dealership', 0.027768457),
 ('car', 0.020549394),
 ('benefit', 0.017751563),
 ('quality', 0.016773023),
 ('package', 0.016344242),
 ('supplier', 0.016337428),
 ('yr', 0.010671775),
 ('tree', 0.010620361),
 ('tool', 0.010549603),
 ('hourly_rate', 0.010134093),
 ('please', 0.010080552),
 ('negotiable', 0.0093982145),
 ('group', 0.00911271),
 ('net_developer', 0.008829433),
 ('michigan', 0.008657296),
 ('competitive', 0.008517268),
 ('vote', 0.00823905),
 ('look', 0.008071774),
 ('ppm', 0.0073391586),
 ('apply', 0.0071362006),
 ('offer', 0.00684912)]

Topic #44 Nearest word vectors 

[('automotive', 0.7500581741333008),
 ('permant', 0.749443531036377),
 ('auto', 0.7484334707260132),
 ('auto-mechanic', 0.7451927065849304),
 ('automative', 0.7412312030792236),
 ('supermaket', 0.7324087023735046),
 ('corprate', 0.7315906286239624),
 ('cark', 0.7314484715461731),
 ('indutry', 0.731029391288

 ('support', 0.7456591725349426),
 ('offfice', 0.7431962490081787),
 ('placment', 0.7422283887863159),
 ('positition', 0.7421045303344727),
 ('expeience', 0.7412176728248596),
 ('funtioning', 0.7392860651016235),
 ('requestable', 0.7390142679214478),
 ('type-ahead', 0.738479733467102),
 ('actvity', 0.7371766567230225),
 ('plannning', 0.7362276315689087)]


**************************************************


LDA Topic #58 word probability distribution 

[('student', 0.04741425),
 ('education', 0.025878197),
 ('ability', 0.018002672),
 ('teach', 0.014887655),
 ('company', 0.014147384),
 ('school', 0.01398441),
 ('career', 0.011844073),
 ('high', 0.011579666),
 ('one', 0.011017833),
 ('residential', 0.010601678),
 ('training', 0.010550981),
 ('candidate', 0.010441361),
 ('college', 0.010121955),
 ('time', 0.009367534),
 ('apply', 0.009191451),
 ('instructor', 0.008883664),
 ('state', 0.008824025),
 ('customer', 0.008618692),
 ('professional', 0.0086028855),
 ('university', 0.008449189),


[('ability', 0.01563777),
 ('business', 0.015293473),
 ('develop', 0.013345673),
 ('client', 0.012690125),
 ('product', 0.011784904),
 ('market', 0.010936474),
 ('customer', 0.010643297),
 ('within', 0.01012435),
 ('relationship', 0.009949414),
 ('development', 0.009930247),
 ('strong', 0.0097287875),
 ('goal', 0.009544416),
 ('management', 0.008744255),
 ('drive', 0.008598729),
 ('demonstrate', 0.007953634),
 ('partner', 0.0075519443),
 ('manage', 0.007285483),
 ('leadership', 0.0072067464),
 ('objective', 0.0068140985),
 ('key', 0.0067616967),
 ('strategy', 0.0066917683),
 ('result', 0.0063685942),
 ('plan', 0.0063519175),
 ('achieve', 0.0056539457),
 ('lead', 0.005613562)]

Topic #72 Nearest word vectors 

[('commnunity', 0.7786359190940857),
 ('positiion', 0.7755331993103027),
 ('expeience', 0.7733460068702698),
 ('proect', 0.769914984703064),
 ('strategy-making', 0.769585371017456),
 ('deline', 0.768868088722229),
 ('actvity', 0.7686043977737427),
 ('work-and', 0.7663387060165405)

 ('make-good', 0.7658796906471252),
 ('strategy-making', 0.7622929811477661),
 ('product', 0.7619858980178833),
 ('promtion', 0.7559325098991394),
 ('commnunity', 0.7549210786819458),
 ('development', 0.7535513639450073),
 ('management', 0.7520729899406433),
 ('actvity', 0.7510919570922852),
 ('positiion', 0.7509681582450867),
 ('one-product', 0.7470943927764893),
 ('francise', 0.744343638420105),
 ('proect', 0.7443389296531677),
 ('deline', 0.7438275814056396),
 ('market', 0.7433369159698486),
 ('plannning', 0.7414018511772156),
 ('office-wide', 0.7409025430679321),
 ('commmercial', 0.739406168460846),
 ('groupwide', 0.7381013631820679),
 ('service', 0.7380921840667725),
 ('back-out', 0.7379436492919922),
 ('re-marketing', 0.7362821102142334),
 ('buiness', 0.7347974181175232),
 ('expeience', 0.7334964275360107)]


**************************************************


LDA Topic #85 word probability distribution 

[('office', 0.017429058),
 ('must', 0.015727049),
 ('voice', 0.013619319),

## LDA (90 topics, 15 words per topic) <> FB FastText vectors

In [11]:
# 90-topic LDA, top 15 words per topic as the weighted query,
# FastText vectors; report the 25 nearest words for each topic.
ft_90_15 = LDA_to_words(
    LDA_90,
    FT,
    words_per_topic=15,
    show_top_words=25,
)

Missed words: 99. Total words searched: 1350


LDA Topic #0 word probability distribution 

[('customer', 0.035272606),
 ('business', 0.021751547),
 ('new', 0.020133903),
 ('sale', 0.014756303),
 ('ability', 0.014664193),
 ('level', 0.013416621),
 ('mop', 0.011995135),
 ('manager', 0.011379488),
 ('custodial', 0.010694284),
 ('branch', 0.01012451),
 ('product', 0.009958617),
 ('save', 0.00943404),
 ('industry', 0.009375976),
 ('exist', 0.009347329),
 ('grade', 0.009292667),
 ('referral', 0.00799078),
 ('smile', 0.007824443),
 ('ensure', 0.007772093),
 ('knowledge', 0.0077156965),
 ('follow', 0.0076437294),
 ('without_reasonable_accommodation', 0.0075819716),
 ('though', 0.007306947),
 ('present', 0.0071192477),
 ('department', 0.0070806458),
 ('company', 0.006879281)]

Topic #0 Nearest word vectors 

[('business', 0.7667962312698364),
 ('new', 0.7580097317695618),
 ('make-good', 0.7481240034103394),
 ('one-product', 0.7460594177246094),
 ('office-wide', 0.7364851832389832),
 ('customer

 ('product', 0.008085122),
 ('females_protected_veterans', 0.00789547),
 ('potential', 0.0076261074),
 ('employer', 0.0073023224),
 ('every', 0.0071111224),
 ('http_www_eeoc_gov', 0.007066582),
 ('individuals_disabilitiesto_learn', 0.0070578014),
 ('copy_paste_url_browser', 0.007039268),
 ('associate', 0.007013645)]

Topic #14 Nearest word vectors 

[('make-good', 0.7539668083190918),
 ('in-and', 0.7443908452987671),
 ('permant', 0.737037181854248),
 ('work-and', 0.7345050573348999),
 ('business', 0.7337453961372375),
 ('deline', 0.7289800643920898),
 ('totake', 0.728358268737793),
 ('time', 0.7272432446479797),
 ('expeience', 0.7265710830688477),
 ('readvertise', 0.7255897521972656),
 ('positiion', 0.7250790596008301),
 ('employeer', 0.7247448563575745),
 ('immmediate', 0.7236059308052063),
 ('non-one', 0.7232809066772461),
 ('commnunity', 0.7208001613616943),
 ('indutry', 0.7207871079444885),
 ('francise', 0.7201409935951233),
 ('actvity', 0.7190602421760559),
 ('finantial', 0.718760

 ('safety', 0.009140609),
 ('duty', 0.008972478),
 ('environment', 0.008780279),
 ('essential', 0.008389968),
 ('function', 0.0074204314),
 ('may', 0.007219085),
 ('physical', 0.007079761),
 ('maintain', 0.0070519047),
 ('shift', 0.006863262),
 ('high', 0.006696066),
 ('requirement', 0.0066587906),
 ('time', 0.0065530166),
 ('follow', 0.0061604655),
 ('assign', 0.006052309),
 ('employment', 0.0059696273),
 ('standard', 0.005859737),
 ('ensure', 0.0056168046),
 ('location', 0.0055864146),
 ('guest', 0.005162906)]

Topic #21 Nearest word vectors 

[('must', 0.8587607741355896),
 ('able', 0.7992814779281616),
 ('should', 0.791999101638794),
 ('will', 0.7862410545349121),
 ('cannot', 0.7831496596336365),
 ('can', 0.772203803062439),
 ('could', 0.7716319561004639),
 ('reqire', 0.7522665858268738),
 ('coould', 0.7474284768104553),
 ('shoiuld', 0.747173547744751),
 ('chooose', 0.744572639465332),
 ('couuld', 0.7442643642425537),
 ('continue', 0.7412148714065552),
 ('need', 0.7395091652870178)

 ('nutrition', 0.03070946),
 ('il', 0.02346528),
 ('opening', 0.022230493),
 ('banquet', 0.021849988),
 ('please', 0.021739727),
 ('company', 0.01884334),
 ('va', 0.01801563),
 ('illinois', 0.017386567),
 ('america', 0.015713612),
 ('visit_www', 0.015610826),
 ('wi', 0.014963294),
 ('people', 0.0148385335),
 ('apply', 0.014628219),
 ('metro', 0.013199034),
 ('eeo_aa', 0.012832093),
 ('cafeteria', 0.012657878),
 ('benefit', 0.012562985),
 ('dc', 0.01206545),
 ('com', 0.010681361),
 ('spring', 0.010419007),
 ('family_owned', 0.010083771),
 ('time', 0.01008049),
 ('downtown', 0.009899854),
 ('virginia', 0.00973218)]

Topic #30 Nearest word vectors 

[('il', 0.6941758394241333),
 ('marte', 0.6700409054756165),
 ('doob', 0.65095454454422),
 ('cooool', 0.648993730545044),
 ('veure', 0.6479949951171875),
 ('jine', 0.6479512453079224),
 ('yeer', 0.6425232887268066),
 ('hice', 0.6415275931358337),
 ('pasque', 0.6412324905395508),
 ('reso', 0.6411623954772949),
 ('comúnmente', 0.6408858299255371



**************************************************


LDA Topic #39 word probability distribution 

[('system', 0.046497792),
 ('maintenance', 0.017768875),
 ('operation', 0.017618477),
 ('maintains', 0.014883201),
 ('pump', 0.013148279),
 ('email', 0.0115698455),
 ('operating', 0.010365365),
 ('procedure', 0.010204202),
 ('control', 0.010096071),
 ('may', 0.009930205),
 ('call', 0.00830394),
 ('maintain', 0.0083004385),
 ('installation', 0.008065926),
 ('perform', 0.007881142),
 ('technician', 0.007533426),
 ('monitoring', 0.0073548774),
 ('calibration', 0.0070932237),
 ('bookkeeper', 0.0070568477),
 ('installs', 0.0069129243),
 ('machine', 0.0066063837),
 ('center', 0.006475501),
 ('requirement', 0.0061514084),
 ('nd', 0.005720024),
 ('quality', 0.005672973),
 ('standard', 0.0054759686)]

Topic #39 Nearest word vectors 

[('system', 0.8327215909957886),
 ('operation', 0.7572136521339417),
 ('operater', 0.7538596391677856),
 ('sysytem', 0.7468661069869995),
 ('set-down', 0.7420762777

[('accounting', 0.032550287),
 ('company', 0.014574212),
 ('please', 0.014363706),
 ('financial', 0.01388907),
 ('look', 0.012425791),
 ('finance', 0.012252638),
 ('candidate', 0.0112304455),
 ('strong', 0.009758556),
 ('client', 0.009738694),
 ('staff', 0.009367794),
 ('apply', 0.009353294),
 ('interested', 0.009152114),
 ('hire', 0.0080577815),
 ('invoice', 0.0073960167),
 ('reporting', 0.0072507747),
 ('seek', 0.006829018),
 ('manager', 0.006740301),
 ('office', 0.0065536457),
 ('senior', 0.006505082),
 ('role', 0.0061172717),
 ('great', 0.006075686),
 ('prefer', 0.0060159448),
 ('report', 0.0056326673),
 ('excel', 0.00555249),
 ('management', 0.0055369725)]

Topic #49 Nearest word vectors 

[('business', 0.7594490647315979),
 ('work-and', 0.7473201751708984),
 ('exect', 0.7472846508026123),
 ('businessess', 0.7467864751815796),
 ('administraton', 0.7441543340682983),
 ('plannning', 0.7402976155281067),
 ('employeer', 0.7401830554008484),
 ('expeience', 0.7398912906646729),
 ('recru

 ('school--and', 0.753490149974823),
 ('university', 0.7533791065216064),
 ('colledge', 0.7512812614440918),
 ('teaching', 0.749860942363739),
 ('pre-education', 0.7490266561508179),
 ('school-life', 0.7430557012557983),
 ('teachability', 0.7428082227706909),
 ('commnunity', 0.7421320676803589),
 ('school-time', 0.741590142250061),
 ('schoole', 0.7415523529052734),
 ('studen', 0.7410544157028198),
 ('school-system', 0.7409297227859497),
 ('university-bound', 0.7405821681022644),
 ('non-education', 0.7398408055305481),
 ('teacher', 0.7393572926521301)]


**************************************************


LDA Topic #59 word probability distribution 

[('training', 0.071453944),
 ('benefit', 0.04226372),
 ('change', 0.024414929),
 ('must', 0.02406961),
 ('part', 0.019611523),
 ('learn', 0.017027574),
 ('time', 0.01667749),
 ('construction', 0.015206998),
 ('army_national_guard', 0.014495458),
 ('applicant', 0.014080542),
 ('equipment', 0.014060352),
 ('able', 0.013934739),
 ('pass_physi

[('retail', 0.039739564),
 ('sale', 0.026833627),
 ('time', 0.026471283),
 ('customer', 0.025740122),
 ('great', 0.015115158),
 ('full', 0.013212751),
 ('paid', 0.012329423),
 ('people', 0.011991193),
 ('benefit', 0.011616983),
 ('first', 0.01128226),
 ('help', 0.010690888),
 ('one', 0.010271586),
 ('environment', 0.009647748),
 ('product', 0.009242417),
 ('grow', 0.007895163),
 ('like', 0.007148878),
 ('love', 0.006939681),
 ('career', 0.0068122107),
 ('company', 0.006794441),
 ('hour', 0.0066684596),
 ('employee', 0.0064652017),
 ('base', 0.0064110314),
 ('associate', 0.0063689714),
 ('apply', 0.006354819),
 ('need', 0.0063231112)]

Topic #69 Nearest word vectors 

[('make-good', 0.7578557729721069),
 ('retail', 0.7558690309524536),
 ('non-sale', 0.7531669735908508),
 ('sale', 0.7424435615539551),
 ('in-and', 0.7393717765808105),
 ('deline', 0.7351685762405396),
 ('business', 0.7317682504653931),
 ('francise', 0.7311841249465942),
 ('permant', 0.7301006317138672),
 ('mark-down', 0.72

LDA Topic #78 word probability distribution 

[('network', 0.10250896),
 ('candidate', 0.015163516),
 ('knowledge', 0.013737498),
 ('technology', 0.013140197),
 ('networking', 0.011210389),
 ('cisco', 0.011082696),
 ('understand', 0.008613315),
 ('support', 0.0076574064),
 ('infrastructure', 0.0072660423),
 ('strong', 0.0069351783),
 ('commerce', 0.006912595),
 ('data', 0.006897094),
 ('environment', 0.006232647),
 ('engineer', 0.0058550765),
 ('staff', 0.00553134),
 ('technical', 0.005484701),
 ('information', 0.00539787),
 ('career', 0.0052298894),
 ('firewall', 0.005019562),
 ('need', 0.0048101945),
 ('security', 0.0045987186),
 ('write', 0.004531378),
 ('industry', 0.0044899643),
 ('server', 0.004475108),
 ('consult', 0.0044724206)]

Topic #78 Nearest word vectors 

[('network', 0.9404352903366089),
 ('networks', 0.8378973007202148),
 ('cross-network', 0.8032015562057495),
 ('multi-network', 0.776943564414978),
 ('intra-network', 0.7750160694122314),
 ('inter-network', 0.7739358544

 ('stay-home', 0.7407771348953247),
 ('work-and', 0.7381321787834167),
 ('home', 0.7368998527526855),
 ('seak', 0.7361161708831787),
 ('operater', 0.7360011339187622),
 ('recuit', 0.7356059551239014),
 ('permant', 0.7345653772354126),
 ('company', 0.7340292930603027),
 ('work', 0.7317330837249756),
 ('busniess', 0.7313679456710815),
 ('cark', 0.7305560111999512),
 ('reman', 0.7300621271133423),
 ('business', 0.7282195091247559),
 ('make-good', 0.7264459133148193),
 ('invester', 0.7259557247161865),
 ('chooose', 0.7252690196037292),
 ('buiness', 0.7250199317932129),
 ('studnet', 0.719667911529541),
 ('stay-over', 0.719493567943573),
 ('businessess', 0.7190676927566528),
 ('indutry', 0.718390941619873),
 ('pay', 0.7183316946029663)]


**************************************************


LDA Topic #84 word probability distribution 

[('marketing', 0.100095004),
 ('level', 0.03978442),
 ('entry', 0.035788413),
 ('client', 0.025030049),
 ('management', 0.024467332),
 ('sale', 0.024282753),


[('caregiver', 0.03231671),
 ('employment', 0.028945232),
 ('id', 0.023849234),
 ('data', 0.022896545),
 ('change', 0.019033223),
 ('telecom', 0.0118556535),
 ('loan', 0.011157508),
 ('information', 0.01115452),
 ('contingent_upon', 0.0111094685),
 ('marital_status', 0.010802966),
 ('veteran_status', 0.010545019),
 ('manage', 0.010283021),
 ('employee', 0.010042579),
 ('qualification', 0.010015838),
 ('centurylink', 0.009745047),
 ('release', 0.009452805),
 ('discrimination', 0.0094276285),
 ('recreation', 0.009301245),
 ('company', 0.009171116),
 ('attentive', 0.009008472),
 ('gender_sexual_orientation', 0.008354438),
 ('religion', 0.008039883),
 ('designed_indicate_general', 0.007727945),
 ('duty', 0.007644705),
 ('network', 0.007621795)]

Topic #89 Nearest word vectors 

[('id', 0.7008687257766724),
 ('data', 0.6903637647628784),
 ('info', 0.6444708108901978),
 ('informnation', 0.6444271802902222),
 ('permant', 0.6444255709648132),
 ('help-line', 0.6439581513404846),
 ('stype', 0.64

## LDA (40 topics, 100 words per topic) <> FB FastText vectors

In [12]:
# 40-topic LDA model against the FT vectors, using the top 100 words of each topic.
# Words that raise KeyError in the vector lookup are counted in the "Missed words" total printed below.
# NOTE(review): show_top_words=25 presumably sets how many entries are printed per topic - confirm in LDA_to_words.
ft_40_100 = LDA_to_words(LDA_40, FT, words_per_topic=100, show_top_words=25)

Missed words: 363. Total words searched: 4000


LDA Topic #0 word probability distribution 

[('customer', 0.041318946),
 ('ability', 0.020201558),
 ('company', 0.01742402),
 ('safety', 0.013874218),
 ('equipment', 0.01379462),
 ('perform', 0.013377259),
 ('product', 0.013211826),
 ('vehicle', 0.011846664),
 ('education', 0.011182732),
 ('duty', 0.01114483),
 ('high', 0.00994427),
 ('time', 0.009842404),
 ('order', 0.009658955),
 ('requirement', 0.009385821),
 ('need', 0.009314751),
 ('associate', 0.009056977),
 ('general', 0.00772105),
 ('training', 0.0075121247),
 ('production', 0.0074061863),
 ('maintain', 0.0073220823),
 ('month', 0.007314947),
 ('assist', 0.007207732),
 ('school_diploma', 0.007024344),
 ('one', 0.006514933),
 ('local', 0.0063718017)]

Topic #0 Nearest word vectors 

[('commnunity', 0.7835477590560913),
 ('actvity', 0.7755882143974304),
 ('permant', 0.7741615772247314),
 ('deline', 0.7711034417152405),
 ('expeience', 0.7692409753799438),
 ('balnce', 0.7690848708152


[('payroll', 0.05766105),
 ('test_href_attr_mns_rt', 0.029442254),
 ('track', 0.02004115),
 ('data', 0.019778391),
 ('com', 0.011782795),
 ('break', 0.011645904),
 ('twitter_com', 0.011639203),
 ('attr', 0.010123193),
 ('plus_google', 0.009921809),
 ('nonjob_twitter_break_case', 0.009899789),
 ('facebook_test_href', 0.00989926),
 ('switch_true_case', 0.0098793525),
 ('href_attr_href_tolowercase', 0.009874818),
 ('break_case_http', 0.009864829),
 ('test_href_attr', 0.009861531),
 ('break_default_attr_mns_rt', 0.009859969),
 ('nonjob_linkedin_break_case', 0.009857403),
 ('client_social_youtube', 0.009846953),
 ('attr_mns_rt_nonjob_facebook', 0.009845798),
 ('break_case_linkedin', 0.009842565),
 ('nonjob_googleplus_break_case', 0.009840138),
 ('client_link_homepage', 0.009833745),
 ('youtube_test_href', 0.009823554),
 ('href_match_http', 0.009806797),
 ('attr_href_var', 0.009793996)]

Topic #16 Nearest word vectors 

[('help-line', 0.7581029534339905),
 ('permant', 0.7564342021942139),
 

 ('maintenance', 0.037775204),
 ('repair', 0.03584534),
 ('technician', 0.029900953),
 ('electrical', 0.022689624),
 ('must', 0.017690724),
 ('system', 0.014980923),
 ('machine', 0.014598288),
 ('mechanical', 0.014564233),
 ('manufacturing', 0.013379435),
 ('control', 0.012771996),
 ('tool', 0.0122691775),
 ('inspection', 0.010386977),
 ('perform', 0.008998726),
 ('able', 0.008622726),
 ('part', 0.008372867),
 ('troubleshoot', 0.008185849),
 ('industrial', 0.008091881),
 ('production', 0.0074271313),
 ('automotive', 0.0068144375),
 ('requirement', 0.006788445),
 ('power', 0.0066492036),
 ('plant', 0.0066318684),
 ('component', 0.0064590857),
 ('field', 0.0064289025)]

Topic #29 Nearest word vectors 

[('reline', 0.7710464000701904),
 ('maintenace', 0.7643797397613525),
 ('workcell', 0.7642874121665955),
 ('equpment', 0.7636103630065918),
 ('set-down', 0.7588412761688232),
 ('commnunity', 0.7558234930038452),
 ('permant', 0.7532951831817627),
 ('non-line', 0.7520700693130493),
 ('funtio

 ('care-giver', 0.7545839548110962),
 ('care-home', 0.7517417669296265),
 ('care-based', 0.7503302097320557),
 ('healthcare', 0.7467352151870728),
 ('home-care', 0.7462895512580872),
 ('care-taking', 0.7457857728004456),
 ('extended-care', 0.7438664436340332),
 ('non-patient', 0.7426306009292603),
 ('careing', 0.7422932386398315),
 ('long-term-care', 0.7421374917030334),
 ('health-service', 0.7415825128555298),
 ('pet-care', 0.740976095199585),
 ('caree', 0.7395493984222412),
 ('medcine', 0.738248348236084),
 ('social-care', 0.7382416725158691),
 ('non-nurse', 0.7356089353561401)]


**************************************************


LDA Topic #38 word probability distribution 

[('digital', 0.01991505),
 ('content', 0.015217815),
 ('ups', 0.013824856),
 ('communication', 0.01298663),
 ('medium', 0.012174767),
 ('creative', 0.008197634),
 ('social_media', 0.007689858),
 ('design', 0.007603559),
 ('ability', 0.007191115),
 ('print', 0.0071291756),
 ('support', 0.006529402),
 ('graphic'

## LDA (40 topics, 15 words per topic) <> FB FastText vectors

In [13]:
# Same 40-topic model and FT vectors as the previous run, but only the top 15 words of each topic are used.
# NOTE(review): show_top_words=25 presumably sets how many entries are printed per topic - confirm in LDA_to_words.
ft_40_15 = LDA_to_words(LDA_40, FT, words_per_topic=15, show_top_words=25)

Missed words: 30. Total words searched: 600


LDA Topic #0 word probability distribution 

[('customer', 0.041318946),
 ('ability', 0.020201558),
 ('company', 0.01742402),
 ('safety', 0.013874218),
 ('equipment', 0.01379462),
 ('perform', 0.013377259),
 ('product', 0.013211826),
 ('vehicle', 0.011846664),
 ('education', 0.011182732),
 ('duty', 0.01114483),
 ('high', 0.00994427),
 ('time', 0.009842404),
 ('order', 0.009658955),
 ('requirement', 0.009385821),
 ('need', 0.009314751),
 ('associate', 0.009056977),
 ('general', 0.00772105),
 ('training', 0.0075121247),
 ('production', 0.0074061863),
 ('maintain', 0.0073220823),
 ('month', 0.007314947),
 ('assist', 0.007207732),
 ('school_diploma', 0.007024344),
 ('one', 0.006514933),
 ('local', 0.0063718017)]

Topic #0 Nearest word vectors 

[('service', 0.763076901435852),
 ('customer', 0.7495589256286621),
 ('product', 0.7419997453689575),
 ('actvity', 0.7386067509651184),
 ('business', 0.737226128578186),
 ('capcity', 0.7371953725814819),

 ('duty', 0.7287269830703735),
 ('must', 0.7271124124526978),
 ('balnce', 0.7264812588691711),
 ('recuit', 0.7257044315338135),
 ('invlove', 0.7252165079116821),
 ('proceeed', 0.7245989441871643),
 ('expeience', 0.7206244468688965),
 ('actvity', 0.7195906639099121),
 ('necessary', 0.7192147970199585),
 ('chooose', 0.7188577651977539),
 ('resonsibility', 0.7183729410171509),
 ('required', 0.7137881517410278),
 ('actualise', 0.713013231754303),
 ('normall', 0.7116724848747253),
 ('undertake', 0.7109880447387695),
 ('function', 0.7102881073951721),
 ('ability', 0.7099783420562744),
 ('expet', 0.7091288566589355),
 ('deline', 0.7082217931747437)]


**************************************************


LDA Topic #16 word probability distribution 

[('payroll', 0.05766105),
 ('test_href_attr_mns_rt', 0.029442254),
 ('track', 0.02004115),
 ('data', 0.019778391),
 ('com', 0.011782795),
 ('break', 0.011645904),
 ('twitter_com', 0.011639203),
 ('attr', 0.010123193),
 ('plus_google', 0.009921809),

 ('non-sale', 0.7967289686203003),
 ('business', 0.7817228436470032),
 ('make-good', 0.7465027570724487),
 ('product', 0.7453892827033997),
 ('post-sale', 0.7393875122070312),
 ('first-sale', 0.7375480532646179),
 ('no-sale', 0.7371048927307129),
 ('off-market', 0.7367243766784668),
 ('off-sale', 0.7349981069564819),
 ('mark-down', 0.7339128255844116),
 ('sales', 0.7331809401512146),
 ('re-sale', 0.7299843430519104),
 ('commmercial', 0.7283846139907837),
 ('home-sale', 0.7275345921516418),
 ('company', 0.7262798547744751),
 ('flash-sale', 0.7246001958847046),
 ('customer', 0.7244417667388916),
 ('market', 0.7241960167884827),
 ('one-product', 0.7235993146896362),
 ('stop-sale', 0.7229451537132263),
 ('new-business', 0.7201334238052368),
 ('new', 0.7192809581756592),
 ('pre-acquisition', 0.7169177532196045),
 ('market-stall', 0.7169013023376465)]


**************************************************


LDA Topic #25 word probability distribution 

[('ability', 0.018607134),
 ('quality', 0

 ('location', 0.005103387)]

Topic #33 Nearest word vectors 

[('non-one', 0.6753607988357544),
 ('deline', 0.6670256853103638),
 ('exspect', 0.6506680250167847),
 ('thru-out', 0.6470295190811157),
 ('commnunity', 0.6461374759674072),
 ('contiuous', 0.6453043222427368),
 ('expeience', 0.6419010758399963),
 ('right-now', 0.640749454498291),
 ('non-line', 0.6381058692932129),
 ('permant', 0.6357976794242859),
 ('make-good', 0.6349309682846069),
 ('use', 0.6346593499183655),
 ('work', 0.6337630748748779),
 ('good', 0.6336278319358826),
 ('giffgaff', 0.6331288814544678),
 ('on-brand', 0.632163405418396),
 ('poiint', 0.6315679550170898),
 ('rub-off', 0.6311696767807007),
 ('reqire', 0.630369246006012),
 ('fror', 0.6297169327735901),
 ('reach-out', 0.6296333074569702),
 ('expasion', 0.6294710636138916),
 ('time', 0.6292781829833984),
 ('actvity', 0.6292527914047241),
 ('seak', 0.6291422247886658)]


**************************************************


LDA Topic #34 word probability distribut

- Looks like 40 topics with fewer (15) words per topic works better...
- Put them all in a df to compare

In [61]:
import pandas as pd

def _top_topic_words(lda_model, topn):
    """Return one list of words per topic of ``lda_model``.

    Each inner list holds the first element (the word) of every entry that
    ``lda_model.show_topic(topic_id, topn=topn)`` returns, in the order returned.
    ``show_topic`` is called once per topic rather than once per word, and a
    topic that yields fewer than ``topn`` entries simply gives a shorter list
    instead of raising IndexError.
    """
    return [
        [entry[0] for entry in lda_model.show_topic(topic_id, topn=topn)]
        for topic_id in range(lda_model.num_topics)
    ]


# number of top words kept per topic in the comparison frames
topn = 25

### make df. column has all LDA topic words
lda_topic_words = _top_topic_words(LDA_40, topn)
df_40 = pd.DataFrame({'LDA_40_topics': lda_topic_words})

### make df. column has all LDA topic words
lda_topic_words = _top_topic_words(LDA_90, topn)
df_90 = pd.DataFrame({'LDA_90_topics': lda_topic_words})

In [62]:
def find_similar_words(model_topic_words):
    """Keep only the word from each entry of every topic's result list.

    Parameters
    ----------
    model_topic_words : sequence of sequences
        One inner sequence per topic. Each entry must be indexable with the
        word stored at position 0 (e.g. ``(word, similarity)`` pairs).

    Returns
    -------
    list of list
        One list per topic, containing element 0 of each entry in the
        original order.
    """
    return [[entry[0] for entry in topic_entries] for topic_entries in model_topic_words]

In [63]:
# Column labels and the matching result objects, paired by position.
# Naming appears to be <embedding>_<num topics>_<words per topic>
# (goog = Google News vectors, ft = fastText) -- confirm against the cells
# that create these variables.
forty_topic_names = ['goog_40_15', 'goog_40_100', 'ft_40_15', 'ft_40_100']
forty_topic_list = [goog_40_15, goog_40_100, ft_40_15, ft_40_100]

ninety_topic_names = ['goog_90_15', 'goog_90_100', 'ft_90_15', 'ft_90_100']
# Previously this line reassigned `ninety_topic_names`, which overwrote the
# labels above and left `ninety_topic_list` undefined.
ninety_topic_list = [goog_90_15, goog_90_100, ft_90_15, ft_90_100]

In [64]:
### Add one column of nearest-word lists per embedding run to each comparison df.
### Note: an earlier attempt assigned `[find_similar_words(j) for j in forty_topic_list]`
### to each column; that list has one entry per run (4), not one per topic row, hence
### "ValueError: Length of values does not match length of index".
for column_name, run_result in (
    ('goog_40_15', goog_40_15),
    ('goog_40_100', goog_40_100),
    ('ft_40_15', ft_40_15),
    ('ft_40_100', ft_40_100),
):
    df_40[column_name] = find_similar_words(run_result)

for column_name, run_result in (
    ('goog_90_15', goog_90_15),
    ('goog_90_100', goog_90_100),
    ('ft_90_15', ft_90_15),
    ('ft_90_100', ft_90_100),
):
    df_90[column_name] = find_similar_words(run_result)



# Now compare (subjective):
- 40 vs 90 topics
- 15 versus 100 words per topic
- google news vectors versus fasttext vectors

In [65]:
# Disable column-width truncation so the full word lists are visible.
# `None` is the supported "no limit" value; the old `-1` sentinel is
# deprecated and emits a FutureWarning on pandas >= 1.0.
pd.set_option('display.max_colwidth', None)
# Show up to 40 rows of the 40-topic comparison table.
df_40.head(40)

  """Entry point for launching an IPython kernel.


Unnamed: 0,LDA_40_topics,goog_40_15,goog_40_100,ft_40_15,ft_40_100
0,"[customer, ability, company, safety, equipment, perform, product, vehicle, education, duty, high, time, order, requirement, need, associate, general, training, production, maintain, month, assist, school_diploma, one, local]","[customer, customers, product, customer_satisfaction, service, operational_efficiency, business, company, ability, products, responsiveness, reliability, Customer, customer_loyalty, capability, user, value_proposition, regulatory_compliance, capabilities, operational_excellence]","[customer, service, need, business, required, necessary, customers, product, ability, responsiveness, requirements, maintenance, services, customer_satisfaction, needs, requirement, operational_efficiency, quality, capability, regulatory_compliance, operational, do, onsite, ensure, equipment]","[service, customer, product, actvity, business, capcity, expeience, effciency, commnunity, permant, non-line, capability, over-order, balnce, deline, readyness, work-and, prority, back-out, public-utility, customer-care, make-good, technicity, pre-position, supply-line]","[commnunity, actvity, permant, deline, expeience, balnce, positiion, non-line, make-good, proect, work-and, capcity, in-and, normall, thru-out, work, back-out, reguard, set-down, prority, right-now, indutry, placment, promary, reqire]"
1,"[support, system, network, technical, customer, issue, security, troubleshoot, software, knowledge, hardware, problem, technology, computer, environment, level, client, data, window, ability, resolve, microsoft, server, related, call]","[network, software, systems, troubleshoot, troubleshooting, hardware, infrastructure, technical, technology, system, support, capabilities, helpdesk, capability, functionality, solutions, infrastructures, computer, services, connectivity]","[systems, software, network, infrastructure, hardware, system, troubleshooting, functionality, capability, capabilities, technology, services, enterprise, interface, customer, application, configuration, connectivity, centralized, solutions, data, computer, service, user, server]","[technical-support, system, technlogy, commnunity, support, technicity, software, technology, infrastructure, network, cross-system, data-centre, help-line, computer-system, effciency, data-center, techology, hardware, non-system, implentation, non-line, equpment, develpment, desk-side, system-administration]","[commnunity, technical-support, non-line, help-line, support, funtionality, technlogy, desk-side, permant, back-out, technicity, funtioning, system, sup-port, proect, nework, expeience, deline, backs-up, equpment, informnation, type-ahead, re-key, balnce, appplication]"
2,"[training, golden_corral, benefit, engineering, variety, tree, construction, material, must, religion_creed_sex_sexual, visit_www_northropgrumman, guard, member, support, semiconductor, information, ait, building, minimum, paid, application_kept_file, buffet, appreciate_interest_golden_corral, learn, corral_careers_ref_ts]","[training, trainings, Training, instructors, retraining, trainees, mentoring, instruction, trained, conditioning, vocational_training, trainers, instructor, learning, courses, classroom_instruction, teaching, specialized, skills, retrain]","[training, trainings, Training, retraining, instructors, work, trainees, instruction, specialized, learning, mentoring, vocational_training, educational, engineering, onsite, maintenance, required, certification, courses, equipment, teaching, skills, teach, knowledge, trained]","[training, non-training, equiping, in-train, traine, training-related, cross-train, funtioning, set-down, commnunity, com-munity, development, traing, plannning, support, non-line, actvity, pre-job, traning, work, field-training, recuiting, management, expeience, equpment]","[commnunity, balnce, proect, deline, expeience, set-down, funtioning, traine, work, permant, actvity, non-line, plannning, work-and, expet, recuit, com-munity, training, thru-out, disgard, sup-port, reqire, make-good, indutry, equiping]"
3,"[project, construction, schedule, manager, site, management, engineering, estimate, commercial, design, cost, contractor, manage, company, field, budget, must, client, report, plan, building, material, coordinate, subcontractor, safety]","[project, construction, projects, Construction, renovation, feasibility_study, development, design, contractor, Projects, constructing, feasibility_studies, engineering, construct, nearing_completion, installation, redevelopment, roadwork, excavation, refurbishment]","[project, construction, projects, development, contractor, design, infrastructure, engineering, Construction, feasibility_study, maintenance, installation, building, renovation, subcontractors, construct, contractors, constructing, work, feasibility_studies, subcontractor, plan, build, nearing_completion, expansion]","[project, design, pre-project, development, projects, construction, plannning, work, mid-project, megaproject, post-project, pilot-project, program, miniproject, pre-design, predesign, site, project--and, preconstruction, operation, project-management, progect, maintenace, multi-project, planning]","[project, work, plannning, design, commnunity, proect, development, deline, pland, progect, workcell, maintenace, set-down, pre-project, actvity, back-out, plan, com-munity, dealine, work-and, expeience, office-wide, make-good, predesign, sup-port]"
4,"[quality, safety, process, product, manufacturing, company, program, production, ensure, management, requirement, manager, plant, control, environment, improvement, development, oldcastle, procedure, standard, assurance, employee, lead, operation, maintain]","[quality, product, safety, manufacturing, products, reliability, continuous_improvement, supply_chain, Quality, company, production, efficiency, processes, standards, traceability, operational_efficiency, productivity, suppliers, availability, operational_excellence]","[quality, continuous_improvement, processes, management, product, ensure, operational_efficiency, development, safety, efficiency, supply_chain, manufacturing, efficiencies, responsiveness, maintenance, standards, compliance, improve, regulatory_compliance, requirements, operational_excellence, facilities, utilization, reliability, business]","[quality, product, safety, effciency, management, production, development, non-quality, process, efficiency, one-product, performance, control, design, controllership, work-product, qualty, producibility, quanitity, first-quality, deline, balnce, data-quality, technicity, retainment]","[quality, deline, commnunity, balnce, development, actvity, effciency, management, expeience, work, product, strategy-making, technicity, set-down, proect, positiion, good-practice, post-use, permant, process, manage-ment, deveopment, tracability, non-quality, work-and]"
5,"[facility, employee, program, staff, policy, training, operation, personnel, procedure, security, report, duty, maintain, conduct, ensure, management, ability, safety, property, resident, ensures, assign, state, activity, site]","[personnel, staff, facility, facilities, employees, operation, procedures, employee, program, operations, training, policy, department, staffing, service, onsite, officers, programs, workers, conduct]","[personnel, supervise, staff, facilities, procedures, management, oversight, department, supervision, oversee, ensure, maintenance, employees, facility, services, staffing, onsite, monitoring, compliance, conduct, employee, monitor, supervisory, operations, program]","[personnel, facility, staff, program, service, management, supervison, operation, commnunity, com-munity, service-wide, office-wide, help-line, operatory, policy-holder, administraton, staff-only, mission-support, cross-Government, controllership, work-station, offfice, operations, facilities, restaffing]","[commnunity, com-munity, supervison, permant, office-wide, deline, administraton, actvity, work, policy-holder, expeience, management, help-line, plannning, funtioning, work-and, back-out, balnce, offfice, positiion, non-line, proect, set-down, reguard, sup-port]"
6,"[test, system, requirement, design, technical, software, plan, engineering, support, automation, development, develop, knowledge, document, project, qa, integration, application, quality, analysis, documentation, network, automate, engineer, data]","[test, testing, tests, system, systems, tested, evaluation, Testing, design, validation, engineering, standardized, technology, evaluations, simulation, prototype, application, requirement, requirements, tester]","[test, testing, system, systems, evaluation, application, validation, technology, tests, implement, capability, processes, software, implementation, design, verification, applications, standardized, develop, evaluations, automation, requirements, validate, standardize, implementations]","[test, tests, sub-test, testing, test-case, test-run, test-based, design, test-related, testsuite, system, test-first, field-test, evaluation, breath-test, full-system, test-preparation, appplication, test-cases, testin, test-out, set-down, program, two-test, testwork]","[test, commnunity, sub-test, design, appplication, test-case, set-down, evaluation, system, deveopment, swap-out, type-ahead, back-out, interative, proect, workcell, process, development, resequence, work, technicity, deline, balnce, implemention, funtionality]"
7,"[employment, without_regard_race_color, please, equal, affirmative_action_employer, apply, information, contact, religion_sex_national_origin, llc, employee, disability_veteran_status, search, department, qualified_applicants_receive_consideration, evaluate_qualified_applicants, system, website_search_application, protected_characteristics_visited_website, require_accommodation_using, protected_veteran_status, description, age, national_origin, eeo_employerapex_systems]","[employment, please, information, contact, apply, please_contact, info, inquire, Please, log_onto, contacting, employer, information_visit_www.ambest.com, employee, emailing, applicant, applying, information_visit_http://www.ereleases.com, Apply, call]","[employment, information, apply, applicant, contact, please, qualifications, relevant, application, services, For_Restrictions, employer, job, workforce, applying, specific, applicants, obtain, any, info, provide, employee, employees, access, hire]","[employment, appply, employement, apply, commnunity, exect, work-and, positiion, employeer, and, buiness, appoach, proect, use, readvertise, applicate, coprorate, reqire, acccess, offfice, informnation, applicable, work, invlove, permant]","[commnunity, positiion, proect, deline, permant, actvity, balnce, informnation, exect, expeience, appoach, work-and, reqire, applicate, placment, employeer, posiiton, appply, expet, work, positition, trasfer, disgard, funtioning, buiness]"
8,"[patient, assign, clinical, need, appropriate, procedure, performs, demonstrates, policy, plan, care, document, standard, maintains, information, ability, knowledge, area, activity, within, resident, duty, medical, state, process]","[patient, clinical, clinicians, clinician, procedures, physicians, diagnostic, procedure, patients, appropriate, care, medical, need, physician, surgical, protocols, pediatric, ambulatory, inpatient, documentation]","[patient, procedures, clinical, medical, appropriate, required, evaluation, necessary, needs, ensure, diagnostic, documentation, need, protocols, care, requires, processes, clinicians, evaluate, compliance, specific, physicians, proper, establishes, determines]","[care, commnunity, deline, reqire, selfcare, expeience, balnce, disgard, proect, policywise, promary, pre-procedure, appropriate, work-and, work, permant, policy-holder, tratment, good-practice, need, plannning, prority, informnation, medcine, critical-care]","[commnunity, deline, balnce, expeience, proect, permant, actvity, work, disgard, positiion, funtioning, informnation, policywise, com-munity, good-practice, practics, promary, plannning, positition, prority, tratment, non-line, reguard, expet, placment]"
9,"[contract, supplier, agreement, management, procurement, cincinnati_oh, business, manager, requirement, centre, planning, contractual, logistics, negotiate, negotiation, process, registered_nurse_rn, delivery, client, cost, foh, day, location, area, prefer]","[contract, contracts, agreement, contractual, Contract, deal, multiyear_contract, agreements, subcontract, lease, Contracts, tentative_agreement, renegotiated, collective_bargaining_agreement, contractual_obligations, multiyear, negotiated, renegotiate, procurement, negotiating]","[contract, contracts, contractual, agreement, deal, agreements, Contract, procurement, lease, Contracts, supplier, renegotiated, negotiation, management, RFP, tentative_agreement, negotiating, multiyear, contractual_obligations, negotiated, negotiate, renegotiate, contractor, collective_bargaining_agreement, leasing]","[contract, contracts, precontract, pre-contract, post-contract, sub-contract, noncontract, agreement, contractual, non-contract, contracting, contracting-out, procurement, contracter, quasi-contract, sub-contracts, contractee, contractor, pre-agreement, mid-contract, management, subcontract, agreements, noncompete, negotiation]","[contract, make-good, deline, contracter, back-out, commnunity, plannning, commer, proect, precontract, employeer, dealine, balnce, noncontract, pland, service, positiion, work, studnet, business, permant, serice, operater, boad, exect]"


In [66]:
# Print the top 25 (word, probability) pairs of every topic in the 40-topic model.
# The topic count comes from the model itself rather than a hard-coded 40.
for topic_id in range(LDA_40.num_topics):
    print('Topic# {}'.format(topic_id))
    print(LDA_40.show_topic(topic_id, topn=25))
    print()

Topic# 0
[('customer', 0.041318946), ('ability', 0.020201558), ('company', 0.01742402), ('safety', 0.013874218), ('equipment', 0.01379462), ('perform', 0.013377259), ('product', 0.013211826), ('vehicle', 0.011846664), ('education', 0.011182732), ('duty', 0.01114483), ('high', 0.00994427), ('time', 0.009842404), ('order', 0.009658955), ('requirement', 0.009385821), ('need', 0.009314751), ('associate', 0.009056977), ('general', 0.00772105), ('training', 0.0075121247), ('production', 0.0074061863), ('maintain', 0.0073220823), ('month', 0.007314947), ('assist', 0.007207732), ('school_diploma', 0.007024344), ('one', 0.006514933), ('local', 0.0063718017)]

Topic# 1
[('support', 0.043462217), ('system', 0.028095089), ('network', 0.027179576), ('technical', 0.020391688), ('customer', 0.015878476), ('issue', 0.015052615), ('security', 0.013102518), ('troubleshoot', 0.012946775), ('software', 0.012734365), ('knowledge', 0.011874789), ('hardware', 0.011828976), ('problem', 0.011430565), ('technol

## Comparison
<em> note, google had 500k word vectors here </em>
- Google News vectors (40 topics, 15 words) is best; FT (40 topics, 15 words) also good
- Google News (40 topics, 100 words) unusable. FT (40 topics, 100 words) not a great fit 
- Google News vectors (40 topics, 15 words) has some funky words containing '_' or '##' that would need filtering

<em> with only 50k word vectors in google here </em>
- Google News vectors (40 topics, 15 words) is best; google news (40 topics, 100 words) and FT (40 topics, 15 words) also good
- Google News (40 topics, 15 or 100 words) no longer has random funky words.
- FT (40 topics, 100 words) not a great fit 


In [67]:
# Show up to the first 50 rows of the 90-topic comparison table
# (LDA topic words next to each embedding run's nearest words).
df_90.head(50)

Unnamed: 0,LDA_90_topics,goog_90_15,goog_90_100,ft_90_15,ft_90_100
0,"[customer, business, new, sale, ability, level, mop, manager, custodial, branch, product, save, industry, exist, grade, referral, smile, ensure, knowledge, follow, without_reasonable_accommodation, though, present, department, company]","[customer, business, customers, product, Customer, company, retail, customer_satisfaction, clients, salesperson, client, sales, management, service, profitability, salespeople, businesses, servicing, customer_loyalty, employee, commercial, customer_centric, operational_efficiencies, pricing, marketing]","[customer, business, service, customers, services, product, For_Restrictions, management, customer_satisfaction, clients, company, salesperson, quality, existing, servicing, commercial, client, retail, that, businesses, manage, transactional, provide, operational_efficiency, vice_versa]","[business, new, make-good, one-product, office-wide, customer, midtier, product, desk-side, non-sale, back-out, commnunity, customer-care, market-stall, groupwide, new-business, permant, positiion, service, employeer, public-utility, non-line, over-order, deline, money-no-object]","[commnunity, deline, positiion, make-good, permant, non-line, actvity, proect, expeience, office-wide, balnce, back-out, francise, right-now, down-the-road, indutry, seak, non-one, buiness, work-and, money-no-object, business, non-co-operative, capcity, meak]"
1,"[duty, ability, procedure, follow, assign, maintain, perform, policy, maintains, equipment, customer, area, safety, time, company, accord, order, standard, knowledge, department, function, complete, clean, handle, proper]","[maintain, duty, procedures, assign, conduct, ensure, enforce, ability, adhere, required, do, instruct, allow, assigned, supervise, perform, procedure, service, provide, notify, maintenance, continue, personnel, necessary, insure]","[necessary, required, proper, ensure, adequate, need, maintain, appropriate, do, procedures, provide, supervise, require, requirement, essential, imperative, allow, prioritize, requirements, utmost_importance, needs, needed, adhere, instruct, requires]","[duty, commnunity, deline, work, balnce, actvity, back-out, reguard, positiion, permant, work-and, expeience, prority, practice, resonsibility, responsibility, recuit, service, policy-holder, positition, disgard, seak, proceeed, invlove, reqire]","[balnce, deline, proect, commnunity, reqire, permant, positiion, expeience, back-out, work, actvity, meak, disgard, necessary, work-and, normall, reguard, appropriate, proceeed, seak, non-line, funtioning, promary, proper, thru-out]"
2,"[product, development, design, technical, process, ability, project, customer, business, engineering, lead, new, requirement, solution, support, management, strong, develop, knowledge, understand, program, client, analysis, manage, communication]","[product, design, development, products, engineering, technology, technical, customer, project, innovation, processes, Product, marketing, process, business, technologies, implementation, implementations, supply_chain, usability, value_proposition, capabilities, lifecycle, functionality, sourcing]","[product, development, design, technology, develop, processes, capabilities, management, innovation, business, optimize, expertise, products, solutions, developing, technologies, innovative, innovate, capability, value_proposition, implementation, supply_chain, technical, customer, competencies]","[product, development, design, one-product, product-line, product-design, deveopment, process, productization, strategy-making, design-driven, develpment, work-product, interative, formulative, develop-ment, design-in, technical-support, new-product, product-development, desgin, work, workcell, side-product, product-level]","[proect, commnunity, development, expeience, deline, product, deveopment, strategy-making, interative, actvity, design, work-and, work, balnce, formulative, develop-ment, plannning, permant, make-good, funtioning, positiion, non-line, one-product, succcess, technicity]"
3,"[industry, quality, ability, utensil, cook, length, involves_repetitive_motion, steam_risk_conditions, follow, must, temperature, level, appropriate, safely_operate, production_guidelines_recipes, product_identification_acceptable, food_prepared_prescribed, cutting_preparing, equipment, food, qualification, method, back_house_requires, frequent_lifting_transporting_hot, certify]","[quality, ability, can, temperature, industry, length, level, optimum, must, need, thickness, levels, required, freshness, without_compromising, longevity, necessary, duration, should, do, standards, optimal, consistency, responsiveness, how]","[quality, optimum, ensure, optimal, required, necessary, do, proper, ensures, appropriate, level, without_compromising, can, ability, need, requirements, needs, competencies, subjective, strive, minimum, standards, evaluate, essential, For_Restrictions]","[balnce, deline, meak, self-rate, finess, quality, seak, expeience, heate, cutlure, actvity, appoach, commnunity, quanity, expectance, washability, chooose, permant, capcity, stickability, recuit, hand-hold, expet, work, proect]","[balnce, deline, commnunity, meak, permant, expeience, self-rate, seak, actvity, proect, appoach, expet, finess, defficient, non-one, cutlure, indutry, capcity, positiion, disgard, non-line, thru-out, recuit, comple, quality]"
4,"[ohio, oh, control, layout, indiana, mi, de, logic, must, forward_resume, united_states, export, locate, candidate, logistics, ia, iowa, northern, gear, accredited_institution, system, trouble_shooting, midtown, posse, cleveland]","[oh, Oh, layout, yeah, control, logic, Hmmm, ah, anyway, yes, damn, um, eh, controls, darn, suppose, Hmm, so, Uh, uh, just, guess, do, anyways, cant]","[For_Restrictions, just, anyway, oh, so, vice_versa, it, do, maybe, guess, Newspapers_below, layout, suppose, anyways, then, Oh, hey, damn, anyhow, Hmmm, really, darn, yeah, need, even]","[oh, cooool, hey, yess, yeahhhh, hice, yeahhh, sike, duhhhh, coool, yeahh, hummm, geeze, hummmm, humm, yoyu, duhhh, heah, geesh, Ermmmm, aaaaand, quin, heyyyy, ermmm, jude]","[cooool, fass, coool, fack, quin, geer, looool, saft, yoyu, hice, forr, sike, coss, bess, loooool, mond, budy, yeer, mase, colgate, lool, countin, fock, gooood, eline]"
5,"[equipment, maintenance, technician, electrical, mechanical, repair, facility, system, installation, industrial, perform, power, must, tool, knowledge, control, troubleshoot, building, plant, manufacturing, basic, hvac, field, part, need]","[equipment, maintenance, electrical, technician, technicians, repair, mechanical, installation, repairs, equipments, welding, HVAC, systems, machinery, wiring, hydraulic, electrician, Equipment, plumbing, mechanic, repairing, spare_parts, avionics, troubleshooting, transformer]","[equipment, maintenance, electrical, technician, repair, technicians, mechanical, installation, welding, systems, HVAC, equipments, repairs, machinery, engineering, wiring, plumbing, hydraulic, avionics, calibration, electrician, facilities, components, spare_parts, carpentry]","[equipment, maintenance, equiptment, equpment, maintenace, heavy-equipment, mechanical, technical-support, operatory, repair, power-train, equipement, original-equipment, technology, self-maintenance, technician, workcell, electro-technical, automative, electrical, re-power, effciency, system, work-station, control-room]","[equpment, maintenace, non-line, set-down, commnunity, reline, effciency, permant, equipment, technical-support, funtioning, balnce, equiptment, workcell, actvity, operatory, maintenance, re-power, effcient, technicity, work, capcity, deline, equiping, back-out]"
6,"[patient, care, medical, treatment, health, physician, need, clinical, current, nursing, family, medication, certification, must, hospital, license, practice, healthcare, professional, state, assist, procedure, plan, time, appropriate]","[patient, medical, patients, care, physician, physicians, palliative_care, hospital, outpatient, inpatient, doctors, doctor, medication, nursing, clinical, pediatric, surgical, healthcare, Patient, clinicians, nurse, treatment, clinician, hospitals, nurses]","[patient, medical, care, physician, patients, physicians, outpatient, inpatient, hospital, palliative_care, healthcare, treatment, clinical, doctors, medication, doctor, surgical, clinicians, nursing, pediatric, clinician, health, hospitals, health_care, nurse]","[patient, care, critical-care, medical-care, medical, selfcare, patient-care, non-patient, care-giver, care-related, health, healtcare, clinical, healthcare, patients, provider-patient, physician, health-system, care-based, health-service, patient-related, hospital, patient-focused, social-care, post-hospital]","[care, patient, critical-care, selfcare, medical, medical-care, commnunity, medcine, healtcare, care-related, tratment, expeience, physicial, caree, health-system, care-giver, non-patient, careing, condtion, clinical, in-and, health, extended-care, help-line, care-based]"
7,"[food, quality, production, product, manufacturing, safety, standard, procedure, ensure, equipment, operation, maintain, requirement, shift, din, process, area, plant, control, follow, training, company, environment, material, customer]","[quality, product, products, production, food, manufacturing, packaging, supply, traceability, standards, raw_materials, manufacture, supply_chain, consumables, safety, suppliers, reliability, availability, sourcing, manufacturers, raw_material, supplies, produce, foodstuffs, without_compromising]","[quality, product, products, manufacturing, food, supply, packaging, production, traceability, supply_chain, suppliers, processes, standards, raw_materials, ensure, facilities, equipment, manufacture, sourcing, continuous_improvement, requirements, maintenance, safety, supplier, produce]","[food, quality, product, production, suppy, safety, quanitity, comsumption, non-quality, effciency, first-quality, healthfood, balnce, supply-line, supply, deline, food-production, marination, suppler, equpment, superior-quality, quaility, washability, restaurant-quality, one-product]","[deline, balnce, quality, product, actvity, commnunity, set-down, permant, supply-line, suppy, food, equpment, effciency, supply, work, non-line, non-quality, back-out, post-use, reducation, good-practice, suppler, positiion, proect, tracability]"
8,"[employment, disability, national_origin, age, status, without_regard_race_color, qualified_applicants_receive_consideration, equal, cincinnati_oh, protected_veteran_status, health, affirmative_action_employer, genetic, sexual_orientation_gender_identity, applicant, information, characteristic_protected_law, company, base, veteran, client, business, basis, consider, clean_harbors]","[disability, employment, occupational, age, disabilities, dependents, health, unemployed, employer, status, discrimination, gender, qualifications, equality, life_expectancy, socioeconomic, occupations, disabled, unemployment, employers, welfare, eligibility, mental_health, socio_economic, self_employed]","[employment, disability, occupational, qualifications, health, employer, age, dependents, status, benefits, gender, sexual_orientation, socioeconomic, disabilities, employers, workplace, occupations, workforce, discrimination, applicant, equality, socio_economic, mental_health, job, eligibility]","[age, employment, disability, health, age-class, status, gender, age-set, age-discrimination, non-employment, age-structure, diability, education, age-sex, employement, relative, child-hood, age-limit, eligibility, educability, age--and, age-range, age-level, social, equability]","[commnunity, positiion, permant, actvity, caree, expeience, posiiton, positition, reducation, prority, employment, deline, balnce, capcity, professsional, condtion, tratment, commnity, careeer, recuitment, informnation, finantial, age, non-comprehension, succcess]"
9,"[sale, product, event, advantage, time, solution, ability, marketing, store, retail, demonstration, management, part, best, training, client, duty, travel, report, one, day, essential, customer, requirement, related]","[product, sale, products, business, sales, event, advantage, marketing, transaction, purchase, inventory, solution, value_proposition, retail, customer, customers, marketplace, offerings, distribution, market, pricing, purchasing, rollout, capabilities, company]","[product, business, customer, onsite, For_Restrictions, service, services, products, purchase, transaction, capabilities, customers, application, inventory, management, sales, opportunity, capability, that, retail, sale, pricing, value_proposition, it, benefit]","[sale, product, non-sale, back-out, make-good, event, succcess, mark-down, promtion, market, one-product, actvity, business, rolling-out, deline, part, development, re-sale, post-sale, francise, time-in, selling-off, thru-out, revenue-generation, rub-off]","[make-good, deline, back-out, actvity, permant, thru-out, positiion, commnunity, work, work-and, balnce, succcess, proect, expeience, francise, placment, dealine, product, reach-out, non-line, in-and, offtrack, set-down, time-in, promtion]"


## Comparison of 90 topic models
- Seems like repeats of topics exist...
- Google News with 15 words and with 100 words seem best
- Again, why can't we just use LDA words?

## Thoughts moving forward
- Could the LDA words just be used? No weighting of vectors. Just take top words per topic, cross-reference to KSBs? 