# Convert LDA Topics to d2v Vectors
- Richard Kuzma, 8SEP2020

## Load LDA models

In [1]:
### Imports

# basic
from pprint import pprint
import pickle

# data science
import pandas as pd

# NLP
import gensim
from gensim.models import CoherenceModel, LdaModel

# plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
### Load the pickled 40-topic and 90-topic LDA models from disk
# NOTE(review): later cells use `best_model`, which is not assigned in this
# cell -- confirm which loaded model it is meant to refer to.
path = '/Users/richardkuzma/coding/analysis/monster/models/'

model_files = (
    'monster_jobs_LDA_40_topics_cv_zero476.pkl',
    'monster_jobs_LDA_90_topics_cv_zero461.pkl',
)

# Unpickle each file in order; the list order matches `model_files`.
loaded_models = []
for filename in model_files:
    with open(path + filename, 'rb') as f:
        loaded_models.append(pickle.load(f))

LDA_40, LDA_90 = loaded_models


## LDA with 30 topics

In [None]:
# Pretty-print whatever print_topics() returns for the model held in `best_model`.
topic_summaries = best_model.print_topics()
pprint(topic_summaries)

## Convert KSB LDA Topics to vectors

### Load w2v KeyedVectors model trained on Google News
Note: a KeyedVectors object holds only the trained word vectors; it is not a full model and cannot be retrained.

In [None]:
# Load the pretrained Google News word2vec vectors (binary word2vec format),
# reading at most 500,000 vectors via `limit`.
w2v_file = '/Users/richardkuzma/coding/analysis/utils/GoogleNews-vectors-negative300.bin.gz'
goog = gensim.models.KeyedVectors.load_word2vec_format(
    w2v_file,
    binary=True,
    limit=500000,
)
print('loaded google w2v model of size 500,000 with dimension 300 vectors')


In [None]:
### use LDA probability distribution to find the center vector of each topic
#
# For every topic in `best_model`, each (word, weight) pair returned by
# show_topic() is looked up in `goog` and the word vector is scaled by the
# weight. Words with no vector in `goog` (KeyError) are counted and skipped.
#
# Results:
#   pos_all            -- pos_all[i] is the list of weighted vectors for topic i
#   total_missed_words -- number of skipped words summed over all topics

pos_all = []
total_missed_words = 0

for i in range(best_model.num_topics): # for each LDA topic
    print('\n' + '-'*40 + 'Topic Number: {}'.format(i) + '-'*40 + '\n')
    missed_words = 0
    pos_topic = []
    # Fetch the topic's (word, weight) pairs once per topic instead of
    # re-calling show_topic() for every word, weight and log message.
    topic_terms = best_model.show_topic(i)
    for j, (word, weight) in enumerate(topic_terms): # for each word in a given topic
        try:
            # Only the vector lookup can raise KeyError, so keep the try narrow.
            word_vector = goog[word]
        except KeyError:
            print('Key error.......missed a word from topic: {}, number: {}, word: {}'.format(i, j, word))
            missed_words += 1
            continue
        # multiply w2v word vector by weight
        pos_topic.append(word_vector * float(weight))
        print('appended weighted vector for topic: {} and sub-word #{}: {} '.format(i, j, word))

    total_missed_words += missed_words
    # Appended even when empty so that pos_all stays index-aligned with topic ids.
    pos_all.append(pos_topic)
    print('\nappended pos_topic {} to pos_all'.format(i))
    print('Missed {} words'.format(missed_words))

print('Missed words in total: {}'.format(total_missed_words))
    

    
    
    
    

In [None]:
### use weighted word vectors to find the 15 most similar words for each topic

TOP_N_SIMILAR = 15  # number of nearest words requested per topic

# `goog` was loaded as a KeyedVectors object, so most_similar() is called on it
# directly; going through `goog.wv` relies on a deprecated alias that newer
# gensim releases no longer provide.
# pos_all[i] holds the weighted vectors for topic i, so the result list is
# index-aligned with the topic ids.
similar_to_LDA_topics = [
    goog.most_similar(positive=topic_vectors, topn=TOP_N_SIMILAR)
    for topic_vectors in pos_all
]

for i, similar_words in enumerate(similar_to_LDA_topics):
    print('Topic #{} most similar words'.format(i))
    pprint(similar_words)
    print('\n\n' +'*'*40 + '\n\n')

## Save 30-topic LDA model derived from 10,000 KSB aggregate list

In [None]:
# Pickle the model held in `best_model` into the models directory.
path = '/Users/richardkuzma/coding/analysis/monster/models/'
filename = 'LDA_30_topics_10k_KSBs'
model_file = path + filename

with open(model_file, 'wb') as f:
    pickle.dump(best_model, f)

# Evaluation

Best model (by coherence):
- LdaModel(num_terms=6107, <b>num_topics=30</b>, decay=0.5, chunksize=2000)
- Coherence of <b>0.6807534740840404</b>
- Saved to '/Users/richardkuzma/coding/analysis/monster/models/LDA_30_topics_10k_KSBs'

Using a comprehensive list of 10,000 KSBs for LDA yields 20-30 topics that are too broad (e.g. 'supervisor', 'technician', 'professional'). Will try to create separate LDA models for each