# Gensim LDA Model Evaluation

In [1]:
import pickle
import gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

import json
from collections import defaultdict, Counter
from nltk.corpus import stopwords
stopwords_set = stopwords.words("english")
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()

from scipy.spatial.distance import cdist
from pprint import pprint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


## Data Preprocessing

In [3]:
articles = "/content/drive/MyDrive/Indeed/data/article.json"

art = []
with open(articles, "r", encoding="utf-8") as f:
    for line in f:
        art.append(json.loads(line))

rev_train = pd.DataFrame(pd.DataFrame(art))
rev_train['content_len'] = rev_train['content'].apply(lambda x: len(x.split()))

In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['am','really','they','go','get','we','me','would','like','great',
                   'should','must', 'may', 'might', 'also', 'people', 'person', 'work', 'works',
                   'job', 'jobs', 'use', 'using', 'used', 'uses', 'think', 'thinking', 'thinks', 
                   'thinked', 'consider', 'considering', 'considers', 'considered', 'mga', 
                   'mgas', 'monochronic', 'fertility', 'page', 'pages', 'question', 'questions',
                   'answer', 'answers', 'feel', 'feels', 'felt', 'feeling', 'try', 
                   'trying', 'tries', 'tried', 'change', 'changed', 'changing', 'changes',
                   'name', 'names', 'show', 'shows', 'showing', 'shown', 'showed', 'write',
                   'wrote', 'writes', 'writing', 'section', 'sections']) 

In [5]:
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [6]:
def strip_newline(series):
    """
    stripping out new line
    """                    
    return [review.replace('\n','') for review in series]

def sent_to_words(sentences):
    """
    Converting sentence to words
    """                
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    """
    Used for removing stopwords
    """            
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    """
    Find the bigrams in the content of the article
    """        
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

name_regex = "[^]]+"
url_regex = "http[s]?://[^)]+"
markup_regex = '\[({0})]\(\s*({1})\s*\)'.format(name_regex, url_regex)

def related_articles(corpus):
    """
    Find the articles related to each content title and content
    """    
    related_art = defaultdict(dict)
    for content, title in zip(corpus["content"], corpus["contentTitle"]):
        content = content.split("\n")
        for i, cont in enumerate(content):
            cont = cont.replace("*","")
            try:            
                for match in re.findall(markup_regex, cont):
                    related_art[title][match[0]] = match[1]
            except :
                print(cont)
    return related_art

def remove_hyperlinks(corpus):
    """
    Remove hypoerlinks from the content
    """  
    fixed_content = [] 
    for i, content in enumerate(corpus["content"]):
        content = content.split("\n")
        #print(content)
        for i, cont in enumerate(content): 
            if cont!='':
                if "https" in cont:
                    content[i] = ""
        #corpus.iloc[i]["content"] = " ".join(content)
        fix_ = " ".join(content)
        #print(fix_)
        fixed_content.append(fix_)
    
    corpus["content"] = fixed_content
    return corpus
    
def get_corpus(df):
    """
    Get Bigram Model, Corpus, id2word mapping
    """
    rltd_articles = {}
    rltd_articles = related_articles(rev_train)
    #df = remove_hyperlinks(df, 'content')
    df = remove_hyperlinks(df)
    df['content'] = strip_newline(df.content)
    words = list(sent_to_words(df.content))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram



In [7]:
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


In [8]:
for i, content in enumerate(rev_train['content']):
    content_ = striphtml(content) 
    rev_train['content'][i] = content_

In [9]:
train_corpus4, train_id2word4, bigram_train4 = get_corpus(rev_train)

## Model parameter tuning -- LdaMulticore

In [10]:
# The results do not consistent, the first time we got alpha=0.01, eta =0.9, passes = 50 with the highest coherence value. This time, it gives a different result.

alpha = [0.01, 0.9, 5]
eta = [0.01, 0.9, 5]
passes = [30, 40, 50]
alpha_list = []
eta_list = []
pass_list = []
coherence_list = []

for alpha_ in alpha:
  for eta_ in eta:
    for pass_ in passes:
      lda_train4 = gensim.models.LdaModel(corpus=train_corpus4,
                                                            num_topics=10,
                                                            id2word=train_id2word4,
                                                            chunksize=100,
                                                            #random_state=None,                                                            ,
                                                            passes=pass_,
                                                            eval_every = 1,
                                                            per_word_topics=True,
                                                            alpha=alpha_,
                                                            eta=eta_)
      coherence_model_lda = CoherenceModel(model=lda_train4, texts=bigram_train4, dictionary=train_id2word4, coherence='c_v')
      coherence_ = coherence_model_lda.get_coherence()
      alpha_list.append(alpha_)
      eta_list.append(eta_)
      pass_list.append(pass_)
      coherence_list.append(coherence_)
 

In [11]:
para_tunning_dict = {'alpha': alpha_list,
                    'eta': eta_list,
                    'pass': pass_list,
                    'coherence': coherence_list}
para_tunning_df = pd.DataFrame().from_dict(para_tunning_dict)
para_tunning_df

Unnamed: 0,alpha,eta,pass,coherence
0,0.01,0.01,30,0.588634
1,0.01,0.01,40,0.569785
2,0.01,0.01,50,0.554366
3,0.01,0.9,30,0.614789
4,0.01,0.9,40,0.629302
5,0.01,0.9,50,0.613861
6,0.01,5.0,30,0.652143
7,0.01,5.0,40,0.603887
8,0.01,5.0,50,0.631164
9,0.9,0.01,30,0.587864


## **models.ldamodel**

Get models for multiple topic numbers with the same random seed to get the best model with the highest coherence value.

In [16]:
def compute_coherence_values(corpus, dictionary, k):
    '''
    Function which takes the corpus, dictionary and number of topics and return the topic coherence
    '''
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_train4 = gensim.models.LdaModel(corpus=corpus,
                                                            num_topics=k,
                                                            id2word=dictionary,
                                                            chunksize=100,
                                                            passes=50,
                                                            random_state=None,
                                                            alpha=10,
                                                            eta=10)            
        


        lda_train4.save(f'/content/drive/MyDrive/Indeed/ab_40/lda_train4_{k}.model')
    coherence_model_lda = CoherenceModel(model=lda_train4, texts=bigram_train4, dictionary=train_id2word4, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [17]:
def hyperparameterize(alpha, beta, corpus_sets, topics_range):
    pbar = tqdm.tqdm(total=len(corpus_sets) * len(topics_range))

    #iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_sets[0], dictionary=train_id2word4, 
                                            k=k)
            # Save the model results
            model_results['Validation_Set'].append(corpus_title[i])
            model_results['Topics'].append(k)
            model_results['Alpha'].append(alpha)
            model_results['Beta'].append(beta)
            model_results['Coherence'].append(cv)
                    
            pbar.update(1)
    pbar.close()
    return model_results

In [None]:
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 40
max_topics = 41
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Validation sets
num_of_docs = len(train_corpus4)
corpus_sets = [train_corpus4]
a = 10
b = 10
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
results = hyperparameterize(a, b, corpus_sets, topics_range)
df = pd.DataFrame(results).to_csv(f'/content/drive/MyDrive/Indeed/lda_results_ab_{min_topics}_{max_topics}_{step_size}.csv', index=False)
df

100%|██████████| 1/1 [25:17<00:00, 1517.82s/it]


### Load Gensim LDA model

In [None]:
# Load model 
# model with random_state = None
lda_train10 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_10/lda_train4_10.model")
lda_train20 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_20/lda_train4_20.model")
lda_train30 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_30/lda_train4_30.model")
lda_train40 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_40/lda_train4_40.model")
lda_train50 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_50/lda_train4_50.model")
lda_train60 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_60/lda_train4_60.model")
lda_train70 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_70/lda_train4_70.model")
lda_train80 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_80/lda_train4_80.model")
lda_train90 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_90/lda_train4_90.model")
lda_train100 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/none_100/lda_train4_100.model")

# model40 with multiple random_state value 10, 50, 100
lda_train10_40 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/10_40/lda_train4_40.model")
lda_train50_40 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/50_40/lda_train4_40.model")
lda_train100_40 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/100_40/lda_train4_40.model")

# model40 with random_state = None, alpha=100, eta=10
lda_trainab_40 = gensim.models.LdaModel.load("/content/drive/MyDrive/Indeed/ab_40/lda_train4_40.model")


## **Heatmap that shows the inter-compare of topic models**

Let's now look at the Jaccard Distance metric for similarity between bags of words (i.e, documents)

The bigger the value, the more distinct these topics are

In [None]:
def plot_difference_plotly(mdiff, title="", annotation=None):
    """Plot the difference between models.

    Uses plotly as the backend."""
    import plotly.graph_objs as go
    import plotly.offline as py

    annotation_html = None
    if annotation is not None:
        annotation_html = [
            [
                "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))
                for (int_tokens, diff_tokens) in row
            ]
            for row in annotation
        ]

    data = go.Heatmap(z=mdiff, colorscale='RdBu', text=annotation_html)
    layout = go.Layout(width=950, height=950, title=title, xaxis=dict(title="topic"), yaxis=dict(title="topic"))
    py.iplot(dict(data=[data], layout=layout))

## **Comparing different topic numbers with the same random seed**
topic number = 20, 30, 40, 50, 60, 70, 80, 90, 100

random seed = None

For the topic in the larger model, if each topic could match one similar topic in the small model, then the small model is a better choice.

In [None]:
mdiff, annotation = lda_train20.diff(lda_train10, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.8:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.3739837398373984 (1, 1)
0.6013986013986015 (2, 5)
0.6486486486486487 (3, 8)
0.6577181208053691 (5, 5)
0.3739837398373984 (8, 6)
0.4126984126984127 (9, 2)
0.4251968503937008 (11, 3)
0.29059829059829057 (12, 4)
0.5185185185185186 (13, 7)
0.26086956521739135 (14, 0)
0.46153846153846156 (15, 9)


In [None]:
percentage_1020 = count/20
percentage_1020

0.35

In [None]:
pprint(lda_train10.print_topics(num_topics=10))

[(0,
  '0.015*"project" + 0.013*"manager" + 0.010*"managers" + 0.006*"plan" + '
  '0.006*"employee" + 0.006*"communication" + 0.005*"workplace" + '
  '0.005*"tasks" + 0.005*"team_members" + 0.005*"leadership"'),
 (1,
  '0.022*"medical" + 0.022*"patients" + 0.012*"nursing" + 0.012*"health" + '
  '0.012*"patient" + 0.009*"healthcare" + 0.008*"care" + 0.008*"nurses" + '
  '0.006*"nurse" + 0.006*"primary_duties"'),
 (2,
  '0.017*"software" + 0.017*"data" + 0.016*"design" + 0.014*"systems" + '
  '0.011*"engineering" + 0.009*"computer" + 0.009*"engineers" + 0.008*"system" '
  '+ 0.008*"technology" + 0.008*"technical"'),
 (3,
  '0.020*"financial" + 0.010*"accounting" + 0.008*"cost" + 0.008*"costs" + '
  '0.007*"value" + 0.007*"businesses" + 0.006*"sales" + 0.005*"market" + '
  '0.005*"investment" + 0.005*"inventory"'),
 (4,
  '0.042*"resume" + 0.009*"relevant" + 0.009*"letter" + 0.008*"cover_letter" + '
  '0.007*"qualifications" + 0.006*"applying" + 0.006*"candidate" + '
  '0.006*"candidates"

In [None]:
pprint(lda_train20.print_topics(num_topics=20))

[(0,
  '0.026*"students" + 0.012*"fitness" + 0.012*"school" + 0.011*"teaching" + '
  '0.010*"sports" + 0.009*"social" + 0.009*"teacher" + 0.008*"children" + '
  '0.008*"community" + 0.008*"teachers"'),
 (1,
  '0.032*"medical" + 0.031*"patients" + 0.018*"nursing" + 0.017*"patient" + '
  '0.015*"health" + 0.013*"healthcare" + 0.011*"nurses" + 0.011*"care" + '
  '0.009*"nurse" + 0.006*"primary_duties"'),
 (2,
  '0.038*"sales" + 0.035*"customers" + 0.029*"product" + 0.028*"customer" + '
  '0.019*"products" + 0.010*"market" + 0.008*"businesses" + 0.007*"purchase" + '
  '0.007*"customer_service" + 0.006*"marketing"'),
 (3,
  '0.012*"construction" + 0.011*"equipment" + 0.011*"engineering" + '
  '0.008*"primary_duties" + 0.008*"safety" + 0.007*"engineers" + '
  '0.006*"systems" + 0.006*"technician" + 0.006*"design" + '
  '0.005*"technicians"'),
 (4,
  '0.012*"day" + 0.008*"office" + 0.008*"home" + 0.007*"schedule" + '
  '0.007*"food" + 0.006*"restaurant" + 0.006*"items" + 0.005*"tasks" + '
  '

In [None]:
mdiff, annotation = lda_train30.diff(lda_train20, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.4496124031007752 (0, 14)
0.31932773109243695 (1, 10)
0.18181818181818177 (2, 15)
0.3870967741935484 (10, 4)
0.3471074380165289 (11, 3)
0.058252427184465994 (12, 12)
0.23008849557522126 (13, 17)
0.3471074380165289 (14, 8)
0.3870967741935484 (15, 7)
0.31932773109243695 (16, 9)
0.23008849557522126 (22, 1)
0.13084112149532712 (23, 16)
0.3870967741935484 (27, 11)
0.4126984126984127 (28, 13)


In [None]:
percentage_2030 = count/30
percentage_2030

0.4666666666666667

In [None]:
mdiff, annotation = lda_train40.diff(lda_train30, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.13084112149532712 (2, 19)
0.039215686274509776 (4, 2)
0.039215686274509776 (5, 0)
0.4732824427480916 (7, 27)
0.039215686274509776 (13, 12)
0.18181818181818177 (14, 1)
0.1132075471698113 (17, 25)
0.1651376146788991 (18, 26)
0.360655737704918 (19, 24)
0.23008849557522126 (20, 23)
0.07692307692307687 (21, 22)
0.3471074380165289 (22, 5)
0.13084112149532712 (23, 21)
0.48484848484848486 (24, 13)
0.27586206896551724 (25, 11)
0.07692307692307687 (28, 28)
0.4 (29, 15)
0.09523809523809523 (31, 16)
0.09523809523809523 (32, 10)
0.1132075471698113 (35, 20)
0.14814814814814814 (36, 3)
0.3471074380165289 (37, 8)
0.2142857142857143 (39, 14)


In [None]:
percentage_3040 = count/40
percentage_3040

0.575

In [None]:
mdiff, annotation = lda_train50.diff(lda_train40, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.2142857142857143 (0, 3)
0.039215686274509776 (1, 14)
0.2142857142857143 (2, 18)
0.14814814814814814 (3, 10)
0.1651376146788991 (4, 26)
0.27586206896551724 (6, 38)
0.2142857142857143 (7, 39)
0.30508474576271183 (8, 1)
0.4126984126984127 (9, 7)
0.07692307692307687 (13, 12)
0.26086956521739135 (14, 33)
0.2142857142857143 (16, 32)
0.31932773109243695 (17, 5)
0.23008849557522126 (18, 19)
0.26086956521739135 (19, 20)
0.039215686274509776 (20, 21)
0.058252427184465994 (22, 17)
0.058252427184465994 (23, 4)
0.23008849557522126 (24, 16)
0.24561403508771928 (26, 28)
0.24561403508771928 (30, 22)
0.01980198019801982 (31, 31)
0.13084112149532712 (33, 23)
0.1651376146788991 (34, 27)
0.09523809523809523 (36, 25)
0.058252427184465994 (37, 13)
0.039215686274509776 (38, 36)
0.33333333333333337 (43, 37)
0.058252427184465994 (46, 29)
0.18181818181818177 (47, 24)


In [None]:
percentage_4050 = count/50
percentage_4050

0.6

In [None]:
mdiff, annotation = lda_train60.diff(lda_train50, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.1132075471698113 (2, 7)
0.18181818181818177 (3, 30)
0.09523809523809523 (4, 17)
0.18181818181818177 (6, 47)
0.3739837398373984 (7, 0)
0.058252427184465994 (10, 22)
0.0 (11, 5)
0.0 (11, 10)
0.0 (11, 11)
0.0 (11, 25)
0.0 (11, 35)
0.0 (11, 40)
0.0 (11, 44)
0.0 (11, 45)
0.0 (11, 48)
0.0 (11, 49)
0.07692307692307687 (12, 13)
0.0 (13, 5)
0.0 (13, 10)
0.0 (13, 11)
0.0 (13, 25)
0.0 (13, 35)
0.0 (13, 40)
0.0 (13, 44)
0.0 (13, 45)
0.0 (13, 48)
0.0 (13, 49)
0.4732824427480916 (14, 9)
0.0 (15, 5)
0.0 (15, 10)
0.0 (15, 11)
0.0 (15, 25)
0.0 (15, 35)
0.0 (15, 40)
0.0 (15, 44)
0.0 (15, 45)
0.0 (15, 48)
0.0 (15, 49)
0.0 (19, 5)
0.0 (19, 10)
0.0 (19, 11)
0.0 (19, 25)
0.0 (19, 35)
0.0 (19, 40)
0.0 (19, 44)
0.0 (19, 45)
0.0 (19, 48)
0.0 (19, 49)
0.13084112149532712 (20, 23)
0.0 (21, 5)
0.0 (21, 10)
0.0 (21, 11)
0.0 (21, 25)
0.0 (21, 35)
0.0 (21, 40)
0.0 (21, 44)
0.0 (21, 45)
0.0 (21, 48)
0.0 (21, 49)
0.2142857142857143 (22, 1)
0.13084112149532712 (23, 33)
0.30508474576271183 (24, 31)
0.18181818181818177

In [None]:
percentage_5060 = 42/60
percentage_5060

0.7

In [None]:
pprint(lda_train50.print_topics(num_topics=50))

[(0,
  '0.075*"data" + 0.032*"analysis" + 0.017*"analyst" + 0.016*"analysts" + '
  '0.008*"economics" + 0.008*"analyze" + 0.007*"models" + 0.006*"trends" + '
  '0.006*"theory" + 0.006*"statistics"'),
 (1,
  '0.034*"primary_duties" + 0.022*"manager" + 0.021*"clients" + '
  '0.017*"managers" + 0.013*"duties" + 0.013*"professionals" + '
  '0.008*"operations" + 0.007*"client" + 0.007*"consultant" + 0.007*"manage"'),
 (2,
  '0.013*"opportunities" + 0.008*"professionals" + 0.007*"search" + '
  '0.007*"internship" + 0.006*"potential" + 0.006*"network" + '
  '0.006*"positions" + 0.006*"tips" + 0.006*"opportunity" + '
  '0.006*"interested"'),
 (3,
  '0.039*"test" + 0.024*"tests" + 0.012*"assessment" + 0.007*"personality" + '
  '0.007*"introverts" + 0.005*"introverted" + 0.005*"extroverted" + '
  '0.005*"extroverts" + 0.004*"pre_employment" + 0.004*"aptitude_tests"'),
 (4,
  '0.054*"marketing" + 0.027*"social_media" + 0.022*"content" + 0.022*"brand" '
  '+ 0.014*"website" + 0.012*"event" + 0.011

In [None]:
mdiff, annotation = lda_train40.diff(lda_train20, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.18181818181818177 (4, 15)
0.4375 (5, 14)
0.058252427184465994 (13, 12)
0.4251968503937008 (14, 10)
0.26086956521739135 (20, 16)
0.26086956521739135 (21, 1)
0.4251968503937008 (25, 3)
0.30508474576271183 (26, 5)
0.2142857142857143 (27, 2)
0.4126984126984127 (28, 13)
0.27586206896551724 (29, 7)
0.3739837398373984 (31, 9)
0.4375 (32, 4)
0.4 (37, 0)
0.4496124031007752 (39, 8)


In [None]:
percentage_2040 = count/40
percentage_2040

0.375

In [None]:
pprint(lda_train20.print_topics(num_topics=20))

[(0,
  '0.026*"students" + 0.012*"fitness" + 0.012*"school" + 0.011*"teaching" + '
  '0.010*"sports" + 0.009*"social" + 0.009*"teacher" + 0.008*"children" + '
  '0.008*"community" + 0.008*"teachers"'),
 (1,
  '0.032*"medical" + 0.031*"patients" + 0.018*"nursing" + 0.017*"patient" + '
  '0.015*"health" + 0.013*"healthcare" + 0.011*"nurses" + 0.011*"care" + '
  '0.009*"nurse" + 0.006*"primary_duties"'),
 (2,
  '0.038*"sales" + 0.035*"customers" + 0.029*"product" + 0.028*"customer" + '
  '0.019*"products" + 0.010*"market" + 0.008*"businesses" + 0.007*"purchase" + '
  '0.007*"customer_service" + 0.006*"marketing"'),
 (3,
  '0.012*"construction" + 0.011*"equipment" + 0.011*"engineering" + '
  '0.008*"primary_duties" + 0.008*"safety" + 0.007*"engineers" + '
  '0.006*"systems" + 0.006*"technician" + 0.006*"design" + '
  '0.005*"technicians"'),
 (4,
  '0.012*"day" + 0.008*"office" + 0.008*"home" + 0.007*"schedule" + '
  '0.007*"food" + 0.006*"restaurant" + 0.006*"items" + 0.005*"tasks" + '
  '

In [None]:
pprint(lda_train40.print_topics(num_topics=40))

[(0,
  '0.011*"mentee" + 0.001*"mentor_mentee" + 0.001*"mentees" + '
  '0.000*"mentor_mentor" + 0.000*"identifies" + 0.000*"identifying_areas" + '
  '0.000*"advises" + 0.000*"quit" + 0.000*"less_experienced" + '
  '0.000*"midlevel"'),
 (1,
  '0.023*"legal" + 0.019*"law" + 0.014*"case" + 0.012*"lawyers" + '
  '0.011*"government" + 0.010*"lawyer" + 0.009*"law_enforcement" + '
  '0.008*"law_school" + 0.008*"evidence" + 0.007*"cases"'),
 (2,
  '0.012*"coding" + 0.010*"cpc" + 0.009*"medical_coding" + 0.006*"ccs" + '
  '0.006*"medical_billing" + 0.006*"angular" + 0.005*"codes" + '
  '0.005*"angularjs" + 0.003*"aapc" + 0.003*"rhit"'),
 (3,
  '0.061*"data" + 0.027*"analysis" + 0.014*"analyst" + 0.013*"analysts" + '
  '0.010*"economic" + 0.008*"economics" + 0.008*"analyze" + 0.006*"trends" + '
  '0.006*"theory" + 0.005*"models"'),
 (4,
  '0.023*"degree" + 0.017*"certification" + 0.013*"program" + 0.011*"programs" '
  '+ 0.010*"earn" + 0.009*"requirements" + 0.008*"certifications" + '
  '0.007*"

### **Comparing model40 with multiple random seed**
(topic number = 40, random seed = 10, 50, 100)

After comparing the random seed None/10, None/50, None/100, we find that the results are not consistent.

In [None]:
mdiff, annotation = lda_train40.diff(lda_train10_40, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.1132075471698113 (4, 19)
0.2142857142857143 (5, 2)
0.13084112149532712 (7, 32)
0.14814814814814814 (9, 21)
0.058252427184465994 (13, 16)
0.1132075471698113 (16, 3)
0.058252427184465994 (17, 23)
0.19819819819819817 (20, 9)
0.23008849557522126 (21, 6)
0.4496124031007752 (22, 8)
0.26086956521739135 (23, 24)
0.48484848484848486 (24, 4)
0.07692307692307687 (25, 28)
0.1132075471698113 (26, 1)
0.07692307692307687 (27, 35)
0.18181818181818177 (28, 15)
0.058252427184465994 (29, 14)
0.14814814814814814 (31, 13)
0.31932773109243695 (33, 31)
0.4496124031007752 (35, 27)
0.07692307692307687 (36, 10)
0.1651376146788991 (39, 25)


In [None]:
percentage_none10 = count/40
percentage_none10

0.55

In [None]:
mdiff, annotation = lda_train40.diff(lda_train50_40, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.46153846153846156 (2, 21)
0.19819819819819817 (4, 36)
0.4375 (5, 13)
0.18181818181818177 (9, 8)
0.07692307692307687 (13, 27)
0.14814814814814814 (14, 6)
0.1132075471698113 (17, 28)
0.23008849557522126 (18, 30)
0.27586206896551724 (20, 22)
0.19819819819819817 (21, 25)
0.1651376146788991 (22, 34)
0.1651376146788991 (23, 1)
0.13084112149532712 (25, 39)
0.14814814814814814 (26, 32)
0.2142857142857143 (27, 2)
0.27586206896551724 (28, 4)
0.1651376146788991 (29, 5)
0.18181818181818177 (30, 19)
0.13084112149532712 (31, 17)
0.2142857142857143 (32, 26)
0.29059829059829057 (33, 15)
0.09523809523809523 (36, 0)
0.26086956521739135 (39, 23)


In [None]:
percentage_none50 = count/40
percentage_none50

0.575

In [None]:
mdiff, annotation = lda_train40.diff(lda_train100_40, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.07692307692307687 (4, 11)
0.13084112149532712 (5, 32)
0.19819819819819817 (9, 22)
0.19819819819819817 (10, 35)
0.1651376146788991 (11, 4)
0.01980198019801982 (13, 2)
0.26086956521739135 (14, 24)
0.07692307692307687 (17, 6)
0.07692307692307687 (18, 38)
0.2142857142857143 (20, 3)
0.30508474576271183 (21, 8)
0.18181818181818177 (22, 16)
0.4251968503937008 (23, 28)
0.48484848484848486 (24, 23)
0.1132075471698113 (25, 25)
0.3471074380165289 (26, 27)
0.31932773109243695 (27, 19)
0.058252427184465994 (28, 5)
0.1651376146788991 (31, 15)
0.07692307692307687 (32, 7)
0.1132075471698113 (33, 13)
0.19819819819819817 (35, 39)
0.1132075471698113 (36, 18)
0.19819819819819817 (39, 37)


In [None]:
percentage_none100 = count/40
percentage_none100

0.6

## **Comparing the same topic number with different random seed (alpha, eta)**

(topic number = 40, alpha=10, eta=100)

After comparing the model40 with alpha=0.01,eta=9 with alpha=10,eta=100, we found that the model is not consistent.

In [None]:
mdiff, annotation = lda_trainab_40.diff(lda_train40, distance='jaccard', num_words=100)
plot_difference_plotly(mdiff, title="Topic difference (two models)[jaccard distance]", annotation=annotation)

In [None]:
it = np.nditer(mdiff, flags=['multi_index'])
count = 0
for z in it:
  if z < 0.5:
    count += 1
    #print((z, it.multi_index), end=' ')
    print(z, it.multi_index)

0.4496124031007752 (29, 5)


In [None]:
percentage_ab40 = count/40
percentage_ab40

0.025

In [None]:
pprint(lda_trainab_40.print_topics(num_topics=40))

[(0,
  '0.000*"order" + 0.000*"local" + 0.000*"sometimes" + 0.000*"location" + '
  '0.000*"food" + 0.000*"helpful" + 0.000*"article_discuss" + 0.000*"sleep" + '
  '0.000*"since" + 0.000*"restaurant"'),
 (1,
  '0.017*"financial" + 0.007*"accounting" + 0.007*"value" + 0.007*"costs" + '
  '0.007*"cost" + 0.006*"pay" + 0.006*"businesses" + 0.005*"budget" + '
  '0.005*"money" + 0.005*"year"'),
 (2,
  '0.000*"order" + 0.000*"local" + 0.000*"sometimes" + 0.000*"location" + '
  '0.000*"food" + 0.000*"helpful" + 0.000*"article_discuss" + 0.000*"sleep" + '
  '0.000*"since" + 0.000*"restaurant"'),
 (3,
  '0.000*"order" + 0.000*"local" + 0.000*"sometimes" + 0.000*"location" + '
  '0.000*"food" + 0.000*"helpful" + 0.000*"article_discuss" + 0.000*"sleep" + '
  '0.000*"since" + 0.000*"restaurant"'),
 (4,
  '0.000*"order" + 0.000*"local" + 0.000*"sometimes" + 0.000*"location" + '
  '0.000*"food" + 0.000*"helpful" + 0.000*"article_discuss" + 0.000*"sleep" + '
  '0.000*"since" + 0.000*"restaurant"'),
 (