# <center>Implementing LDA in Python</center>

<center>Dr. W.J.B. Mattingly</center>

<center>Smithsonian Data Science Lab and United States Holocaust Memorial Museum</center>

<center>February 2021</center>

## Importing the Required Libraries

In [1]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import pandas as pd
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Preparing the Data

In [2]:
# Fetch the NLTK "stopwords" corpus so it can be read in the next cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mango\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the corpus through a private alias so this cell can be re-run safely:
# the assignment rebinds `stopwords` from the NLTK corpus reader to a plain
# list, and a second run of `stopwords.words(...)` would then fail on the list.
# NOTE(review): no later visible cell filters on `stopwords` - confirm whether
# stop-word removal was intended.
from nltk.corpus import stopwords as _nltk_stopwords
stopwords = _nltk_stopwords.words("english")

In [4]:
# Show the English stop-word list loaded in the previous cell.
print (stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Read the survey export and keep only the free-text comments as a plain list.
data = pd.read_csv('Data/Response_Comments_Metrics.csv')["RESP_COMMENT"].tolist()

In [7]:
data[0][0:90]

'Learning new experiences with animation and testing my strengths asa designer and animator'

In [None]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to process.
    allowed_postags : collection of str
        Coarse POS tags (``token.pos_``) to keep. The default list is only
        read, never mutated.

    Returns
    -------
    list of str
        One string per input text, in input order, made of the kept tokens'
        lemmas joined by single spaces (empty string if nothing is kept).
    """
    # Parser and NER are switched off: only POS tags and lemmas are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the full pipeline once per document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]


# Lemmatize every comment, then preview the first 90 characters of the first result.
lemmatized_texts = lemmatization(data)
print (lemmatized_texts[0][0:90])



In [None]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    ``deacc=True`` is forwarded to gensim for each document. Returns a list
    with one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]

# Turn each lemmatized comment into a list of tokens.
data_words = gen_words(lemmatized_texts)

# Preview at most the first 20 tokens of the first document.
print (data_words[0][0:20])

In [10]:
#BIGRAMS AND TRIGRAMS
# Train the phrase detectors: the first on the raw token lists, the second on
# the token lists after the first detector has been applied to them.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

# Wrap both trained models in Phraser objects; make_bigrams / make_trigrams
# below index into these module-level names.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to each token list in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram`` and then ``trigram`` (module-level phrasers) to each document.

    Returns a list with one transformed document per input document.
    NOTE(review): the notebook passes documents that already went through
    ``make_bigrams``, so ``bigram`` is applied a second time here - confirm
    that this is intended.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

# Merge detected phrases into single tokens, first pairs and then triples.
data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

# Preview at most the first 20 tokens of the first document.
print (data_bigrams_trigrams[0][0:20])

['learn', 'new', 'experience', 'animation', 'test', 'strength', 'asa', 'designer', 'animator']


In [11]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Map every token to an integer id and encode each document as (token_id, count) pairs.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# Tokens scoring below this TF-IDF weight within a document are removed from that document.
low_value = 0.03
# Token strings removed so far: one entry per (document, token) removal.
words = []
for i in range(len(corpus)):
    bow = corpus[i]
    # Score the document once and reuse the result for both checks below.
    doc_scores = tfidf[bow]
    tfidf_ids = {token_id for token_id, _ in doc_scores}
    bow_ids = [token_id for token_id, _ in bow]
    low_value_words = [token_id for token_id, score in doc_scores if score < low_value]
    # Tokens with a TF-IDF score of 0 are absent from the model output.
    # This list is built BEFORE `drops`, so `words` records this document's
    # missing tokens; previously `drops` picked up the list left over from
    # the previous document.
    words_missing_in_tfidf = [token_id for token_id in bow_ids if token_id not in tfidf_ids]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    # Set membership keeps the per-document filter linear in the document length.
    dropped_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in dropped_ids]
    corpus[i] = new_bow


In [12]:
# id2word = corpora.Dictionary(all_texts)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# print (corpus[0][0:20])

# word = id2word[[0][:1][0]]
# print (word)

In [13]:
# Fit a 10-topic LDA model on every document except the last one; the last
# document is used as the test document in the next cell.
lda_model = gensim.models.LdaModel(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)


In [14]:
# The last document was excluded from training (corpus[:-1]); use it as the test document.
test_doc = corpus[-1]

# Indexing the model with a bag-of-words gives (topic_id, weight) pairs.
vector = lda_model[test_doc]
print (vector)

def Sort(sub_li):
    """Reorder ``sub_li`` in place by each item's second element, largest first.

    The same list object is returned. Items with equal second elements end up
    in the reverse of their original relative order.
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    sub_li[:] = ascending[::-1]
    return sub_li
# Sort reorders its argument in place and returns it, so `vector` and
# `new_vector` refer to the same (now sorted) list after this line.
new_vector = Sort(vector)
print (new_vector)

[(0, 0.09429054), (1, 0.09817904), (2, 0.055370737), (3, 0.22694865), (4, 0.14122745), (5, 0.03568427), (6, 0.08367516), (7, 0.100482345), (8, 0.017456938), (9, 0.1466849)]
[(3, 0.22694865), (9, 0.1466849), (4, 0.14122745), (7, 0.100482345), (1, 0.09817904), (0, 0.09429054), (6, 0.08367516), (2, 0.055370737), (5, 0.03568427), (8, 0.017456938)]


In [21]:
import os

# Create the target folder first so the save also works on a fresh checkout
# where "models/" does not exist yet.
os.makedirs("models", exist_ok=True)
lda_model.save("models/20221003_lda.model")

In [22]:
# Reload the model that was just saved.
# NOTE(review): `new_model` is not referenced by any later visible cell.
new_model = gensim.models.ldamodel.LdaModel.load("models/20221003_lda.model")

## Visualizing the Data

In [15]:
# Build the pyLDAvis view of the fitted model; it is displayed in the next cell.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)

  default_term_info = default_term_info.sort_values(


In [16]:
vis

# Responses to Questions Other Than 1053 and 1070

In [56]:
# Reload the full survey export, this time keeping every column so responses
# can be filtered by question id below.
df_resp = pd.read_csv('Data/Response_Comments_Metrics.csv')

In [57]:
df_resp

Unnamed: 0,RESP_ID,RESP_SURVEY_ID,RESP_Q_ID,RESP_POINTS,resp_len,avg_word_len,max_word_len,word_count,RESP_COMMENT
0,13926609,164973,1053,,90,6.0,11,13,Learning new experiences with animation and te...
1,13926622,164973,3131,4.0,102,4.0,10,19,I learned a lot of new techniques and had expe...
2,13926628,164973,3707,3.0,132,5.0,13,22,Goals of immersion and data visualisation were...
3,14109824,187612,1053,,125,4.5,11,22,The interaction between each of my classmate. ...
4,13926629,164973,3708,4.0,46,4.5,9,8,"Came in everyday, very happy with group effort"
...,...,...,...,...,...,...,...,...,...
385703,22675055,350031,1070,,22,6.5,12,3,No improvements needed
385704,22623374,343111,3131,2.0,907,5.0,18,147,Due to recent changes in leadership at the sch...
385705,22675154,350299,1053,,170,5.0,12,27,I liked the tutorial sessions. My tutor was re...
385706,22623378,343111,3709,2.0,1218,5.0,14,203,"Leena was a satisfactory tutor, who seemingly ..."


In [58]:
# Comment text of every response whose question id is neither 1053 nor 1070.
data = df_resp.loc[~df_resp.RESP_Q_ID.isin([1053, 1070]), 'RESP_COMMENT']

In [60]:
# RESP_ID of every response whose question id is neither 1053 nor 1070.
data_resp_id = df_resp.loc[~df_resp.RESP_Q_ID.isin([1053, 1070]), 'RESP_ID']

In [20]:
# Re-run lemmatization and tokenization on the filtered comments. This
# overwrites `lemmatized_texts` and `data_words` from the first half of the notebook.
lemmatized_texts = lemmatization(data)
print (lemmatized_texts[0][0:90])

data_words = gen_words(lemmatized_texts)
print (data_words[0][0:20])



learn lot new technique experience work new screen space great
['learn', 'lot', 'new', 'technique', 'experience', 'work', 'new', 'screen', 'space', 'great']


In [21]:
#BIGRAMS AND TRIGRAMS
# NOTE(review): this cell repeats the earlier bigram/trigram cell line for
# line, now on the filtered `data_words`. It retrains the phrase models and
# redefines make_bigrams / make_trigrams with identical bodies.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to each document in ``texts``."""
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    """Apply ``bigram`` and then ``trigram`` to each document in ``texts``."""
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

# Preview at most the first 20 tokens of the first document.
print (data_bigrams_trigrams[0][0:20])

['learn', 'lot', 'new', 'technique', 'experience', 'work', 'new', 'screen', 'space', 'great']


In [22]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Map every token to an integer id and encode each document as (token_id, count) pairs.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

# Tokens scoring below this TF-IDF weight within a document are removed from that document.
low_value = 0.03
# Token strings removed so far: one entry per (document, token) removal.
words = []
for i in range(len(corpus)):
    bow = corpus[i]
    # Score the document once and reuse the result for both checks below.
    doc_scores = tfidf[bow]
    tfidf_ids = {token_id for token_id, _ in doc_scores}
    bow_ids = [token_id for token_id, _ in bow]
    low_value_words = [token_id for token_id, score in doc_scores if score < low_value]
    # Tokens with a TF-IDF score of 0 are absent from the model output.
    # This list is built BEFORE `drops`, so `words` records this document's
    # missing tokens; previously `drops` picked up the list left over from
    # the previous document.
    words_missing_in_tfidf = [token_id for token_id in bow_ids if token_id not in tfidf_ids]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    # Set membership keeps the per-document filter linear in the document length.
    dropped_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in dropped_ids]
    corpus[i] = new_bow


In [23]:
type(data_bigrams_trigrams)

list

In [24]:
data_bigrams_trigrams

[['learn',
  'lot',
  'new',
  'technique',
  'experience',
  'work',
  'new',
  'screen',
  'space',
  'great'],
 ['goal',
  'immersion',
  'datum_visualisation',
  'achieve',
  'feel',
  'more',
  'focus',
  'put',
  'motion',
  'capture',
  'storytelle'],
 ['come', 'everyday', 'very', 'happy', 'group', 'effort'],
 ['intensive', 'overall', 'positive', 'experience'],
 ['very', 'intelligent', 'engage', 'feedback', 'group'],
 ['content', 'course', 'provide', 'adequacy', 'align', 'objective'],
 ['content', 'deliver', 'provide', 'great', 'deal', 'knowledge', 'learn'],
 ['extremely',
  'friendly',
  'come',
  'even',
  'non',
  'teaching',
  'day',
  'show',
  'commitment',
  'technical',
  'knowledge',
  'course',
  'need'],
 ['high', 'level', 'content', 'case', 'present', 'class', 'learn'],
 ['level',
  'content',
  'deliver',
  'very',
  'well',
  'present',
  'very',
  'engaging',
  'assist',
  'motivation',
  'learn'],
 ['experience',
  'gain',
  'animation',
  'software',
  'entirely

In [25]:
# Fit a 10-topic LDA model on every filtered document except the last one;
# the last document is used as the test document in the next cell.
lda_model = gensim.models.LdaModel(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)


In [26]:
# The last document was excluded from training (corpus[:-1]); use it as the test document.
test_doc = corpus[-1]

# Indexing the model with a bag-of-words gives (topic_id, weight) pairs.
vector = lda_model[test_doc]
print (vector)

# NOTE(review): identical redefinition of the Sort helper from the first half
# of the notebook.
def Sort(sub_li):
    """Sort ``sub_li`` in place by each item's second element, descending, and return it."""
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
# Sort works in place, so `vector` and `new_vector` are the same list afterwards.
new_vector = Sort(vector)
print (new_vector)

[(0, 0.15399261), (1, 0.118609525), (2, 0.04983521), (3, 0.12212789), (4, 0.03810325), (5, 0.15043305), (6, 0.0621007), (7, 0.11237193), (8, 0.124662206), (9, 0.06776361)]
[(0, 0.15399261), (5, 0.15043305), (8, 0.124662206), (3, 0.12212789), (1, 0.118609525), (7, 0.11237193), (9, 0.06776361), (6, 0.0621007), (2, 0.04983521), (4, 0.03810325)]


In [53]:
import os

# Save the model trained without questions 1053 and 1070. The target folder is
# created first so the save also works on a fresh checkout where "models/"
# does not exist yet.
os.makedirs("models", exist_ok=True)
lda_model.save("models/20221003_lda_non10531070.model")

## Visualizing the Data

In [27]:
# Build and display the pyLDAvis view for the model fitted on the filtered responses.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(


## Checking the Model Results

In [28]:
from gensim import corpora,models,similarities

# Index every document by its LDA topic distribution. The indexed vectors live
# in topic space, so their dimensionality is the number of topics rather than
# the vocabulary size (len(id2word)) that was passed before.
index = similarities.SparseMatrixSimilarity(lda_model[corpus], num_features=lda_model.num_topics)

In [30]:
# Use the last document as the query.
# NOTE(review): the index was built from the whole `corpus`, which includes
# this document, so the query matching itself (the last position) is the
# expected top hit. The query is also encoded from the unfiltered token list,
# while `corpus` had low TF-IDF tokens removed - confirm both are intended.
query_document = data_bigrams_trigrams[-1]

query_bow = id2word.doc2bow(query_document)
sims = index[lda_model[query_bow]]
# Position of the most similar indexed document. argmax returns the first
# maximum, which is what sorting every (position, similarity) pair and taking
# the head produced, without the sort.
docNumber = int(np.argmax(sims))
docNumber

77220

In [31]:
query_document

['satisfactory',
 'tutor',
 'inject',
 'valuable',
 'insight',
 'industry',
 'experience',
 'studio',
 'most',
 'time',
 'helpful',
 'provide',
 'good',
 'feedback',
 'min',
 'week',
 'however',
 'external',
 'tutor',
 'only',
 'present',
 'class',
 'week',
 'reduce',
 'time',
 'able',
 'dedicate',
 'studio',
 'also',
 'wish',
 'able',
 'bring',
 'more',
 'industry',
 'specialist',
 'studio',
 'perhaps',
 'create',
 'more',
 'diverse',
 'discussion',
 'studio',
 'space']

In [40]:
# Highest-weighted topic for the query: sort the (topic_id, weight) pairs by
# weight, descending, and take the id of the first pair.
topic_id = sorted(lda_model[query_bow], key=lambda x: x[1], reverse=True)[0][0]

In [41]:
topic_id

0

In [63]:
type(data_resp_id)

pandas.core.series.Series

In [None]:
def getTopicByRespId(query_resp_id, data_resp_id, data_bigrams_trigrams, lda_model):
    """Look up one response by RESP_ID and return its highest-weighted LDA topic id.

    Parameters
    ----------
    query_resp_id
        RESP_ID value to look up.
    data_resp_id : pandas.Series
        RESP_ID values; position ``k`` is used to index ``data_bigrams_trigrams``.
    data_bigrams_trigrams : list of list of str
        Tokenized documents, positionally aligned with ``data_resp_id``.
    lda_model
        Object that yields ``(topic_id, weight)`` pairs when indexed with a
        bag-of-words.

    Returns
    -------
    The topic id with the largest weight for the first matching document.

    Raises
    ------
    IndexError
        If ``query_resp_id`` does not occur in ``data_resp_id``.

    Notes
    -----
    Besides its arguments, this function reads the notebook-level names
    ``id2word`` (to encode the document) and ``data`` (to print the raw
    comment), and it prints diagnostic output on every call.
    """
    # Compare once and reuse the mask for both the label-based and the
    # positional lookup.
    is_match = data_resp_id.to_numpy() == query_resp_id
    hit_positions = np.flatnonzero(is_match)
    if hit_positions.size == 0:
        raise IndexError(
            "RESP_ID {!r} not found in data_resp_id".format(query_resp_id)
        )
    doc_posn = data_resp_id[is_match]
    doc_id = hit_positions[0]
    query_document = data_bigrams_trigrams[doc_id]

    query_bow = id2word.doc2bow(query_document)
    topic_id = sorted(lda_model[query_bow], key=lambda x: x[1], reverse=True)[0][0]
    print(topic_id, doc_posn, query_document)
    # `data` keeps the original DataFrame labels, so look the comment up by label.
    print(data[doc_posn.index.tolist()[0]])
    return topic_id
    
# Print the dominant topic for every filtered response.
# NOTE(review): each call rescans `data_resp_id` to locate the id, so this loop
# is quadratic in the number of responses and prints several lines per response.
for query_resp_id in data_resp_id:
    o_topic = getTopicByRespId(
        query_resp_id=query_resp_id,
        data_resp_id=data_resp_id,
        data_bigrams_trigrams=data_bigrams_trigrams,
        lda_model=lda_model,
    )
    print(query_resp_id, o_topic)

5 1    13926622
Name: RESP_ID, dtype: int64 ['learn', 'lot', 'new', 'technique', 'experience', 'work', 'new', 'screen', 'space', 'great']
I learned a lot of new techniques and had experience working with a new screen space, which was great.
13926622 5
5 2    13926628
Name: RESP_ID, dtype: int64 ['goal', 'immersion', 'datum_visualisation', 'achieve', 'feel', 'more', 'focus', 'put', 'motion', 'capture', 'storytelle']
Goals of immersion and data visualisation were achieved, where I felt more focus could be put on motion capture, VR and storytelling
13926628 5
3 4    13926629
Name: RESP_ID, dtype: int64 ['come', 'everyday', 'very', 'happy', 'group', 'effort']
Came in everyday, very happy with group effort
13926629 3
5 5    13926630
Name: RESP_ID, dtype: int64 ['intensive', 'overall', 'positive', 'experience']
It was intensive but it was an overall positive experience
13926630 5
5 7    13926631
Name: RESP_ID, dtype: int64 ['very', 'intelligent', 'engage', 'feedback', 'group']
Very intellige

In [191]:
np.where(data_resp_id.to_numpy()==13926631)[0][0]

4

In [200]:
data_resp_id

1         13926622
2         13926628
4         13926629
5         13926630
7         13926631
            ...   
385685    22623225
385686    22623226
385704    22623374
385706    22623378
385707    22623379
Name: RESP_ID, Length: 77221, dtype: int64

In [125]:
data

1         I learned a lot of new techniques and had expe...
2         Goals of immersion and data visualisation were...
4            Came in everyday, very happy with group effort
5         It was intensive but it was an overall positiv...
7         Very intelligent and engages with feedback and...
                                ...                        
385685    never is all my life of education have i had s...
385686    shit teacher. contradicts himself often. has a...
385704    Due to recent changes in leadership at the sch...
385706    Leena was a satisfactory tutor, who seemingly ...
385707    James was a satisfactory tutor, and injected v...
Name: RESP_COMMENT, Length: 77221, dtype: object