# <center>Topic Modeling Using LDA (Latent Dirichlet Allocation) in Python</center>

## Importing the Required Libraries

In [19]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

#vis
#! pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from tqdm import tqdm 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preparing the Data

In [4]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's JSON.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into a UTF-8 file.

    Args:
        file: Path of the file to write; it is opened in "w" mode.
        data: JSON-serializable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)


In [7]:
# Fetch the English stop-word list through the fully qualified corpus reader.
# The name `stopwords` is rebound from the imported reader to a plain list here,
# so calling `stopwords.words(...)` would fail if this cell were executed twice;
# going through `nltk.corpus.stopwords` keeps the cell safe to re-run.
stopwords = nltk.corpus.stopwords.words("english")

In [8]:
# Show the stop-word list that was just loaded.
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Colab-specific: mount Google Drive at /content/drive so that paths under
# that directory can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Keep only the value stored under the "texts" key of the JSON file.
data = load_data(
    "/content/drive/MyDrive/Advanced_Machine_Learning/LDA/ushmm_dn.json"
)["texts"]

# Quick look: number of entries, then the first 90 characters of entry 10.
print(len(data))
print(data[10][:90])


252
 My name is Helene Baraf, named Zupnik. I am born in Antwerp, Belgium, the 24th of July 19


In [21]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Reduce each text to the space-joined lemmas of its selected tokens.

    Args:
        texts: Iterable of raw document strings.
        allowed_postags: spaCy coarse part-of-speech tags (``token.pos_``
            values) to keep; tokens carrying any other tag are dropped.
            The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        A list with one string per input text, holding the lemmas of the
        kept tokens separated by single spaces.
    """
    # The parser and NER components are disabled when the model is loaded.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # The length is only needed for the progress bar; inputs without a
    # length (e.g. generators) get an unsized bar instead.
    total = len(texts) if hasattr(texts, "__len__") else None

    texts_out = []
    # nlp.pipe streams the texts through the pipeline rather than issuing
    # one separate nlp() call per document.
    for doc in tqdm(nlp.pipe(texts), total=total):
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(lemmas))
    return texts_out
# Lemmatize every loaded text, then preview the first 90 characters of entry 10.
lemmatized_texts = lemmatization(data)
print(lemmatized_texts[10][:90])

100%|██████████| 252/252 [02:16<00:00,  1.85it/s]

name name bear 24th start tell little bit ancestor come very famous branch jewish religion





In [22]:
def gen_words(texts):
    """Tokenize each text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of document strings.

    Returns:
        A list containing, for each input text, the list of tokens returned
        by ``simple_preprocess`` with ``deacc=True``.
    """
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in tqdm(texts)
    ]

# Tokenize the lemmatized texts; the empty print() separates the progress bar
# from the preview of the first 20 tokens of entry 10.
data_words = gen_words(lemmatized_texts)
print()
print(data_words[10][:20])

100%|██████████| 252/252 [00:01<00:00, 159.41it/s]


['name', 'name', 'bear', 'th', 'start', 'tell', 'little', 'bit', 'ancestor', 'come', 'very', 'famous', 'branch', 'jewish', 'religion', 'great', 'grand', 'uncle', 'synagogue', 'name']





In [23]:
# Build the token <-> integer-id mapping from the tokenized documents.
id2word = corpora.Dictionary(data_words)

# Convert every tokenized document to its bag-of-words representation
# (the preview below shows (token_id, count) pairs).
corpus = [id2word.doc2bow(tokens) for tokens in tqdm(data_words)]
print()
print(corpus[10][0:20])

# Look up the token whose dictionary id is 10.
# NOTE(review): if the goal was "the first token of document 10", the lookup
# should be id2word[corpus[10][0][0]] instead - confirm the intent.
word = id2word[10]
print(word)

100%|██████████| 252/252 [00:00<00:00, 893.95it/s]


[(1, 4), (10, 1), (12, 1), (20, 5), (21, 1), (24, 7), (25, 2), (26, 1), (33, 2), (38, 2), (39, 2), (54, 2), (56, 5), (60, 1), (61, 1), (65, 3), (66, 18), (67, 2), (78, 6), (79, 2)]
afternoon





In [29]:
# Fit the LDA topic model on the bag-of-words corpus built above.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,       # number of topics to extract
    random_state=100,    # fixed seed so repeated runs give the same model
    update_every=1,      # update the model after every chunk
    chunksize=100,       # documents per training chunk
    passes=10,           # full passes over the corpus
    alpha="auto",        # learn the document-topic prior from the data
)


## Visualizing the Data

In [30]:
# Let pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualization data for the trained model; the bare `vis` on the
# last line makes the notebook display it.
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(
