In [1]:
# General Libraries

import numpy as np
import json
import glob

# Gensim Libraries

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# SpaCy and NLTK Libraries

import spacy
from nltk.corpus import stopwords

# Vis Libraries

import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Loading and Writing JSON Files

def load_data(file):
    """Parse the UTF-8 encoded JSON file at ``file`` and return its contents.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the JSON document.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def write_data(file, data):
    """Serialise ``data`` as JSON (4-space indent) into the UTF-8 file ``file``.

    Args:
        file: Destination path; an existing file is overwritten.
        data: JSON-serialisable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)



In [2]:
# Add Stopwords

# Import the NLTK corpus reader under an alias: binding the word list to the
# name `stopwords` otherwise shadows the reader, and re-running this cell then
# fails with AttributeError because `stopwords` is already a list.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words("english")

# Only the "texts" entry of the JSON document is used by the rest of the notebook.
data = load_data("C:/Users/LENOVO/Desktop/python_scripts/old files/ushmm_dn.json")["texts"]

# Lemmatization

def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each text to the lemmas of tokens with an allowed part of speech.

    The ``en_core_web_sm`` spaCy pipeline is loaded on every call with the
    ``parser`` and ``ner`` components disabled.

    Args:
        texts: Iterable of raw text strings.
        allowed_postags: Coarse POS tags (``token.pos_``) whose tokens are kept.

    Returns:
        A list with one string per input text: the kept lemmas joined by
        single spaces, in document order.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    lemmatized = []
    for raw_text in texts:
        kept_lemmas = [
            tok.lemma_ for tok in nlp(raw_text) if tok.pos_ in allowed_postags
        ]
        lemmatized.append(" ".join(kept_lemmas))
    return lemmatized

# Lemmatize the whole corpus and preview the first 90 characters of document 0.
lemmatized_texts = lemmatization(data)
print(lemmatized_texts[0][:90])

def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Args:
        texts: Iterable of strings.

    Returns:
        A list containing one token list per input text.
    """
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in texts
    ]

# Tokenise the lemmatized texts and preview the first 20 tokens of document 0.
data_words = gen_words(lemmatized_texts)
print(data_words[0][:20])


name bear small town call bear very hard work child father mother small mill flour buckwhe
['name', 'bear', 'small', 'town', 'call', 'bear', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school']


In [3]:

# Bigrams and Trigrams

# Fit the phrase detectors: the bigram model on the tokenised documents, the
# trigram model on those same documents after the bigram model is applied.
bigram_phrases = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram_phrases = gensim.models.Phrases(
    bigram_phrases[data_words],
    threshold=100,
)

# Wrap each fitted model in a Phraser; these are what the helper functions use.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one bigram-transformed token list per input document.
        A list (rather than a generator) is returned so the result can be
        indexed, measured and iterated more than once, matching
        ``make_trigrams``.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the ``bigram`` and then the ``trigram`` phraser to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one transformed token list per input document.
    """
    merged_docs = []
    for tokens in texts:
        # The bigram phraser runs here as well, so input that has already been
        # through make_bigrams gets the bigram transformation a second time.
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

# Run the phrase models over the tokenised corpus; the trigram step is fed the
# output of the bigram step.
data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

# Show the full token list of the first document.
print(data_bigrams_trigrams[0])




['name', 'bear', 'small', 'town', 'call', 'bear', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school', 'public', 'school', 'morning', 'afternoon', 'go', 'religious', 'school', 'almost', 'late', 'night', 'raise', 'spirit', 'school', 'little', 'city', 'segregate', 'mind', 'small', 'town', 'say', 'majority', 'people', 'small', 'town', 'jewish', 'people', 'town', 'somehow', 'know', 'separate', 'jewish', 'child', 'catholic', 'child', 'know', 'most', 'people', 'catholic', 'use', 'friend', 'feel', 'maybe', 'personally', 'know', 'lot', 'incident', 'small', 'little', 'call', 'separate', 'other', 'word', 'hardly', 'get', 'together', 'incident', 'incident', 'pleasant', 'incident', 'call', 'house', 'people', 'regardless', 'religious', 'believe', 'really', 'religious', 'people', 'other', 'lovely', 'family', 'city', 'even', 'though', 'time', 'go', 'underground', 'religious', 'institution', 'parent', 'say', 'very', 'r

In [4]:
# TF-IDF Removal

from gensim.models import TfidfModel

# Map every token/phrase to an integer id.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

# Bag-of-words representation: one list of (token_id, count) pairs per document.
corpus = [id2word.doc2bow(document) for document in texts]

print(corpus[0][:20])

# TF-IDF weights fitted on the bag-of-words corpus.
tfidf = TfidfModel(corpus, id2word=id2word)

# Remove from every document the terms whose TF-IDF weight is below `low_value`
# and the terms that the TF-IDF model leaves out of its output altogether.
# `words` collects the surface forms of everything that was dropped.
low_value = 0.03
words = []
words_missing_in_tfidf = []
for i in range(len(corpus)):
    bow = corpus[i]
    # Transform once and reuse the result for both lookups below.
    weighted_bow = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in weighted_bow}
    low_value_words = [
        term_id for term_id, weight in weighted_bow if weight < low_value
    ]
    # Ids present in the bag-of-words but absent from the TF-IDF output. This
    # must be computed BEFORE building `drops`; previously `drops` used the
    # list left over from the preceding document, so `words` logged the wrong
    # terms.
    words_missing_in_tfidf = [
        term_id for term_id, _ in bow if term_id not in tfidf_ids
    ]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set membership keeps the filtering linear in the document length.
    dropped_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in dropped_ids]

[(0, 1), (1, 1), (2, 11), (3, 1), (4, 2), (5, 1), (6, 2), (7, 1), (8, 2), (9, 3), (10, 1), (11, 12), (12, 1), (13, 8), (14, 1), (15, 2), (16, 1), (17, 3), (18, 2), (19, 1)]


In [5]:
#id2word = corpora.Dictionary(data_words)
#
#corpus = []
#for text in data_words:
#    new = id2word.doc2bow(text)
#    corpus.append(new)
#
#print(corpus[0][0:20])
#
#word = id2word[0][:1][0]
#print (word)

# Creating the LDA Model

# Train on every document except the last one, which later cells use as an
# unseen test document. `random_state` is fixed for repeatable runs.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)



In [7]:
# The last document was excluded from training (see `corpus[:-1]` above).
test_doc = corpus[-1]

# Topic distribution of the held-out document as (topic_id, probability) pairs.
vector = lda_model[test_doc]
print(vector)

def Sort(sub_li):
    """Order ``sub_li`` in place by each item's second element, largest first.

    The list is sorted ascending and then reversed, so items with equal keys
    end up in the reverse of their original relative order.

    Args:
        sub_li: List of indexable items (e.g. ``(topic_id, probability)``).

    Returns:
        The same list object, now reordered.
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    sub_li[:] = ascending[::-1]
    return sub_li
# Sort sorts in place, so `vector` and `new_vector` refer to the same list.
new_vector = Sort(vector)
print(new_vector)

[(0, 0.021716643), (1, 0.06821019), (2, 0.019167583), (3, 0.12570667), (4, 0.10546406), (5, 0.27726611), (6, 0.13618551), (7, 0.1983748), (9, 0.047813557)]
[(5, 0.27726611), (7, 0.1983748), (6, 0.13618551), (3, 0.12570667), (4, 0.10546406), (1, 0.06821019), (9, 0.047813557), (0, 0.021716643), (2, 0.019167583)]


In [8]:
# Persist the trained model so it can be reloaded without retraining.
lda_model.save(
    "C:/Users/LENOVO/Desktop/python_scripts/old files/models/ldamodel.model"
)

In [9]:
# Reload the model written by the previous cell.
# NOTE(review): gensim model files are pickle-based; only load files from a
# trusted source.
new_model = gensim.models.ldamodel.LdaModel.load(
    "C:/Users/LENOVO/Desktop/python_scripts/old files/models/ldamodel.model"
)

In [10]:
# Same held-out document as before, this time scored with the reloaded model.
test_doc = corpus[-1]

vector = new_model[test_doc]
print(vector)

def Sort(sub_li):
    """Order ``sub_li`` in place by each item's second element, largest first.

    The list is sorted ascending and then reversed, so items with equal keys
    end up in the reverse of their original relative order.

    Args:
        sub_li: List of indexable items (e.g. ``(topic_id, probability)``).

    Returns:
        The same list object, now reordered.
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    sub_li[:] = ascending[::-1]
    return sub_li
# Sort sorts in place, so `vector` and `new_vector` refer to the same list.
new_vector = Sort(vector)
print(new_vector)

[(0, 0.021717155), (1, 0.06821198), (2, 0.019167596), (3, 0.1257088), (4, 0.105463296), (5, 0.27726662), (6, 0.13618359), (7, 0.19837348), (9, 0.0478126)]
[(5, 0.27726662), (7, 0.19837348), (6, 0.13618359), (3, 0.1257088), (4, 0.105463296), (1, 0.06821198), (9, 0.0478126), (0, 0.021717155), (2, 0.019167596)]


In [12]:
# Data Visualization

pyLDAvis.enable_notebook()

# Build the interactive view from the trained model, the full (filtered)
# corpus and the dictionary; the bare `vis` on the last line renders it.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

