In [1]:
# !pip install pyLDAvis
# !pip install gensim

In [2]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [4]:
# LoadDataset
df=pd.read_json('newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [5]:
df.shape

(11314, 3)

### Initial Data Cleaning

In [6]:
# Convert to list 
data = df.content.values.tolist()  
# Remove Emails 
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]  

  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]


In [7]:
# Remove new line characters 
data = [re.sub('\s+', ' ', sent) for sent in data]  

  data = [re.sub('\s+', ' ', sent) for sent in data]


In [8]:
# Remove distracting single quotes 
data = [re.sub("\'", "", sent) for sent in data]  
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


### Tokenization

In [9]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


### Build the bigram and trigram models

In [10]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [11]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


### Define function for stopwords, bigrams, trigrams and lemmatization

In [12]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [14]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [15]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



In [16]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['s', 'thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [17]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  

In [18]:
# Create Corpus 
texts = data_lemmatized 

In [19]:
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  

In [20]:
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]]


In [21]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('organization', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('s', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('umd', 1),
  ('wonder', 1),
  ('year', 1)]]

In [22]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.032*"sale" + 0.030*"physical" + 0.019*"server" + 0.016*"item" + '
  '0.014*"sell" + 0.014*"wing" + 0.012*"box" + 0.012*"direct" + 0.012*"nhl" + '
  '0.011*"recommend"'),
 (1,
  '0.022*"line" + 0.011*"organization" + 0.011*"use" + 0.010*"nntp_poste" + '
  '0.010*"host" + 0.010*"get" + 0.009*"system" + 0.008*"drive" + 0.008*"need" '
  '+ 0.007*"thank"'),
 (2,
  '0.015*"write" + 0.014*"say" + 0.011*"people" + 0.011*"think" + '
  '0.011*"article" + 0.010*"know" + 0.010*"make" + 0.009*"line" + 0.008*"go" + '
  '0.007*"see"'),
 (3,
  '0.014*"line" + 0.014*"go" + 0.014*"get" + 0.014*"year" + 0.013*"team" + '
  '0.013*"game" + 0.012*"car" + 0.009*"good" + 0.009*"organization" + '
  '0.009*"write"'),
 (4,
  '0.575*"ax" + 0.057*"_" + 0.018*"c" + 0.010*"cx" + 0.008*"rlk" + 0.005*"sj" '
  '+ 0.005*"m" + 0.003*"mf" + 0.003*"nei" + 0.003*"mu"'),
 (5,
  '0.015*"government" + 0.014*"gun" + 0.014*"child" + 0.013*"kill" + '
  '0.013*"law" + 0.011*"state" + 0.011*"people" + 0.009*"death" + '
  

In [24]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.341446189001056

Coherence Score:  0.46851123719778365


### Visualize the topics

In [25]:
vis = gensimvis.prepare(lda_model, corpus, id2word)

  default_term_info = default_term_info.sort_values(


In [26]:
pyLDAvis.display(vis)

In [27]:
pyLDAvis.prepared_data_to_html(vis)

'\n<link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v1.0.0.css">\n\n\n<div id="ldavis_el243223746034450723168361166"></div>\n<script type="text/javascript">\n\nvar ldavis_el243223746034450723168361166_data = {"mdsDat": {"x": [0.25882677308780844, 0.23734374228651361, 0.1890152912370282, 0.020535589519430384, -0.06310820524604746, -0.263690236327624, -0.13245329022693003, -0.24646966433017878], "y": [-0.01947269573664832, -0.10329349879140184, -0.07702978975804965, 0.07586729099069255, 0.25866822796147226, -0.19060152281454157, 0.19638022894950954, -0.14051824080103292], "topics": [1, 2, 3, 4, 5, 6, 7, 8], "cluster": [1, 1, 1, 1, 1, 1, 1, 1], "Freq": [35.00731076821505, 26.70484975746512, 11.16420589057515, 9.462275538647946, 6.7352761257162355, 4.5596484263034975, 4.279900382054066, 2.086533111022935]}, "tinfo": {"Term": ["ax", "_", "people", "line", "go", "get", "say", "think", "write", "team", "year", "nntp_poste", "gam

In [39]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1,0.4328,"line, organization, use, nntp_poste, host, get...","[s, thing, car, nntp_poste, host, umd, organiz..."
1,1,1,0.3592,"line, organization, use, nntp_poste, host, get...","[poll, final, call, summary, final, call, cloc..."
2,2,1,0.4046,"line, organization, use, nntp_poste, host, get...","[question, organization, purdue_university, en..."
3,3,1,0.5036,"line, organization, use, nntp_poste, host, get...","[system, division, line, nntp_poste, host, amb..."
4,4,1,0.3633,"line, organization, use, nntp_poste, host, get...","[question, organization, smithsonian_astrophys..."
5,5,2,0.606,"write, say, people, think, article, know, make...","[foxvog_dougla, reword, second_amendment, idea..."
6,6,1,0.3513,"line, organization, use, nntp_poste, host, get...","[man, brain, tumor, treatment, line, people, r..."
7,7,1,0.7824,"line, organization, use, nntp_poste, host, get...","[las_cruce, nm, line, nntp_poste, host, dante_..."
8,8,1,0.4591,"line, organization, use, nntp_poste, host, get...","[win, icon, help, please, organization, line, ..."
9,9,1,0.67,"line, organization, use, nntp_poste, host, get...","[sigma_design, double, article, pellettiere, w..."


In [54]:
df_dominant_topic2 = df_dominant_topic[df_dominant_topic.Dominant_Topic != 6]
df_dominant_topic2

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1,0.4328,"line, organization, use, nntp_poste, host, get...","[s, thing, car, nntp_poste, host, umd, organiz..."
1,1,1,0.3592,"line, organization, use, nntp_poste, host, get...","[poll, final, call, summary, final, call, cloc..."
2,2,1,0.4046,"line, organization, use, nntp_poste, host, get...","[question, organization, purdue_university, en..."
3,3,1,0.5036,"line, organization, use, nntp_poste, host, get...","[system, division, line, nntp_poste, host, amb..."
4,4,1,0.3633,"line, organization, use, nntp_poste, host, get...","[question, organization, smithsonian_astrophys..."
...,...,...,...,...,...
11309,11309,2,0.5004,"write, say, people, think, article, know, make...","[scan, organization, reply, line, dn, neurolog..."
11310,11310,1,0.6471,"line, organization, use, nntp_poste, host, get...","[screen, death, line, organization, problem, s..."
11311,11311,1,0.6391,"line, organization, use, nntp_poste, host, get...","[este, mount, cpu, cool, vertical, case, organ..."
11312,11312,1,0.4841,"line, organization, use, nntp_poste, host, get...","[point, organization, central, research, line,..."


In [55]:
df_dominant_topic2.to_csv('LDA_topics_lda', index = False)

#### LSI MODEL

In [59]:
lsi_model = gensim.models.LsiModel(corpus=corpus, num_topics = 8, id2word = id2word)
lsi_model.show_topics(num_topics=8)

[(0,
  '1.000*"ax" + 0.001*"mf" + 0.001*"pl_pl" + 0.001*"m" + 0.001*"wm" + 0.001*"giz_giz" + 0.000*"mi" + 0.000*"fq" + 0.000*"wt" + 0.000*"pmfq"'),
 (1,
  '0.238*"say" + 0.200*"file" + 0.194*"go" + 0.178*"get" + 0.166*"people" + 0.166*"know" + 0.143*"make" + 0.133*"see" + 0.132*"use" + 0.129*"also"'),
 (2,
  '-0.395*"file" + 0.333*"say" + 0.251*"go" + -0.171*"image" + 0.163*"know" + 0.161*"people" + -0.158*"program" + 0.138*"think" + -0.137*"available" + 0.137*"s"'),
 (3,
  '0.555*"file" + 0.335*"entry" + -0.173*"system" + 0.131*"say" + -0.126*"available" + 0.122*"output" + -0.122*"use" + 0.114*"program" + -0.106*"also" + 0.102*"printf"'),
 (4,
  '-0.845*"_" + -0.403*"c" + -0.198*"cx" + -0.115*"sc" + -0.098*"d" + -0.070*"i" + -0.069*"sj" + -0.061*"gc" + -0.058*"m" + -0.056*"sy"'),
 (5,
  '-0.397*"image" + 0.163*"wire" + -0.160*"color" + -0.159*"jpeg" + -0.154*"say" + 0.144*"entry" + -0.140*"go" + -0.139*"available" + -0.139*"version" + 0.138*"privacy"'),
 (6,
  '-0.346*"wire" + 0.204*"

In [60]:
# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score: ', coherence_lsi)


Coherence Score:  0.5152303719209037


In [69]:
def format_topics_sentences2(lsimodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(lsimodel[corpus]):
        row = row_list[0] if lsimodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lsimodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences2(lsimodel=lsi_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic3 = df_topic_sents_keywords.reset_index()
df_dominant_topic3.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic3.head(10)

AttributeError: 'LsiModel' object has no attribute 'per_word_topics'

In [70]:
df_dominant_topic3.to_csv('LDA_topics_lsi', index = False)

In [65]:
hdp_model =  gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word)
hdp_model.show_topics()

[(0,
  '0.010*line + 0.008*write + 0.006*get + 0.006*article + 0.006*say + 0.005*organization + 0.005*know + 0.005*people + 0.005*go + 0.005*make + 0.005*think + 0.004*well + 0.004*time + 0.004*use + 0.004*good + 0.004*see + 0.004*nntp_poste + 0.004*also + 0.003*host + 0.003*work'),
 (1,
  '0.786*ax + 0.001*pl_pl + 0.001*mf + 0.001*m + 0.001*wm + 0.001*giz_giz + 0.000*_ + 0.000*mi + 0.000*bxlt + 0.000*wt + 0.000*rlk + 0.000*part + 0.000*gq + 0.000*mb + 0.000*giz + 0.000*fq + 0.000*gizwt + 0.000*pmf + 0.000*sl + 0.000*pne'),
 (2,
  '0.012*line + 0.006*organization + 0.006*write + 0.005*get + 0.004*nntp_poste + 0.004*article + 0.004*host + 0.004*driver + 0.004*use + 0.004*know + 0.004*run + 0.003*problem + 0.003*drive + 0.003*window + 0.003*system + 0.003*do + 0.003*work + 0.003*think + 0.003*need + 0.003*make'),
 (3,
  '0.010*line + 0.006*write + 0.005*organization + 0.004*get + 0.004*nntp_poste + 0.004*article + 0.004*host + 0.004*know + 0.003*file + 0.003*go + 0.002*use + 0.002*say + 

In [32]:
# Compute Coherence Score
coherence_model_hdpmodel = CoherenceModel(model=hdpmodel, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_hdpmodel = coherence_model_hdpmodel.get_coherence()
print('\nCoherence Score: ', coherence_hdpmodel)


Coherence Score:  0.5107970671911714


In [72]:
def format_topics_sentences3(hdpmodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(hdpmodel[corpus]):
        row = row_list[0] if hdpmodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = hdpmodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences3(hdpmodel=hdp_model, corpus=corpus, texts=data_lemmatized)

# Format
df_dominant_topic4 = df_topic_sents_keywords.reset_index()
df_dominant_topic4.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic4.head(10)

AttributeError: 'HdpModel' object has no attribute 'per_word_topics'