In [1]:
import pandas as pd
from gensim import models,corpora
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel
import warnings
%matplotlib inline

In [2]:
# Notebook-wide display configuration.
# Widen text columns so the per-topic word sets are not truncated in tables.
pd.options.display.max_colwidth = 400
# Let pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()
# Silence every warning from here on (blanket filter).
warnings.filterwarnings('ignore')

In [2]:
# Restore the artifacts persisted by an earlier training run:
# the serialized corpus, the token dictionary and the trained LDA model.
doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm2')
dictionary = corpora.Dictionary.load('dictionary2')

# Alias of the model class whose `load` is used to read the model back.
Lda = models.LdaMulticore
lda_final = Lda.load('lda_final2')

In [15]:
# Build a table of every topic's top words and its coherence score.
#
# `top_topics` returns (word list, coherence) pairs ordered by decreasing
# coherence, i.e. without topic ids, whereas `show_topics` returns
# (topic id, word list) pairs. The two are joined on the set of top words.
TOP_N_WORDS = 10  # words per topic, used for both the display and the score

num_topics = lda_final.num_topics

# Request every topic of the loaded model (not a hard-coded 50) so that no
# topic is missing from the table when the model has a different size.
topics_by_id = lda_final.show_topics(
    num_topics=num_topics, formatted=False, num_words=TOP_N_WORDS
)
topics_by_coherence = lda_final.top_topics(
    doc_term_matrix, dictionary=dictionary, topn=TOP_N_WORDS
)

# word set -> coherence score; entries are (probability, word) pairs here.
wordset2cs = {
    frozenset(word for _, word in scored_words): coherence
    for scored_words, coherence in topics_by_coherence
}

topic2skilla = {}  # 1-based topic number -> set of top words
topic2csa = {}     # 1-based topic number -> coherence score
for topic_id, word_probs in topics_by_id:
    words = {word for word, _ in word_probs}
    topic2skilla[topic_id + 1] = words
    # A topic without a matching word set gets NaN instead of silently
    # shifting the scores of all following topics by one row.
    topic2csa[topic_id + 1] = wordset2cs.get(frozenset(words), float('nan'))

# Build all columns from the same key order so rows cannot get misaligned.
finalData = pd.DataFrame({
    'Topic': [f'Topic{topic_no}' for topic_no in topic2skilla],
    'words': list(topic2skilla.values()),
    'cs': [topic2csa[topic_no] for topic_no in topic2skilla],
})
finalData = finalData.sort_values(by='cs', ascending=False)
finalData.to_csv('CoherenceScore.csv')
finalData


Unnamed: 0,Topic,words,cs
15,Topic16,"{people, place, food, wait, sit, come, table, service, minute, time}",-1.141726
24,Topic25,"{restaurant, place, food, eat, come, love, try, good, time, order}",-1.284114
3,Topic4,"{restaurant, food, great, menu, good, dinner, dish, experience, service, time}",-1.298012
19,Topic20,"{place, food, like, come, try, fish, good, fry, burger, order}",-1.335788
22,Topic23,"{place, food, come, great, delicious, good, vegas, drink, service, order}",-1.340198
45,Topic46,"{place, food, service, come, great, good, server, experience, amazing, time}",-1.355201
26,Topic27,"{walk, place, food, like, come, good, hotel, room, time, stay}",-1.360177
41,Topic42,"{food, wait, come, good, server, drink, service, minute, time, order}",-1.372793
29,Topic30,"{food, like, come, try, good, shake, fry, cheese, burger, time}",-1.409361
17,Topic18,"{bad, place, food, eat, come, try, good, chicken, time, order}",-1.411977


In [16]:
# Take the topic count from the loaded model so the exported file name
# always matches the model that was actually visualised.
num_topics = lda_final.num_topics
# sort_topics=False: presumably keeps pyLDAvis from re-ordering topics so
# its numbering follows the model's topic order - confirm in pyLDAvis docs.
vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.save_html(vis, f'pyLDAvis_{num_topics}.html')
vis

In [24]:
from random import sample

import scipy as scp
from scipy.cluster import hierarchy as sch
from scipy import spatial as scs

# Lambda matrix from the model state; its first axis is iterated below as
# the range of topic ids.
topic_dist = lda_final.state.get_lambda()

# Number of top words that represent each topic.
num_words = 20

# Maximum number of terms shown in a hover annotation.
n_ann_terms = 7


def _top_word_set(topic_id):
    """Return the set of the `num_words` highest-ranked words of one topic."""
    return {word for word, _ in lda_final.show_topic(topic_id, topn=num_words)}


# One word set per topic, indexed by topic id.
topic_terms = list(map(_top_word_set, range(topic_dist.shape[0])))

In [37]:
# Link two topics when their top-word sets overlap strongly.

# Two topics are connected when they share MORE than this many top words.
min_shared_terms = 5

t_size = topic_dist.shape[0]
# Top-`num_words` word set of every topic, indexed by topic id.
fst_topics = [{w for (w, _) in lda_final.show_topic(topic, topn=num_words)} for topic in range(t_size)]

edges = []
for topic1 in range(t_size):
    # Only visit pairs with topic2 < topic1: each unordered pair is seen
    # once and a topic is never compared with itself.
    for topic2 in range(topic1):
        # Test the threshold on the full intersection, not on a list that
        # was truncated to the annotation length.
        shared_terms = fst_topics[topic1] & fst_topics[topic2]
        if len(shared_terms) > min_shared_terms:
            edges.append((topic1, topic2))

In [38]:
import plotly.offline as py
# NOTE(review): the star import supplies the graph-object classes
# (e.g. Scatter, Figure, Layout) that later cells use unqualified.
from plotly.graph_objs import *
import numpy as np
import networkx as nx

py.init_notebook_mode(connected=True)

# Fixed seed so the force-directed layout is the same on every run.
LAYOUT_SEED = 42

# One node per topic id; edges connect topics with overlapping top words.
G = nx.Graph()
G.add_nodes_from(range(t_size))
G.add_edges_from(edges)

# node -> (x, y) coordinates
graph_pos = nx.spring_layout(G, seed=LAYOUT_SEED)



In [39]:
# Number of points sampled along each edge; every point carries the edge's
# hover text (presumably so the text is reachable along the whole line).
N_EDGE_POINTS = 10


def _edge_annotation(first, second):
    """Return the hover text for the edge between two topics.

    The text lists up to `n_ann_terms` words shared by both topics ("+++")
    and up to `n_ann_terms` words found in only one of them ("---").
    Words are sorted so the selection is the same on every run instead of
    depending on set iteration order.
    """
    shared = sorted(fst_topics[first] & fst_topics[second])[:n_ann_terms]
    differing = sorted(
        fst_topics[first].symmetric_difference(fst_topics[second])
    )[:n_ann_terms]
    return f"+++: {shared}<br>---: {differing}"


# Collect all edge coordinates first, then build the trace in one go.
edge_x, edge_y, edge_text = [], [], []
for first, second in G.edges():
    x0, y0 = graph_pos[first]
    x1, y1 = graph_pos[second]
    # The trailing None separates this edge's segment from the next one.
    edge_x += list(np.linspace(x0, x1, N_EDGE_POINTS)) + [None]
    edge_y += list(np.linspace(y0, y1, N_EDGE_POINTS)) + [None]
    edge_text += [_edge_annotation(first, second)] * N_EDGE_POINTS + [None]

edge_trace = Scatter(
    x=edge_x,
    y=edge_y,
    text=edge_text,
    line=dict(width=0.05, color='#888'),
    hoverinfo='text',
    mode='lines')

# Node coordinates, in the iteration order of G.nodes().
node_x, node_y = [], []
for node in G.nodes():
    x, y = graph_pos[node]
    node_x.append(x)
    node_y.append(y)

# `text` and `marker.color` start empty; a later cell fills them per node.
node_trace = Scatter(
    x=node_x,
    y=node_y,
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # 'YlGnBu' (lower-case L) is the Yellow-Green-Blue colour scale.
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=2)))

In [40]:
# Colour every node by its number of connections and attach its hover text.
node_colors = []
node_texts = []
# G.adjacency() yields (node, neighbours) pairs; unpack them directly so the
# count is the number of neighbours, and visit every node including the last.
for node, neighbours in G.adjacency():
    n_connections = len(neighbours)
    node_colors.append(n_connections)
    # Sorted so the displayed terms are the same on every run.
    node_terms = sorted(fst_topics[node])[:n_ann_terms]
    node_texts.append('# of connections: ' + str(n_connections) + '<br>' + str(node_terms))

# Assign rather than append so re-running this cell does not duplicate entries.
node_trace['marker']['color'] = node_colors
node_trace['text'] = node_texts

In [41]:
# NOTE(review): repeats the `%matplotlib inline` magic already issued in the first cell.
%matplotlib inline

In [42]:
# Assemble the topic network figure from the edge and node traces.
axis_style = dict(showgrid=True, zeroline=False, showticklabels=True)
fig = Figure(
    data=[edge_trace, node_trace],
    layout=Layout(
        showlegend=True,
        hovermode='closest',
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
    ),
)

# Pass the image format by keyword: the second positional parameter of
# plotly.offline.plot is `show_link`, so a bare 'png' never requested an image.
# NOTE(review): the PNG is saved by the browser when the generated HTML is
# opened, so it lands in the browser's download location - confirm.
py.plot(fig, image='png', image_filename='finalplot')



'file:///home/ubuntu/temp-plot.html'

In [48]:
from IPython.display import Image

In [61]:
# Show the exported network plot; `url=` references the file rather than
# embedding its bytes in the notebook.
Image(
    url='./finalplot.png',
    width=800,
    height=800,
)