In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from topic_model_arxiv import TopicModeler

# Set up analysis

In [2]:
mod = TopicModeler('output.json')
mod.build_tfidf_matrix()
mod.get_matrix_shape()

(1328, 4536)


# Perform grid search

In [3]:
n_topics = [5, 10]

mod.perform_grid_search(n_topics=n_topics)

## Grid Search Results

In [None]:
mod.plot_grid_search_results(n_topics=n_topics)

# Explore selected model

In [None]:
mod.get_matrix_shape()

In [4]:
mod.set_model_to_optimum()

mod.print_grid_search_results()
mod.print_selected_model_details()



Printing grid search results...
Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -13465.460161811909
Model Perplexity:  8627.72239572414


Printing detailed grid search results...
[[0.8466399  0.03831372 0.03832356 0.03831258 0.03841024]
 [0.74894424 0.03867419 0.13242373 0.03863228 0.04132557]
 [0.81566826 0.04505831 0.0480494  0.0450672  0.04615683]
 ...
 [0.0828108  0.04991409 0.04993727 0.04998228 0.76735556]
 [0.69353055 0.03720964 0.03713746 0.1948515  0.03727085]
 [0.35179831 0.04367295 0.04356545 0.51601838 0.04494491]]
5
1328
      Topic0  Topic1  Topic2  Topic3  Topic4  dominant_topic
Doc0    0.85    0.04    0.04    0.04    0.04               0
Doc1    0.75    0.04    0.13    0.04    0.04               0
Doc2    0.82    0.05    0.05    0.05    0.05               0
Doc3    0.67    0.04    0.04    0.04    0.20               0
Doc4    0.84    0.04    0.04    0.04    0.04               0
   Topic Num  Num Documents
0          0       

In [5]:
mod.print_top_keywords_in_topic()

             Word 0        Word 1        Word 2       Word 3     Word 4  \
Topic 0  convection  reconnection  metamaterial        delta    sensing   
Topic 1     network         model        system        datum  algorithm   
Topic 2    semantic         logic   cooperation      barrier   calculus   
Topic 3   colouring        colour      citation  disjunction    stadium   
Topic 4       field       physics        energy      network      state   

              Word 5     Word 6     Word 7        Word 8     Word 9  \
Topic 0      vehicle       knot  criticism      pairwise    swimmer   
Topic 1         game      paper    problem          time     method   
Topic 2  dissipation  estimator  predictor  completeness       mesh   
Topic 3      terrain         co   mutation         blood  cosmology   
Topic 4     equation       time       wave        system     theory   

               Word 10    Word 11    Word 12       Word 13         Word 14  
Topic 0  multisequence     rhythm  alignment 

# LDA visualization

In [5]:
panel = pyLDAvis.sklearn.prepare(
    mod.model, 
    np.matrix(mod.X), 
    mod.vectorizer, 
    mds='tsne')

panel



AssertionError: Term frequencies and vocabulary are of different sizes, 1 != 4536.