This notebook focuses on evaluating the coherence of topics in the learned LDA models.

### Import packages

In [6]:
import os
import glob
import zipfile
import shutil
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
import scipy.stats as stats
import datetime
import pylab as pl
import math
import codecs
import nltk
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import time
import json
import gensim
from gensim import corpora, models
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel
import logging
%matplotlib inline

### Set all important file paths

In [7]:
# Show the kernel's current working directory.
pwd()

u'/Users/seddont/Dropbox/Tom/MIDS/W266_work/w266_project'

In [8]:
# Richard path specs

# # At work:
# TEXT_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/Text/"
# PDF_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/PDF/"
# LIBRARY_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/Libraries/"

# # At home:
# dict_path = '/Users/Richard/Desktop/Berkeley/w266/'
# input_path = '/Users/Richard/Desktop/Berkeley/w266/repo/w266_project/'
# output_path = '/Users/Richard/Desktop/Berkeley/w266/'

# LIBRARY_PATH = '/Users/Richard/Desktop/Berkeley/w266/repo/w266_project/'



In [9]:
# Tom path specs
#
# LIBRARY_PATH is the project root.  Other cells build file paths by appending to
# it, so it must end with a path separator.  To run somewhere else (e.g. Tom's
# google cloud instance, "/home/seddon/w266_project/") set the W266_LIBRARY_PATH
# environment variable rather than editing this cell.

TEXT_DIR_LIST = ["T1", "T2", "T3", "T4"]
# PDF_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/PDF/"

# Default: Tom machine.  `or` also covers the variable being set but empty.
# os.path.join(path, "") adds a trailing separator only when one is missing.
LIBRARY_PATH = os.path.join(
    os.environ.get("W266_LIBRARY_PATH")
    or "/Users/seddont/Dropbox/Tom/MIDS/W266_work/w266_project/",
    "")



In [10]:
class Saved_state():
    '''Represents a saved state that includes an LDA model and the data used to create it.

       Instantiated with a model_spec dictionary that locates the files needed to
       recreate the saved state.

       model_spec needs to contain (both given relative to LIBRARY_PATH):

       model_directory    -- directory holding full_model, dictionary.txt,
                             corpus.txt, hell_sims.txt and model_runtime.txt
       qa_pair_directory  -- directory holding qa_pairs.txt

       raw_qa_data.txt is read from LIBRARY_PATH itself.

       Attributes set on the instance:

       ldamodel        -- result of gensim LdaModel.load on full_model
       dictionary      -- result of gensim Dictionary.load on dictionary.txt
       qa_pairs        -- JSON-decoded contents of qa_pairs.txt
       raw_qa_text     -- JSON-decoded contents of raw_qa_data.txt
       corpus          -- JSON-decoded contents of corpus.txt
       hellinger_sims  -- JSON-decoded contents of hell_sims.txt
       model_runtime   -- contents of model_runtime.txt as an unparsed string
       '''
    def __init__(self, model_spec):

        def load_json(path):
            # Every JSON artefact is read the same way, so keep it in one place.
            with open(path, "r") as f:
                return json.load(f)

        # os.path.join rather than "+" so the result does not depend on whether
        # LIBRARY_PATH happens to end with a slash.
        model_dir = os.path.join(LIBRARY_PATH, model_spec["model_directory"])
        qa_dir = os.path.join(LIBRARY_PATH, model_spec["qa_pair_directory"])

        self.ldamodel = gensim.models.ldamodel.LdaModel.load(
            os.path.join(model_dir, "full_model"))

        self.dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(model_dir, "dictionary.txt"))

        self.qa_pairs = load_json(os.path.join(qa_dir, "qa_pairs.txt"))

        self.raw_qa_text = load_json(os.path.join(LIBRARY_PATH, "raw_qa_data.txt"))

        self.corpus = load_json(os.path.join(model_dir, "corpus.txt"))

        self.hellinger_sims = load_json(os.path.join(model_dir, "hell_sims.txt"))

        # Stored as plain text, not JSON, so it is kept as the raw string.
        with open(os.path.join(model_dir, "model_runtime.txt"), "r") as f:
            self.model_runtime = f.read()

In [11]:
def get_coherences(model_spec, topn_list):
    '''Retrieves saved model from a model spec and scores its topics.

       model_spec -- dictionary passed straight to Saved_state to locate the
                     saved model and its corpus.
       topn_list  -- iterable of n values (number of top words per topic).

       Returns a dict keyed by n.  Each value is whatever
       ldamodel.top_topics(corpus, coherence="u_mass", topn=n) returns; in this
       notebook that is a list with one entry per topic of the form
       ([(probability, word), ...], u_mass_score).'''

    saved = Saved_state(model_spec)

    ldamodel = saved.ldamodel
    corpus = saved.corpus

    # top_topics returns the coherence score alongside each topic's words, so
    # no separate CoherenceModel has to be built per n.
    return {n: ldamodel.top_topics(corpus = corpus, coherence = "u_mass", topn = n)
            for n in topn_list}

In [12]:
# Each directory listed here lives under saved_models/ and holds a model_spec.txt
# (JSON).  This cell reads "description", "num_topics" and "min_sequence_length"
# from that spec and hands the whole spec to get_coherences().

model_dir_list = [
    "top10_len10_prebase_ps20_it100",
    "top10_len20_prebase_ps20_it100",
    "top10_len40_prebase_ps20_it100",
    "top20_len10_prebase_ps20_it100",
    "top20_len20_prebase_ps20_it100",
    "top20_len40_prebase_ps20_it100",
    "top40_len10_prebase_ps20_it100",
    "top40_len20_prebase_ps20_it100",
    "top40_len40_prebase_ps20_it100",
]

# Numbers of top words per topic over which coherence is measured.
topn_list = [10, 25, 50]

# Results keyed by (num_topics, min_sequence_length), then by n.
coherences = {}

for model_dir in model_dir_list:

    spec_path = "%ssaved_models/%s/model_spec.txt" % (LIBRARY_PATH, model_dir)
    with open(spec_path) as f:
        model_spec = json.load(f)

    # The timer covers the coherence evaluation only, not reading the spec.
    start_time = time.time()

    print("evaluating coherence measures for %s" % (model_spec["description"],))

    num_topics = model_spec["num_topics"]
    min_sequence_length = model_spec["min_sequence_length"]

    coherences[(num_topics, min_sequence_length)] = get_coherences(model_spec, topn_list)

    print("evaluation took %s seconds" % (time.time() - start_time,))
    
    

evaluating coherence measures for Topics 10, Min Length 10, base preprocessing, passes20, iterations100
evaluation took 24.0488870144 seconds
evaluating coherence measures for Topics 10, Min Length 20, base preprocessing, passes20, iterations100
evaluation took 28.1455430984 seconds
evaluating coherence measures for Topics 10, Min Length 40, base preprocessing, passes20, iterations100
evaluation took 11.7550959587 seconds
evaluating coherence measures for Topics 20, Min Length 10, base preprocessing, passes20, iterations100
evaluation took 29.2275381088 seconds
evaluating coherence measures for Topics 20, Min Length 20, base preprocessing, passes20, iterations100
evaluation took 25.4610061646 seconds
evaluating coherence measures for Topics 20, Min Length 40, base preprocessing, passes20, iterations100
evaluation took 13.0647530556 seconds
evaluating coherence measures for Topics 40, Min Length 10, base preprocessing, passes20, iterations100
evaluation took 33.4466631413 seconds
evalua

In [13]:
# Inspect the raw results: keyed by (num_topics, min_sequence_length), then by n;
# each value is a list of ([(probability, word), ...], u_mass score) per topic.
coherences

{(10,
  10): {10: [([(0.031972111654476776, u'think'),
     (0.021461197835601419, u'go'),
     (0.014801170351631176, u'busi'),
     (0.013049010927434199, u'us'),
     (0.012179464584258027, u'thing'),
     (0.012101952471837114, u'get'),
     (0.011983201832075093, u'look'),
     (0.011817542195530564, u'realli'),
     (0.011335401859128659, u'make'),
     (0.0097009222031374451, u'time')],
    -1.1979053159202995),
   ([(0.048146401933910829, u'quarter'),
     (0.037542178226369592, u'year'),
     (0.020564448926966655, u'margin'),
     (0.018937107868135857, u'think'),
     (0.017961428023408069, u'growth'),
     (0.0175680244585983, u'see'),
     (0.014218819230860377, u'expect'),
     (0.012867796214145778, u'littl'),
     (0.01283372012430853, u'first'),
     (0.012094381911117091, u'bit')],
    -1.2807560161002862),
   ([(0.19765604860886443, u'NUM'),
     (0.043826416144214397, u'year'),
     (0.017151916391985898, u'think'),
     (0.016295400429183194, u'million'),
     (0.0

## Impact of hyperparameters on the topic quality

U_mass coherence is used as the measure of topic quality.  Coherence is measured across 
the top N words in each topic (ranked by the probability of each word in the 
topic).

Quality varies by topic, so each model is summarized by the mean and SD of its per-topic coherence scores.

Results are shown for 3 different values of N.

In [14]:
# Summary table: mean, SD and SD/Mean of the per-topic u_mass scores for every
# model and every n.
print("Mean and SD of topic Coherences for learned models")
print("")
print("{:^13}{:^21}{:^21}{:>11}{:>11}{:>11}".format("Num Topics", "Min Seq Length", 
                                            "N words for umass", "Mean", "SD", "SD/Mean"))
print("{:^13}{:^21}{:^21}{:>11}{:>11}{:>11}".format("-"*11, "-"*17, "-"*17, "-"*6, "-"*6, "-"*6))

# Iterate over topn_list (the n values the coherences were computed for) rather
# than a second hard-coded copy that could drift out of sync with it.
for n in topn_list:
    print("")
    for t in sorted(coherences):
        # Each entry of coherences[t][n] is (word list, u_mass score); keep the score.
        u_mass_scores = [r[1] for r in coherences[t][n]]
        mean_score = np.mean(u_mass_scores)
        sd_score = np.std(u_mass_scores)
        # The mean u_mass score is negative here, so SD/Mean comes out negative
        # as well; compare magnitudes.
        print("{:^13}{:^21}{:^21}{:11.3f}{:11.3f}{:11.3f}".format(t[0], t[1], n, mean_score, sd_score, sd_score/mean_score))
        

Mean and SD of topic Coherences for learned models

 Num Topics     Min Seq Length      N words for umass         Mean         SD    SD/Mean
 -----------   -----------------    -----------------       ------     ------     ------

     10               10                   10               -1.852      0.472     -0.255
     10               20                   10               -1.623      0.517     -0.319
     10               40                   10               -1.452      0.634     -0.436
     20               10                   10               -2.137      0.805     -0.377
     20               20                   10               -2.280      1.607     -0.705
     20               40                   10               -2.404      1.767     -0.735
     40               10                   10               -3.104      1.513     -0.487
     40               20                   10               -2.664      1.105     -0.415
     40               40                   10            

## Evaluating how words in each topic contribute to the total

For each topic, we have a measure of the amount that each word contributes to that topic.

So we can sum the probabilities of the top N words in the topic.  This total generally
grows as the number of words considered grows, but for a fixed N it gives a sense of
how concentrated the topic is in a few words.

In [15]:
# Summary table: for every model and every n, the mean and SD across topics of
# the summed probability of each topic's top n words.
print("Mean and SD of word probability for n words in learned topics")
print("")
print("{:^13}{:^21}{:^21}{:>11}{:>11}".format("Num Topics", "Min Seq Length", 
                                            "N words for umass", "Mean", "SD"))
print("{:^13}{:^21}{:^21}{:>11}{:>11}".format("-"*11, "-"*17, "-"*17, "-"*6, "-"*6))


# Iterate over topn_list (the n values the coherences were computed for) rather
# than a second hard-coded copy that could drift out of sync with it.
for n in topn_list:
    print("")
    for t in sorted(coherences):
        topic_list = coherences[t][n]
        # topic_tuple[0] is the list of (probability, word) pairs for one topic;
        # add up the probabilities to get that topic's total.
        total_probs = [sum(w[0] for w in topic_tuple[0]) for topic_tuple in topic_list]

        mean_prob = np.mean(total_probs)
        sd_prob = np.std(total_probs)

        print("{:^13}{:^21}{:^21}{:11.3f}{:11.3f}".format(t[0], t[1], n, mean_prob, sd_prob))

Mean and SD of word probability for n words in learned topics

 Num Topics     Min Seq Length      N words for umass         Mean         SD
 -----------   -----------------    -----------------       ------     ------

     10               10                   10                0.183      0.071
     10               20                   10                0.183      0.062
     10               40                   10                0.168      0.047
     20               10                   10                0.241      0.075
     20               20                   10                0.262      0.157
     20               40                   10                0.238      0.093
     40               10                   10                0.387      0.156
     40               20                   10                0.382      0.167
     40               40                   10                0.328      0.124

     10               10                   25                0.295      0.086

In [21]:
# Spot check: print every topic (top 50 words) of the first model only, followed
# by that model's summary row.
for n in [50]:
    print("")
    for t in sorted(coherences)[:1]:
        topic_list = coherences[t][n]
        total_probs = list()
        for topic_tuple in topic_list:
            print(topic_tuple)
            print("")
            word_tuples = topic_tuple[0]
            total_probs.append(sum(w[0] for w in word_tuples))

        # Recompute the statistics for the model being shown.  Without this the
        # row below would print whatever mean_prob / sd_prob an earlier cell
        # left behind, i.e. figures belonging to a different model.
        mean_prob = np.mean(total_probs)
        sd_prob = np.std(total_probs)

        print("{:^13}{:^21}{:^21}{:11.3f}{:11.3f}".format(t[0], t[1], n, mean_prob, sd_prob))


([(0.031972111654476776, u'think'), (0.021461197835601419, u'go'), (0.014801170351631176, u'busi'), (0.013049010927434199, u'us'), (0.012179464584258027, u'thing'), (0.012101952471837114, u'get'), (0.011983201832075093, u'look'), (0.011817542195530564, u'realli'), (0.011335401859128659, u'make'), (0.0097009222031374451, u'time'), (0.0095250693527837871, u'continu'), (0.0087116662505958668, u'well'), (0.0086765591926221061, u'work'), (0.0085968521046816404, u'lot'), (0.0084575954187181644, u'term'), (0.0081351865122924489, u'way'), (0.0080159621632229436, u'one'), (0.0075319269183827808, u'right'), (0.0074885709473675715, u'say'), (0.0069848901898281347, u'good'), (0.0069091607092115759, u'want'), (0.0068791379432245293, u'know'), (0.0068193297826604875, u'invest'), (0.0066194255271915048, u'opportun'), (0.0062620300071232621, u'like'), (0.0060571209770873312, u'take'), (0.0059718953625164297, u'see'), (0.0055182576339147639, u'sure'), (0.0054343835256986358, u'compani'), (0.0052030792