This notebook continues the evaluation of topic quality, identifying potential sources of error and possible avenues for improvement.

### Import packages

In [1]:
import os
import glob
import zipfile
import shutil
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
import scipy.stats as stats
import datetime
import pylab as pl
import math
import codecs
import nltk
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import time
import json
import gensim
from gensim import corpora, models
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel
import logging
%matplotlib inline

### Set all important file paths

In [2]:
# Show the current working directory.  No `pwd` name is defined in this
# notebook, so this presumably resolves through IPython's automagic `%pwd`
# -- TODO confirm.
pwd()

u'/Users/seddont/Dropbox/Tom/MIDS/W266_work/w266_project'

In [3]:
# Richard path specs

# # At work:
# TEXT_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/Text/"
# PDF_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/PDF/"
# LIBRARY_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/Libraries/"

# # At home:
# dict_path = '/Users/Richard/Desktop/Berkeley/w266/'
# input_path = '/Users/Richard/Desktop/Berkeley/w266/repo/w266_project/'
# output_path = '/Users/Richard/Desktop/Berkeley/w266/'

# LIBRARY_PATH = '/Users/Richard/Desktop/Berkeley/w266/repo/w266_project/'



In [3]:
# Tom path specs

# NOTE(review): presumably the transcript text sub-directories; not used in
# the cells visible here -- confirm against the rest of the project.
TEXT_DIR_LIST = ["T1", "T2", "T3", "T4"]
# PDF_PATH = "T:/Quant/TextAnalysis/Transcripts/SP100/PDF/"

# Root directory that every saved-model and data path below is appended to.
# Set the W266_LIBRARY_PATH environment variable to switch machines without
# editing this cell, e.g.
#   Tom machine (the default):        /Users/seddont/Dropbox/Tom/MIDS/W266_work/w266_project/
#   Tom's google cloud instance:      /home/seddon/w266_project/
# Joining with "" guarantees a trailing separator, which later cells rely on
# when they concatenate LIBRARY_PATH with a relative path.
LIBRARY_PATH = os.path.join(
    os.environ.get("W266_LIBRARY_PATH",
                   "/Users/seddont/Dropbox/Tom/MIDS/W266_work/w266_project/"),
    "")



In [13]:
# Specification of the saved model under study: the directories (relative to
# LIBRARY_PATH) holding its files, plus the settings recorded for it.
model_spec = dict(
    model_directory="saved_models/topic20_minlength20_base",
    qa_pair_directory="saved_models/standard_preproc",
    preprocessing_function="testLDA_pre_process_document",
    min_sequence_length=20,
    num_topics=20,
    description="test model",
)

In [6]:
class Saved_state():
    '''Represents a saved state that includes an LDA model and the data used to create it.

       Instantiated with a model_spec dictionary that locates the files
       needed to recreate the saved state.

       Keys read from model_spec (both are paths relative to LIBRARY_PATH):

       model_directory    -- directory holding full_model, dictionary.txt,
                             corpus.txt, hell_sims.txt and model_runtime.txt
       qa_pair_directory  -- directory holding qa_pairs.txt

       raw_qa_data.txt is read from LIBRARY_PATH itself.  Any other keys in
       model_spec are ignored by the constructor.

       Attributes set on the instance:

       ldamodel, dictionary, qa_pairs, raw_qa_text, corpus,
       hellinger_sims, model_runtime
       '''
    def __init__(self, model_spec):

        # os.path.join avoids the doubled "/" that plain concatenation gives
        # when LIBRARY_PATH already ends with a separator.
        model_dir = os.path.join(LIBRARY_PATH, model_spec["model_directory"])
        qa_dir = os.path.join(LIBRARY_PATH, model_spec["qa_pair_directory"])

        self.ldamodel = gensim.models.ldamodel.LdaModel.load(
            os.path.join(model_dir, "full_model"))

        self.dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(model_dir, "dictionary.txt"))

        # The remaining artefacts are stored as JSON text files.
        with open(os.path.join(qa_dir, "qa_pairs.txt"), "r") as f:
            self.qa_pairs = json.load(f)

        with open(os.path.join(LIBRARY_PATH, "raw_qa_data.txt"), "r") as f:
            self.raw_qa_text = json.load(f)

        with open(os.path.join(model_dir, "corpus.txt"), "r") as f:
            self.corpus = json.load(f)

        with open(os.path.join(model_dir, "hell_sims.txt"), "r") as f:
            self.hellinger_sims = json.load(f)

        # The run timestamp is kept as the raw text found in the file.
        with open(os.path.join(model_dir, "model_runtime.txt"), "r") as f:
            self.model_runtime = f.read()

In [7]:
# Load the model and its supporting data files described by model_spec.
saved = Saved_state(model_spec)

In [8]:
# Restoring saved state

ldamodel = saved.ldamodel
qa_pairs= saved.qa_pairs
raw_qa_text = saved.raw_qa_text
corpus = saved.corpus
hellinger_sims = saved.hellinger_sims
dictionary = saved.dictionary
model_runtime = saved.model_runtime

print "Restored", model_spec["description"], "originally run at", model_runtime

Restored test model originally run at 2017-12-09 22:12:32.176898


## Looking at words that appear in multiple topics

Inspecting our topics shows that a number of words, such as 'think', appear in multiple topics.  These words may not help make each topic distinctive, and the model may be learning them because our documents are short.

In [37]:
def get_topic_counts(model_spec, topn):
    '''Count how many topics list each word among their top n words.

       Loads the saved state described by model_spec and, for each of the
       model_spec["num_topics"] topics, asks the model for its topn words.

       Returns a defaultdict(int) mapping every word that is in the top n of
       at least one topic to the number of topics whose top n contains it.

       Words with high counts may be uninformative for topic modelling.'''

    ldamodel = Saved_state(model_spec).ldamodel

    # One top-n listing per topic id, in topic-id order.
    top_words_by_topic = [ldamodel.show_topic(topic_id, topn = topn)
                          for topic_id in range(model_spec["num_topics"])]

    topic_count = defaultdict(int)
    for top_words in top_words_by_topic:
        for entry in top_words:
            # The first element of each entry is the word itself.
            topic_count[entry[0]] += 1

    return topic_count
#     return topic_words

In [49]:
# Count topic membership using the top 50 words of each topic.
word_counts = get_topic_counts(model_spec, 50)

In [50]:
# Keep only the words that occur in the top-n list of more than one topic,
# then list them from most to least widely shared.
multi_counts = [(word, n_topics)
                for word, n_topics in word_counts.items()
                if n_topics > 1]
sorted_multi_counts = sorted(multi_counts, key = lambda pair: pair[1], reverse = True)
sorted_multi_counts

[(u'see', 12),
 (u'look', 11),
 (u'go', 10),
 (u'new', 10),
 (u'think', 10),
 (u'come', 10),
 (u'one', 9),
 (u'us', 9),
 (u'well', 8),
 (u'want', 7),
 (u'realli', 7),
 (u'year', 7),
 (u'take', 7),
 (u'busi', 7),
 (u'get', 7),
 (u'product', 7),
 (u'actual', 7),
 (u'good', 6),
 (u'term', 6),
 (u'continu', 6),
 (u'expect', 6),
 (u'first', 6),
 (u'point', 6),
 (u'like', 6),
 (u'rate', 5),
 (u'two', 5),
 (u'move', 5),
 (u'side', 5),
 (u'NUM', 5),
 (u'lot', 5),
 (u'plan', 5),
 (u'impact', 5),
 (u'make', 5),
 (u'also', 5),
 (u'know', 5),
 (u'back', 5),
 (u'growth', 5),
 (u'question', 5),
 (u'way', 5),
 (u'time', 5),
 (u'consum', 4),
 (u'higher', 4),
 (u'work', 4),
 (u'basi', 4),
 (u'talk', 4),
 (u'increas', 4),
 (u'said', 4),
 (u'last', 4),
 (u'improv', 4),
 (u'much', 4),
 (u'right', 4),
 (u'market', 4),
 (u'opportun', 4),
 (u'start', 4),
 (u'level', 3),
 (u'cost', 3),
 (u'gener', 3),
 (u'great', 3),
 (u'chang', 3),
 (u'use', 3),
 (u'obvious', 3),
 (u'give', 3),
 (u'end', 3),
 (u'grow', 3),
 

## Evaluating topic distributions

LDA should concentrate the topic probabilities for each document into a relatively small number of topics.  Here we inspect the inferred distributions to see how true that is for our data.

In [63]:
def get_topic_dists(model_spec):
    '''Perform inference to calc topic distibutions for a model
       on our dataset.
       
       Return a dictionary of topic distributions, keyed by (file_id, qnum, q/a)
       tuples.'''

    topic_dists = defaultdict(dict)
    
    model_dir = LIBRARY_PATH+model_spec["model_directory"]
    qa_dir = LIBRARY_PATH+model_spec["qa_pair_directory"]
    
    with open(qa_dir+"/qa_pairs.txt", "r") as f:
        qa_pairs = json.loads(f.read())
        
    ldamodel = gensim.models.ldamodel.LdaModel.load(model_dir+"/full_model")
    dictionary = gensim.corpora.dictionary.Dictionary.load(model_dir+"/dictionary.txt")
    
    
    print "starting"
    start_time = time.time()
    i = 0
    report_every = 10000
    
    min_sequence_length = model_spec["min_sequence_length"]

    for file_id in qa_pairs:
        for q_number in qa_pairs[file_id]:
            question, answer = qa_pairs[file_id][q_number]
            if (len(question) > min_sequence_length and
                len(answer) > min_sequence_length):
                q_bow = dictionary.doc2bow(question)
                a_bow = dictionary.doc2bow(answer)
                topic_dists[(file_id,q_number,"q")] = ldamodel[q_bow]
                topic_dists[(file_id,q_number,"a")] = ldamodel[a_bow]             
            i += 1
            if i % report_every == 0:
                print "Processed ", i, "pairs in", time.time() - start_time
    print "Finished in", time.time() - start_time
    
#     # Store the results to file for recreating later if wanted
#     with open(model_dir+"/topic_dists.txt", "w") as f:
#         f.write(json.dumps(topic_dists))
        
    return topic_dists

In [60]:
# Run inference over the whole dataset; the recorded output below shows this
# taking roughly 105 seconds.
topic_dists = get_topic_dists(model_spec)

starting
Processed  10000 pairs in 11.6608200073
Processed  20000 pairs in 23.8628001213
Processed  30000 pairs in 36.1894099712
Processed  40000 pairs in 49.9212801456
Processed  50000 pairs in 63.1103920937
Processed  60000 pairs in 75.1242649555
Processed  70000 pairs in 87.2788529396
Processed  80000 pairs in 99.4468200207
Finished in 104.995733976


In [65]:
topic_dists.keys()[:5]
for t in topic_dists.keys()[:5]:
    print topic_dists[t]
    total_score = 0
    for topic_score in topic_dists[t]:
        total_score += topic_score[1]
    print total_score

[(2, 0.096762045012519809), (3, 0.15208877193526946), (6, 0.21923988730145816), (12, 0.20457402816166612), (14, 0.079455212086838437), (17, 0.20639154992819525), (19, 0.030655172097030881)]
0.989166666523
[(1, 0.24508400269286673), (3, 0.31195125850352845), (9, 0.29759346645410323), (11, 0.026068670528806487), (12, 0.067185914280934442), (19, 0.040641277533406898)]
0.988524589994
[(4, 0.093323795281222185), (6, 0.4683069243408749), (7, 0.087304707104613244), (10, 0.026278074243636432), (13, 0.20441949528508718), (19, 0.10194595088786072)]
0.981578947143
[(1, 0.026150436063369635), (5, 0.039286616484566807), (6, 0.62770808909889153), (7, 0.02362858017966598), (12, 0.038148621440563316), (14, 0.063139062244211425), (16, 0.089040698378027833), (17, 0.083220476599624379)]
0.990322580489
[(2, 0.20270759961612131), (5, 0.056978566810328417), (6, 0.11799794448351997), (9, 0.13845986831232374), (18, 0.050412865445777548), (19, 0.41971766497326557)]
0.986274509641
