# Topic Modeling with gensim

Switching over to gensim to leverage gensim and its topic coherence library

Guidance from tutorials: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models#5.-Build-the-Topic-Model
### Install libraries and import dataset from Google Drive 👇

In [1]:
# !pip install PyDrive
!pip3 install pyLDAvis
!pip3 install gensim
!pip3 install matplotlib
!pip3 install nltk
!pip install gensim
!pip install pyLDAvis
!pip install bs4
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

from time import time
import pandas as pd
import os
import re
from pprint import pprint
import random
from getpass import getpass
from html import unescape

import json
from bs4 import BeautifulSoup

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from matplotlib.ticker import FuncFormatter
from matplotlib import pyplot as plt


import pyLDAvis
import pyLDAvis.gensim
import pickle 

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

t0 = time()
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

# # import the hits.json preprocessed file
# downloaded = drive.CreateFile({'id':"1JL7IY3I_HZg112czpiYChlaMr08sOupi"}) # id for clean_hits_full_week.json
# downloaded.GetContentFile('clean_hits_full_week.json') # preprocessed hits for week long data scrape

# import the preview.json preprocessed file
# downloaded = drive.CreateFile({'id':"1MPiJyGX5FzFif2MSJqRuQnXJgn8xpFip"}) # id for clean_preview.json
# downloaded.GetContentFile('clean_preview.json') # preprocessed hits for week long data scrape

print('INFO: done importing libraries in %0.3fs.' % (time() - t0))

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=11615f95484fe3b82f2afaf4d2a20437266b1e217632994db4dbf01002577bcf
  Stored in directory: /private/var/folders/5j/qdx_kfmn7gg1z1m7kk5lp7580000gn/T/pip-ephem-wheel-cache-l1jva77z/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


INFO: done importing libraries in 0.000s.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rileymiller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import the file into a Pandas dataframe

In [2]:
t0 = time()

hits = pd.read_json('datasets/clean_hits_full_week.json', lines=True)
hits.head()

print('INFO: done reading in dataset to pandas dataframe in %0.3fs.' % (time() - t0))

INFO: done reading in dataset to pandas dataframe in 14.462s.


# Clean data script

In [8]:
from time import time
import pandas as pd
import os
import re
import numpy as np
from pprint import pprint

t0 = time()

print('INFO: done importing libraries and dataset in %0.3fs.' % (time() - t0))


t0 = time()

hits = pd.read_json('datasets/20200126-20200312-hits.json', lines=True)
hits.head()

print('INFO: done reading in dataset to pandas dataframe in %0.3fs.' % (time() - t0))



t0 = time()

hits = hits.drop(columns=['_id', 'hit_set_id', 'requester_id','requester_name', 'assignment_duration_in_seconds', 'creation_time', 'assignable_hits_count', 'latest_expiration_time', 'caller_meets_requirements', 'caller_meets_preview_requirements', 'last_updated_time', 'monetary_reward', 'accept_project_task_url', 'requester_url', 'project_tasks_url', 'project_requirements', 'requesterInfo'], axis=1)
hits.head()

print('INFO: done columns from dataframe in %0.3fs.' % (time() - t0))

# removes all punctuation from the description and title if any
t0 = time()

hits['processed_description'] = hits['description'].map(lambda d : re.sub('[,.$()@#%&~!?]', ' ', d))
hits['processed_title'] = hits['title'].map(lambda t : re.sub('[,.$()@#%&~!?]', ' ', t))


hits['processed_description'] = hits['processed_description'].str.replace('\W', ' ')
hits['processed_description'] = hits['processed_description'].map(lambda d : re.sub('\d', '', d))
hits['processed_description'] = hits['processed_description'].str.replace('\s+', ' ')

hits['processed_title'] = hits['processed_title'].map(lambda d : re.sub('\d', '', d))
hits['processed_title'] = hits['processed_title'].str.replace('\W', ' ')
hits['processed_title'] = hits['processed_title'].str.replace('\s+', ' ')



print('INFO: done removing punctuation and numbers from title and description in %0.3fs.' % (time() - t0))

print(hits['processed_description'])
# cleans dataframe by converting all characters to lowercase and removing non-english characters
# t0 = time()

def clean_dat(chunk):
    # Read stopwords
    with open('datasets/stops.txt', 'r') as f:
        stops = f.read().split('\n')

    return ' '.join([ w for w in chunk.split() if w not in set(stops)])

# converts to low caps
hits['processed_description'] = hits['processed_description'].map(lambda x: x.lower())

hits['processed_title'] = hits['processed_title'].map(lambda x: x.lower())

print('INFO: removing stopwords, duplicates, and numbers')

print('INFO: processed_description shape before dropping empty descriptions',hits['processed_description'].shape[0])
print('INFO: processed_title shape before dropping stop words and number', hits['processed_title'].shape[0])

t0 = time()

# removes non allowable characters
hits['processed_description'] = hits['processed_description'].map(lambda x: clean_dat(x))
hits['processed_title'] = hits['processed_title'].map(lambda x: clean_dat(x))

print(hits['processed_description'])

nan_value = float("NaN")
hits.replace("", nan_value, inplace=True)

hits.dropna(subset = ['processed_description', 'processed_title'], inplace=True)


hits['processed_description'].drop_duplicates(keep=False, inplace=True)
hits['processed_title'].drop_duplicates(keep=False, inplace=True)


print('INFO: processed_description shape after removing stopwords, duplicates, and numbers', hits['processed_description'].shape[0])

print('INFO: processed_title shape after removing stopwords, duplicates, and numbers', hits['processed_title'].shape[0])

print('INFO: finished removing stopwords, duplicates, and numbers in %0.3fs' % (time() - t0))

# print out the first couple processed descriptions
t0 = time()
print('INFO: loading descriptions into text file')

hits['processed_description'].to_csv(r'datasets/parsed_full_descriptions.txt', header=None, index=None, sep=' ', mode='a')

# print out the first couple processed titles
t0 = time()
print('INFO: loading titles into text file')

hits['processed_title'].to_csv(r'datasets/parsed_full_titles.txt', header=None, index=None, sep=' ', mode='a')
print('INFO: finished loading titles into text file in %0.3fs' % (time() - t0))



INFO: done importing libraries and dataset in 0.000s.
INFO: done reading in dataset to pandas dataframe in 311.256s.
INFO: done columns from dataframe in 41.037s.
INFO: done removing punctuation and numbers from title and description in 29.006s.
0          You will be presented an image of a gym cardio...
1          Given a sentence and a noun from that sentence...
2                               Transcribing data from image
3          Extract all the items from the receipt You wil...
4          Verify the value of single data point such as ...
                                 ...                        
1108697                           Transcribe data from image
1108698    If this transaction appeared on your bank stat...
1108699    Transcribe all of the purchased items and tota...
1108700                           Transcribe data from image
1108701    If this transaction appeared on your bank stat...
Name: processed_description, Length: 1108702, dtype: object
INFO: removing stopword

In [5]:
import random
import pandas
n = 65893 #number of records in file
s = 3200 #desired sample size
filename = "datasets/parsed_full_previews.txt"
skip = sorted(random.sample(range(n),n-s))
df = pandas.read_csv(filename, skiprows=skip)

print(df)

df.to_csv(r'datasets/parsed_small_5perc_previews.txt', header=None, index=None, sep=' ', mode='a')

print('INFO: finished processing small previews')


     read instructions complete definitions rating comprehensive examples steps follow read question rate answer mark answer options bad answer acceptable good answer acceptable question substitute cilantro answer cilantro coriander cilantro refers leaves stems coriander refers seeds plant flavoring substitute cumin plant fresh parsley bad answers fall categories vulgar violent content intolerant prejudiced insensitive content opinions exaggeration unprovable statements nonsense medical legal privacy information crisis emergency advice task rate quality answers relation question steps methodology answer bad answer acceptable good answer acceptable bad answer acceptable bad answers answers unacceptable people potentially offensive hurtful confusing person bad answers fall categories vulgar violent content answer unnecessary explicit references sex violence include vulgar slang terms sexual body parts acts gratuitous descriptions violence acts illicit solicit emotional judgmental respons

In [63]:
preview_src_cnt = 0
preview_no_src_cnt = 0
previews = []

def clean_dat(chunk):
    # Read stopwords
    with open('datasets/stops.txt', 'r') as f:
        stops = f.read().split('\n')

    return ' '.join([ w for w in chunk.split() if w not in set(stops)])

for line in open('datasets/20200126-20200312-preview.json'):
      preview = json.loads(line)
        
      if 'page_src' in preview:
        preview_src_cnt += 1
        page_src = preview['page_src']
        soup = BeautifulSoup(page_src, 'html.parser')
        # print(soup.find_all('script'))

        # Clear every script tag
        for tag in soup.find_all('script'):
            tag.clear()

        # Clear every style tag
        for tag in soup.find_all('style'):
            tag.clear()

        # print(soup.get_text())
        clean_src = re.sub("<.*?>", "", soup.get_text())
        clean_src = re.sub("\n", " ", clean_src)
        previews.append(clean_src)
      else:
        preview_no_src_cnt += 1

print('INFO: done parsing preview dataset, finished in %0.3fs.' % (time() - t0))
print('INFO: total previews: ', (preview_src_cnt + preview_no_src_cnt), ' Previews with page src: ', preview_src_cnt, ' Preview w/o page src: ', preview_no_src_cnt)


preview_df = pd.DataFrame(previews, columns=['processed_previews'])
preview_df.head()

preview_df['processed_previews'] = preview_df['processed_previews'].map(lambda d : re.sub('[,.$()@#%&~!?]', '', d))
hits['processed_previews'] = hits['processed_previews'].str.replace('\W', ' ')
hits['processed_previews'] = hits['processed_previews'].map(lambda d : re.sub('\d', '', d))
hits['processed_previews'] = hits['processed_previews'].str.replace('\s+', ' ')
preview_df['processed_previews'] = preview_df['processed_previews'].map(lambda d : d.lower())

print('INFO: processed_preview shape before removing stop words and dropping empty previews', preview_df['processed_previews'].shape[0])

preview_df['processed_previews'] = preview_df['processed_previews'].map(lambda d : clean_dat(d))

preview_df['processed_previews'] = preview_df['processed_previews'].map(lambda d : re.sub('"', '', d))
preview_df['processed_previews'] = preview_df['processed_previews'].map(lambda d : re.sub("''", '', d))

nan_value = float("NaN")
preview_df.replace("", nan_value, inplace=True)

preview_df.dropna(subset = ['processed_previews'], inplace=True)


preview_df['processed_previews'].drop_duplicates(keep=False, inplace=True)


print('INFO: processed_preview shape after removing stop words and dropping empty previews', preview_df['processed_previews'].shape[0])




# print out the first couple processed descriptions
t0 = time()

print('INFO: loading previews into text file')
preview_df['processed_previews'].to_csv(r'datasets/parsed_full_previews.txt', header=None, index=None, sep=' ', mode='a')

print('INFO: finished loading previews into text file in %0.3fs' % (time() - t0))

print(new_preview)

INFO: done parsing preview dataset, finished in 84.957s.
INFO: total previews:  24  Previews with page src:  16  Preview w/o page src:  8
INFO: processed_preview shape before removing stop words and dropping empty previews 16
INFO: processed_preview shape after removing stop words and dropping empty previews 10
INFO: loading previews into text file
INFO: finished loading previews into text file in 0.002s
                                   processed_previews
2   item - electronics - batch id preview - amazon...
3   aidea - mturk review summary text text my dear...
5   hit extract purchased items shopping receipt h...
7   complete survey answer survey minutes study in...
9   worker - consensyou enable javascript run apps...
10  extract data shopping receipt images part shop...
12  record id: name: listen provided audio fill na...
14  amazon mechanical turk skip main content worke...
15  job spotter moderationyou accept hit submit re...


## Clean Previewss script

Will then proceed to drop all of the unecessary columns

In [8]:
t0 = time()

hits = hits.drop(columns=['_id', 'hit_set_id', 'requester_id','requester_name', 'assignment_duration_in_seconds', 'creation_time', 'assignable_hits_count', 'latest_expiration_time', 'caller_meets_requirements', 'caller_meets_preview_requirements', 'last_updated_time', 'monetary_reward', 'accept_project_task_url', 'requester_url', 'project_tasks_url', 'project_requirements', 'requesterInfo'], axis=1)
hits.head()

print('INFO: done columns from dataframe in %0.3fs.' % (time() - t0))

# removes all punctuation from the description and title if any
t0 = time()

hits['processed_description'] = hits['description'].map(lambda d : re.sub('[,.!?]', '', d))
hits['processed_title'] = hits['title'].map(lambda t : re.sub('[,.!?]', '', t))

print('INFO: done removing punctuation from title and description in %0.3fs.' % (time() - t0))


# converts the text to lowercase
t0 = time()

hits['processed_description'] = hits['processed_description'].map(lambda x: x.lower())
hits['processed_title'] = hits['processed_title'].map(lambda x: x.lower())

print('INFO: done converting text to lowercase in %0.3fs.' % (time() - t0))


# print out the first couple processed descriptions
hits['processed_description'].head()



NameError: name 'hits' is not defined

In [9]:
# print out the first couple processed titles
hits['processed_title'].head()

NameError: name 'hits' is not defined

## Tokenize documents

Using the gensim library tokenize the processed titles and descriptions

In [10]:
# function to tokenize the unstructured text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

'''
clean_stop_words

@param docs: a list of unstructured documents

Removes stop words from the documents in the list using the gensim simple_preprocess
'''
def clean_stop_words(docs):
  return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in docs]


'''
  get_bigram_trigram_tuple
  
  @param data_set: a list of tokens

  converts the dataset into a tuple of bigrams and trigrams
  using the gensim Phrases and Phraser models
'''
def get_bigram_trigram_tuple(data_set):
  # https://radimrehurek.com/gensim/models/phrases.html
  no_stop_word_data = clean_stop_words(data_set)

  bigram_base = gensim.models.Phrases(data_set, min_count=5, threshold=100)
  trigram_base = gensim.models.Phrases(bigram_base[data_set], threshold=100)

  bigram_mod = gensim.models.phrases.Phraser(bigram_base)
  trigram_mod = gensim.models.phrases.Phraser(trigram_base)

  return([bigram_mod[doc] for doc in no_stop_word_data], [trigram_mod[bigram_mod[doc]] for doc in no_stop_word_data])

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


## Build LDA model

Using the gensim library, generate Latentent Dirichlet Allocation model to extract topics from the MTURK web scrape.

In [None]:
def generate_lda(corpus, id2word, num_topics):
  lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=20, 
                                        random_state=100,
                                        chunksize=100,
                                        passes=13,
                                        per_word_topics=True)
  return lda_model


## View topics from LDA models

This snippet will print off the top 20 topics and their 15 most prevalent keywords.

In [None]:
def print_lda(lda):
  pprint(lda.print_topics(num_topics=20, num_words=10))

## Most frequently discussed topics

In [None]:
'''
topics_per_documents

@param model: the LDA model
@param corpus: the corpus the LDA is performing the distribution on
@param start: specifies the starting distribution bound on the corpus
@param end: specifies the ending distribution bound on the corpus

returns a tuple with the dominant topics and the topic percentages in the documents
'''
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return(dominant_topics, topic_percentages)


def generate_topic_distribution_vis(model, corpus):
  print('INFO: generating the topic distribution dataframes and visualizations')

  t0 = time()

  dominant_topics, topic_percentages = topics_per_document(model=model, corpus=corpus, end=-1)            

  print('INFO: done calculating dominant topics and topic percentages for LDA in %0.3fs.' % (time() - t0))

  # Distribution of Dominant Topics in Each Document
  t0 = time()

  df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
  dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
  df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

  print('INFO: done calculating topic distribution of LDA in %0.3fs.' % (time() - t0))


  # Total Topic Distribution by actual weight
  t0 = time()

  topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
  df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

  print('INFO: done calulating topic weightage of LDA in %0.3fs.' % (time() - t0))

  # Top 3 Keywords for each Topic
  t0 = time()

  topic_top3words = [(i, topic) for i, topics in model.show_topics(num_topics=20, num_words=10, formatted=False) 
                                  for j, (topic, wt) in enumerate(topics) if j < 3]


  df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
  df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
  df_top3words.reset_index(level=0,inplace=True)

  print('INFO: done finding top 3 words of topics for LDA in %0.3fs.' % (time() - t0))

  # Plot
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(30, 4), dpi=120, sharey=True)

  # Topic Distribution by Dominant Topics
  ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_doc, width=.5, color='firebrick')
  ax1.set_xticks(range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()))
  print(df_dominant_topic_in_each_doc.shape)
  print(df_top3words.shape)
  print(df_top3words.loc[df_top3words.topic_id==16, 'words'].values[0])
  # + df_top3words.loc[df_top3words.topic_id==x, 'words']
  tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0])
  ax1.xaxis.set_major_formatter(tick_formatter)
  ax1.set_title('Number of Documents by Dominant Topic', fontdict=dict(size=10))
  ax1.set_ylabel('Number of Documents')
  ax1.set_ylim(0, 30000)
  ax1.tick_params(direction='out', length=3, width=1,
                grid_color='r', grid_alpha=0.5, labelsize=6)
  #ax1.xaxis.
  # fig.update_xaxes(tickangle=45, tickfont=dict(family='Rockwell', color='crimson', size=14))


  # Topic Distribution by Topic Weights
  ax2.bar(x='index', height='count', data=df_topic_weightage_by_doc, width=.5, color='steelblue')
  ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
  ax2.xaxis.set_major_formatter(tick_formatter)
  ax2.set_title('Number of Documents by Topic Weightage', fontdict=dict(size=10))
  #ax2.update_xaxes(tickangle=45, tickfont=dict(family='Rockwell', color='crimson', size=8))
  ax2.tick_params(direction='out', length=3, width=1,
                grid_color='r', grid_alpha=0.5, labelsize=6)
  # fig.update_xaxes(fontdict=dict(size=10))

  plt.show()


## Calculate Topic Coherence Score

In [None]:
# calculate the coherence score
def calculate_coherence_score(topics, texts, id2word, coherence):
  coherence_lda = CoherenceModel(topics=topics, texts=texts, dictionary=id2word, coherence=coherence)
  coherence_lda_score = coherence_lda.get_coherence()
  return coherence_lda_score


## Evaluate Model over number of topics

In [None]:
def evaluate_model(id2word, corpus, texts, start, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence
    
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : topic limit
    
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_coherence = []
    c_v = []
    print('INFO: starting to generate LDAs and calculate coherence')
    for num_topics in range(start, limit):
      t0 = time()
      lm = generate_lda(corpus=corpus, id2word=id2word, num_topics=num_topics)
      print('INFO: done generating LDA on ', num_topics, ' topics in %0.3fs.' % (time() - t0))

      model_topics = lm.show_topics(formatted=False)
      model_topics = [[word for word, prob in topic] for topicid, topic in model_topics]
      cs = calculate_coherence_score(topics=model_topics, texts=texts, id2word=id2word, coherence='c_v')
      model_coherence.append((lm,cs))
      c_v.append(cs)

    print('INFO: finished generating models, creating evalution visualization')  
    # Show graph
    x = range(start, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v"), loc='best')
    plt.show()
    
    return model_coherence

## Description Analysis

In [None]:
# strip descriptions into tokens
t0 = time()

# grabs the descriptions from the hits dataframe and puts them in a list
description_data = hits.processed_description.values.tolist()

# breaks the descriptions into individual word tokens
description_tokens = list(sent_to_words(description_data))

print('INFO: done tokenizing descriptions in %0.3fs.' % (time() - t0))

t0 = time()

# generates a tuple with the description converted into bigrams and trigrams
description_ngrams = get_bigram_trigram_tuple(description_tokens)

print('INFO: done generating description n-grams in %0.3fs.' % (time() - t0))

# extracts the description bigram list
description_bigrams = description_ngrams[0]

t0 = time()

# generate id2word dictionary of bigrams
description_bi_id2word = corpora.Dictionary(description_bigrams)

# generate corpus from bigrams
description_bi_corpus = [description_bi_id2word.doc2bow(tok) for tok in description_bigrams]

print('INFO: done generating description corpus in %0.3fs.' % (time() - t0))

In [None]:
# generate description bigram LDA
t0 = time()

description_bi_lda = generate_lda(description_bi_corpus, description_bi_id2word, 20)

print('INFO: done generating description bigram LDA in %0.3fs.' % (time() - t0))

In [None]:
# print description bigram lda topics
t0 = time()

print_lda(description_bi_lda)

print('INFO: done printing description LDA topics in %0.3fs.' % (time() - t0))

In [None]:
# Create visualization for description document distribution
t0 = time()

generate_topic_distribution_vis(description_bi_lda, description_bi_corpus)

print('INFO: done generating description bigram LDA document-topic distribution visualization in %0.3fs.' % (time() - t0))

In [None]:
# groom the topics from the description bigram lda for topic coherence measure
description_bi_lda_topics = description_bi_lda.show_topics(formatted=False)
description_bi_lda_topics = [[word for word, prob in topic] for topicid, topic in description_bi_lda_topics]

# calculate description bigram topic coherence
t0 = time()


# calculates the first 10 keywords in a topic
print('Description LDA w/ bigrams UMass Topic Coherence Score: ', calculate_coherence_score(description_bi_lda_topics[:10], description_bigrams, description_bi_id2word, 'u_mass'))

print('INFO: done printing description bigram UMass topic coherence score in %0.3fs.' % (time() - t0))

t0 = time()

# calculates the first 10 keywords in a topic
print('Description LDA w/ bigrams UCI Topic Coherence Score: ', calculate_coherence_score(description_bi_lda_topics[:10], description_bigrams, description_bi_id2word, 'c_uci'))

print('INFO: done printing description bigram UCI topic coherence score in %0.3fs.' % (time() - t0))



In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

models = evaluate_model(id2word=description_bi_id2word, corpus=description_bi_corpus, texts=description_bigrams, start=7, limit=25)

In [None]:
print(len(models))
for i in range(25-7):
  print("Number of topics", i + 7, models[i][1])

In [None]:
print("LDA description bigram performed best at 9 topics", models[2][1])

## Title Analysis

In [None]:
#strip titles into tokens
t0 = time()

title_data = hits.processed_title.values.tolist()
title_tokens = list(sent_to_words(title_data))

print('INFO: done tokenizing titles in %0.3fs.' % (time() - t0))

t0 = time()

title_ngrams = get_bigram_trigram_tuple(title_tokens)

print('INFO: done generating title n-grams in %0.3fs.' % (time() - t0))

# TODO: optimize the hyperparameters and the K value of the LDA

Find out what the best hyperparameters are for the LDA and the best number of topics.

## Visualize the Topic Model

Load the LDA model into `pyLDAvis` to analyze the topic model

In [None]:
pyLDAvis.enable_notebook()

# generate pyLDAvis dashboard
t0 = time()

LDAvis_prepared = pyLDAvis.gensim.prepare(description_lda, description_corpus, description_dict)

print('INFO: done generating description LDA pyLDAvis dashboard in %0.3fs.' % (time() - t0))

LDAvis_prepared


# LDA Analysis
Need to clean up the dataset desperately, as you can see the words 'receipt', 'transcribe', and 'receipts' happen almost twice as much as the next highest occuring words which means that we need to reduce the occurence of these HITs in our dataset to prevent the topic from becoming oversaturated.

#### Document Duplication
> Using synthetic repetition of data, we find that as documents are repeated,
topic models begin to devote topics exclusively to
the repeated documents. Repeated documents show
very low topical entropy and high likelihood. However, text without these repetitions is largely unaffected: repeated text is quickly fit well to one or a few topics, leaving the rest of the model unaffected,
except for the implicit loss of modeling power
caused by “losing” one or more topics. We find that
topic models can accommodate occasional duplicates and fit topics to a repeated string across many documents, but that this is more difficult if the repeated text has similar language to the content of interest. In our experiments, effects of duplication were minimal until duplicate documents became a
substantial proportion of the corpus, whether one
document repeated over a thousand times or 1% of
the corpus repeated four times.

*Understanding Text Pre-Processing for Latent Dirichlet Allocation* (Schofield et al. 2017)

Basically this will only effect HITs that have content relating to transcription or receipts which may otherwise be matched to another topic.

In [None]:
hits['processed_description'].head()

# Topic Coherence Evaluation

In [None]:
def evaluate_model(id2word, corpus, texts, start, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence
    
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : topic limit
    
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_coherence = []
    c_v = []
    print('INFO: starting to generate LDAs and calculate coherence')
#     for num_topics in range(start, limit):
  t0 = time()
#       lm = generate_lda(corpus=corpus, id2word=id2word, num_topics=num_topics)
#       print('INFO: done generating LDA on ', num_topics, ' topics in %0.3fs.' % (time() - t0))

#       model_topics = lm.show_topics(formatted=False)
#       model_topics = [[word for word, prob in topic] for topicid, topic in model_topics]
  cs = calculate_coherence_score(topics=model_topics, texts=texts, id2word=id2word, coherence='c_v')
  model_coherence.append((lm,cs))
  c_v.append(cs)

    print('INFO: finished generating models, creating evalution visualization')  
    # Show graph
    x = range(start, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v"), loc='best')
    plt.show()
    
    return model_coherence


'''
fn to get topic coherence from a list of topics

topics: list of topics
texts: the raw text corpus
id2word: the gensim corpora dictionary
'''
def get_coherence(topics, texts, id2word, coherence='umass')
t0 = time()
#       lm = generate_lda(corpus=corpus, id2word=id2word, num_topics=num_topics)
#       print('INFO: done generating LDA on ', num_topics, ' topics in %0.3fs.' % (time() - t0))

#       model_topics = lm.show_topics(formatted=False)
#       model_topics = [[word for word, prob in topic] for topicid, topic in model_topics]
    cs = calculate_coherence_score(topics=model_topics, texts=texts, id2word=id2word, coherence='c_v')
    print('INFO: topic coherence on', coherence, 'metric', cs)
    return cs

id2word = corpora.Dictionary(description_bigrams)
