# lda2vec setup in jupyter notebook

### Install all dependencies

In [1]:
!pip install pyLDAvis
!pip3 install gensim
!pip3 install matplotlib
!pip3 install nltk

from time import time
import pandas as pd
import os
import re
from pprint import pprint


import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel

from matplotlib.ticker import FuncFormatter
from matplotlib import pyplot as plt


import pyLDAvis
import pyLDAvis.gensim
import pickle 

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

t0 = time()

print('INFO: done importing libraries and dataset in %0.3fs.' % (time() - t0))



  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


INFO: done importing libraries and dataset in 0.000s.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rileymiller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import dataset into dataframe

In [2]:
t0 = time()

hits = pd.read_json('datasets/clean_hits_full_week.json', lines=True)
hits.head()

print('INFO: done reading in dataset to pandas dataframe in %0.3fs.' % (time() - t0))

INFO: done reading in dataset to pandas dataframe in 18.935s.


### Clean data 

In [3]:
t0 = time()

hits = hits.drop(columns=['_id', 'hit_set_id', 'requester_id','requester_name', 'assignment_duration_in_seconds', 'creation_time', 'assignable_hits_count', 'latest_expiration_time', 'caller_meets_requirements', 'caller_meets_preview_requirements', 'last_updated_time', 'monetary_reward', 'accept_project_task_url', 'requester_url', 'project_tasks_url', 'project_requirements', 'requesterInfo'], axis=1)
hits.head()

print('INFO: done columns from dataframe in %0.3fs.' % (time() - t0))

# removes all punctuation from the description and title if any
t0 = time()

hits['processed_description'] = hits['description'].map(lambda d : re.sub('[,.!?]', '', d))
hits['processed_title'] = hits['title'].map(lambda t : re.sub('[,.!?]', '', t))

print('INFO: done removing punctuation from title and description in %0.3fs.' % (time() - t0))


# converts the text to lowercase
t0 = time()

hits['processed_description'] = hits['processed_description'].map(lambda x: x.lower())
hits['processed_title'] = hits['processed_title'].map(lambda x: x.lower())

print('INFO: done converting text to lowercase in %0.3fs.' % (time() - t0))


# print out the first couple processed descriptions
hits['processed_description'].head()



INFO: done columns from dataframe in 1.353s.
INFO: done removing punctuation from title and description in 0.988s.
INFO: done converting text to lowercase in 0.263s.


0    you will be presented an image of a gym cardio...
1    given a sentence and a noun from that sentence...
2                         transcribing data from image
3    extract all the items from the receipt you wil...
4    verify the value of single data point (such as...
Name: processed_description, dtype: object

### Load GloVe embeddings

In [4]:
# !pip3 install lda2vec
from lda2vec.nlppipe import Preprocessor
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
# input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
# df = pd.read_csv(data_dir+"/"+input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(hits, "processed_description", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
t0 = time()
    
print('INFO: beginning preprocesssing tokens from descriptions')

P.preprocess()

print('INFO: finished preprocessing tokens from descriptions in %0.3fs.' % (time() - t0))

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    t0 = time()
    
    print('INFO: loading glove embeddings')
    embedding_matrix = P.load_glove("embeddings/glove.42B.300d.txt")
    
    print('INFO: finished loading glove embeddings in %0.3fs.' % (time() - t0))
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

Using TensorFlow backend.


INFO: beginning preprocesssing tokens from descriptions


0it [00:00, ?it/s]


---------- Tokenizing Texts ----------


231021it [00:07, 31654.86it/s]


Removing 3210 low frequency tokens out of 4448 total tokens


21it [00:00, 189.71it/s]


---------- Getting Skipgrams ----------


231021it [01:14, 3100.51it/s]


INFO: finished preprocessing tokens from descriptions in 116.227s.
INFO: loading glove embeddings


  exec(code_obj, self.user_global_ns, self.user_ns)


INFO: finished loading glove embeddings in 170.137s.


### Load and train model

In [5]:
import tensorflow as tf
print(tf.__version__)
print(tf.keras.__version__)

1.15.0
2.2.4-tf


In [None]:
from lda2vec import utils, model

print(tf.__version__)
# Path to preprocessed data
clean_data_dir = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
t0 = time()

print('INFO: loading preprocessed data')
(idx_to_word, word_to_idx, freqs, pivot_ids,
 target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(clean_data_dir, load_embed_matrix=load_embeds)

print('INFO: finished loading preprocessed data in %0.3fs.' % (time() - t0))

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs) + 500
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 9
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 4096
# Epoch that we want to "switch on" LDA loss
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = True

# Initialize the model
t0 = time()

print('INFO: initializing lda2vec model')
m = model(num_docs,
          vocab_size,
          num_topics,
          embedding_size=embed_size,
          pretrained_embeddings=pretrained_embeddings,
          freqs=freqs,
          batch_size = batch_size,
          save_graph_def=save_graph)

print('INFO: finished initializing lda2vec model in %0.3fs.' % (time() - t0))


# Train the model
t0 = time()

print('INFO: training lda2vec')
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=switch_loss_epoch)

print('INFO: finished training lda2vec model in %0.3fs.' % (time() - t0))


1.15.0
INFO: loading preprocessed data
INFO: finished loading preprocessed data in 9.305s.
INFO: initializing lda2vec model






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.






INFO: finished initializing lda2vec model in 1.889s.
INFO: training lda2vec




EPOCH: 1
