In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from gensim.models.ldamodel import LdaModel
from gensim.models import LdaMulticore
from gensim import corpora
from collections import defaultdict

In [18]:
import os, multiprocessing

### Load Data

In [3]:
train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
print(train.shape)

(25000, 3)


In [4]:
documents = list(train['review'])

### Create Corpus

In [5]:
stop_words = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

In [6]:
texts = [[word for word in document.lower().split() if word not in stop_words] for document in documents]

In [7]:
FREQ_THRESHOLD = 10
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > FREQ_THRESHOLD] for text in texts]

In [8]:
dictionary = corpora.Dictionary(texts)

In [9]:
len(dictionary)

25046

In [10]:
dictionary.save('models/topic_dict.dict')

In [15]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
corpora.MmCorpus.serialize('models/topic_corpus.mm', corpus)

### Create LDA Model

In [13]:
dictionary = corpora.Dictionary.load('models/topic_dict.dict')

In [None]:
corpus = corpora.MmCorpus.load('models/topic_corpus.mm')

In [19]:
N_WORKERS = multiprocessing.cpu_count() - 1
print('Number of workers: '+ str(N_WORKERS))

Number of workers: 11


In [20]:
N_TOPICS = 50

In [21]:
lda = LdaMulticore(corpus=corpus,
                   id2word=dictionary,
                   num_topics=N_TOPICS,
                   workers=N_WORKERS,
                   chunksize=2500,
                   passes=20,
                   random_state=42)

In [22]:
lda.save('models/topic_lda_model')

### Inspect Topics

In [23]:
lda = LdaMulticore.load('models/topic_lda_model')

In [24]:
lda.print_topics(50)

[(0,
  '0.011*"film" + 0.011*"/><br" + 0.010*"jerry" + 0.008*"-" + 0.007*"like" + 0.006*"i" + 0.006*"it\'s" + 0.005*"tom" + 0.004*"films" + 0.003*"heston"'),
 (1,
  '0.017*"film" + 0.015*"i" + 0.011*"best" + 0.009*"(" + 0.006*"/><br" + 0.005*"just" + 0.004*"like" + 0.004*"it\'s" + 0.004*")" + 0.004*"old"'),
 (2,
  '0.014*"film" + 0.011*"/><br" + 0.009*"-" + 0.005*"man" + 0.003*"great" + 0.003*"love" + 0.003*"story" + 0.003*"role" + 0.003*"best" + 0.003*"film,"'),
 (3,
  '0.034*"!" + 0.014*"/><br" + 0.012*"good" + 0.011*"i" + 0.009*"film" + 0.009*"movie" + 0.008*"great" + 0.005*"lugosi" + 0.004*"like" + 0.004*"it\'s"'),
 (4,
  '0.018*"i" + 0.015*"/><br" + 0.010*"movie" + 0.010*"film" + 0.007*"&" + 0.006*"character" + 0.005*"like" + 0.005*"really" + 0.005*"just" + 0.004*"it\'s"'),
 (5,
  '0.019*"film" + 0.018*"i" + 0.011*"/><br" + 0.009*"good" + 0.008*"great" + 0.006*"story" + 0.006*"like" + 0.005*"just" + 0.005*"movie" + 0.005*"-"'),
 (6,
  '0.056*"&" + 0.016*"/><br" + 0.016*"it\'s" + 0

In [30]:
train['review'].iloc[1000]

"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting worse and worse as the movie went on. Although I know it's suppose to be a comedy but I didn't find it very funny. It was also an especially unrealistic, and jaded portrayal of rural life. In case this is what any of you think country life is like, it's definitely not. I do have to agree that some of the guy cast members were cute, but the french guy was really fake. I do have to agree that it tried to have a good lesson in the story, but overall my recommendation is that no one over 8 watch it, it's just too annoying."

In [31]:
lda[dictionary.doc2bow(train['review'].iloc[1000].lower().split(" "))]

[(21, 0.10138979791276496), (34, 0.88563722911426301)]