In [118]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import logging
from collections import Counter

import gensim
from gensim.parsing.preprocessing import STOPWORDS

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import torch
from torchtext import data, datasets

import re
import html
re1 = re.compile(r'  +')

In [119]:
data_path = Path("../data/processed/") / "aclImdb"

In [120]:
def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [121]:
spacy_en = spacy.load('en')
def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(fixup(text)) if tok.text not in STOPWORDS]

In [122]:
# ex = test_ds.examples[0]
# for tok in spacy_en(" ".join(ex.text)):
#     print(tok.text, tok.lemma_)

In [123]:
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False)

# train_ds = datasets.IMDB(str(data_path / "train"), TEXT, LABEL)
test_ds = datasets.IMDB(str(data_path / "test"), TEXT, LABEL)

In [124]:
dictionary = gensim.corpora.Dictionary([ex.text for ex in test_ds.examples])
dictionary.filter_extremes(no_below=20, no_above=0.2)

In [125]:
counts = Counter(dictionary.dfs)
[(dictionary[i], c) for i, c in counts.most_common(10)]

[('want', 4968),
 ('...', 4950),
 ('and', 4916),
 ('a', 4825),
 ('man', 4817),
 ("'", 4773),
 ('try', 4707),
 ('there', 4678),
 ('in', 4626),
 ('year', 4603)]

In [126]:
corpus = [dictionary.doc2bow(ex.text) for ex in test_ds.examples]
gensim.corpora.MmCorpus.serialize('/tmp/imdb_bow.mm', corpus)

In [127]:
mm_corpus = gensim.corpora.MmCorpus('/tmp/imdb_bow.mm')
print(mm_corpus)

MmCorpus(25000 documents, 9527 features, 1772669 non-zero entries)


In [128]:
tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=dictionary)

In [129]:
lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=dictionary, num_topics=20)

In [130]:
lsi_model.print_topics()

[(0,
  '0.194*"\'" + 0.129*"..." + 0.114*"*" + 0.079*";" + 0.078*"and" + 0.077*"wrong" + 0.076*"funny" + 0.075*"feel" + 0.075*"man" + 0.075*"want"'),
 (1,
  '-0.987*"*" + 0.040*"\'" + -0.036*"1/2" + -0.032*"spoilers" + -0.026*"spoiler" + 0.018*"..." + 0.015*"episode" + 0.014*"life" + 0.013*"enjoy" + 0.013*"series"'),
 (2,
  '-0.956*"\'" + 0.076*"..." + -0.046*"`" + 0.040*"guy" + 0.040*"wrong" + 0.040*"funny" + 0.034*".." + 0.033*"horror" + 0.032*"stupid" + 0.031*"if"'),
 (3,
  '0.574*"..." + 0.206*".." + 0.139*"\'" + 0.127*"wrong" + 0.124*"...." + -0.113*"--" + 0.104*"horror" + -0.103*"war" + 0.102*"waste" + 0.091*"laugh"'),
 (4,
  '-0.572*"..." + 0.385*"&" + 0.238*"horror" + -0.162*".." + -0.159*"book" + 0.110*"budget" + 0.096*"effect" + 0.095*"low" + 0.091*"wrong" + 0.087*"gore"'),
 (5,
  '-0.802*"&" + -0.301*"..." + 0.152*"funny" + -0.145*".." + 0.123*"episode" + 0.113*"laugh" + 0.082*"series" + 0.071*"wrong" + 0.070*"joke" + 0.063*"waste"'),
 (6,
  '0.366*"episode" + 0.325*"&" + 0.

In [131]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 25000)
lda_model = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=20, id2word=dictionary, passes=10, workers=6)

In [132]:
lda_model.print_topics()

[(0,
  '0.017*"episode" + 0.012*"series" + 0.006*"--" + 0.005*"in" + 0.005*";" + 0.005*"show" + 0.005*"season" + 0.004*"year" + 0.004*"new" + 0.003*"child"'),
 (1,
  '0.005*"\'" + 0.005*"little" + 0.004*"director" + 0.004*"black" + 0.004*"feel" + 0.003*"point" + 0.003*"in" + 0.003*"style" + 0.003*"there" + 0.003*"white"'),
 (2,
  '0.006*";" + 0.006*"performance" + 0.005*"man" + 0.005*"role" + 0.004*"director" + 0.004*"set" + 0.004*"woman" + 0.004*"in" + 0.003*"\'" + 0.003*"cast"'),
 (3,
  '0.009*"music" + 0.007*"song" + 0.007*"&" + 0.006*"cartoon" + 0.005*"kid" + 0.005*"video" + 0.005*"want" + 0.005*"little" + 0.005*"if" + 0.004*"year"'),
 (4,
  '0.006*"series" + 0.006*"jack" + 0.005*"zombie" + 0.005*"sci" + 0.005*"fi" + 0.004*"alien" + 0.004*"star" + 0.004*"tv" + 0.004*"wrong" + 0.004*"original"'),
 (5,
  '0.014*";" + 0.007*"man" + 0.005*"tell" + 0.005*"a" + 0.005*"in" + 0.004*"and" + 0.004*"woman" + 0.004*"--" + 0.003*"there" + 0.003*"moment"'),
 (6,
  '0.033*"book" + 0.012*"read" + 