# LDA on Reviews
##### Sean Wade

In [4]:
import pandas as pd
import tqdm

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

from IPython.core.display import HTML
import urllib2
HTML(urllib2.urlopen('http://seanwade.com/jupyter.css').read())

In [9]:
!pwd

/Users/seanwade/projects/yelp-prediction/sean/nlp/lda


In [10]:
users = pd.read_csv('../../../raw_data/csv/yelp_academic_dataset_user.csv')
review = pd.read_csv('../../../raw_data/csv/yelp_academic_dataset_review.csv')

In [15]:
short_reviews = review[:3]

In [31]:
doc_set = list(review['text'].apply(lambda x: x.decode('utf-8')))

In [32]:
doc_set

[u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.',
 u"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition.",
 u'Yes this place is a little out dated and not opened on the weekend. But other than that the staff is always pleasant and fast to make your order. Which is always spot on fresh veggies on their hoggies and other food. They also have daily specials and ice cream which is really good. I had a banana split they piled the toppings on. They win pennysaver awards ever years i see why.',
 u'PROS: Ital

## Clean & Prepare Data

In [33]:
texts_list = []
for doc in tqdm.tqdm(doc_set):
    
    # Parse the doc into tokens
    tokenizer = RegexpTokenizer(r'\w+')
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # Remove stop words
    en_stop = stopwords.words('english')
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # Stem the words
    p_stemmer = PorterStemmer()
    texts = [p_stemmer.stem(i) for i in stopped_tokens]
    texts_list.append(texts)



  0%|          | 0/2685066 [00:00<?, ?it/s][A[A

  0%|          | 28/2685066 [00:00<2:44:07, 272.67it/s][A[A

  0%|          | 65/2685066 [00:00<2:31:37, 295.15it/s][A[A

  0%|          | 105/2685066 [00:00<2:20:18, 318.92it/s][A[A

  0%|          | 139/2685066 [00:00<2:23:38, 311.52it/s][A[A

  0%|          | 179/2685066 [00:00<2:15:42, 329.75it/s][A[A

  0%|          | 215/2685066 [00:00<2:12:47, 336.97it/s][A[A

  0%|          | 260/2685066 [00:00<2:03:35, 362.07it/s][A[A

  0%|          | 295/2685066 [00:00<2:15:30, 330.22it/s][A[A

  0%|          | 328/2685066 [00:00<2:17:31, 325.37it/s][A[A

  0%|          | 361/2685066 [00:01<2:26:19, 305.80it/s][A[A

  0%|          | 409/2685066 [00:01<2:10:53, 341.82it/s][A[A

  0%|          | 455/2685066 [00:01<2:01:01, 369.70it/s][A[A

  0%|          | 494/2685066 [00:01<2:06:15, 354.36it/s][A[A

  0%|          | 535/2685066 [00:01<2:01:37, 367.89it/s][A[A

  0%|          | 573/2685066 [00:01<2:10:52, 341.88it

IndexError: string index out of range



           11%|█         | 301036/2685066 [16:58<2:14:28, 295.47it/s][A[A

## Constructing Document-Term Matrix

In [23]:
dictionary = corpora.Dictionary(texts_list)
dictionary.token2id

{u'forget': 566,
 u'chain': 160,
 u'gp': 639,
 u'hint': 961,
 u'consum': 1096,
 u'month': 401,
 u'four': 971,
 u'catch': 1061,
 u'sleev': 1006,
 u'ice': 61,
 u'gluten': 1132,
 u'freakin': 641,
 u'go': 164,
 u'shot': 600,
 u'mill': 133,
 u'golfer': 408,
 u'yinzer': 281,
 u'carpet': 411,
 u'extra': 333,
 u'decid': 312,
 u'accur': 535,
 u'buddi': 783,
 u'sorri': 969,
 u'flash': 213,
 u'readabl': 1020,
 u'friendli': 88,
 u'thursday': 947,
 u'spirit': 866,
 u'0': 282,
 u'certainli': 647,
 u'glad': 578,
 u'leav': 808,
 u'pride': 1024,
 u'worth': 582,
 u'hill': 1127,
 u'sound': 957,
 u'boatload': 855,
 u'woman': 239,
 u'everi': 488,
 u'abund': 998,
 u'sausag': 1097,
 u'far': 426,
 u'hard': 894,
 u'vestig': 621,
 u'unbear': 1073,
 u'electron': 1022,
 u'worst': 135,
 u'failur': 94,
 u'cook': 23,
 u'poorli': 929,
 u'enter': 748,
 u'seemingli': 659,
 u'cool': 1145,
 u'tri': 324,
 u'school': 615,
 u'norfolk': 590,
 u'hour': 438,
 u'fantast': 280,
 u'die': 554,
 u'disqualifi': 181,
 u'list': 774,
 

In [24]:
corpus = [dictionary.doc2bow(text) for text in texts_list]

In [25]:
corpus

[[(0, 1),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(19, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1)],
 [(7, 1),
  (10, 1),
  (18, 1),
  (19, 1),
  (33, 1),
  (35, 1),
  (48, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1)],
 [(2, 2),
  (18, 1),
  (33, 1),
  (34, 1),
  (78, 1),
  (79, 1

corpus is a document term matrix

## Make LDA Model

In [26]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, id2word = dictionary,  distributed=True)

In [27]:
print ldamodel.print_topics(num_topics=75, num_words=5)

[(7, u'0.058*"food" + 0.058*"good" + 0.031*"love" + 0.030*"tri" + 0.029*"place"'), (12, u'0.053*"sandwich" + 0.040*"fish" + 0.033*"reuben" + 0.020*"pretti" + 0.017*"portion"'), (2, u'0.029*"place" + 0.020*"alexion" + 0.020*"part" + 0.020*"cream" + 0.020*"factori"'), (49, u'0.021*"fish" + 0.021*"year" + 0.021*"old" + 0.021*"day" + 0.021*"ask"'), (43, u'0.001*"ad" + 0.001*"polic" + 0.001*"street" + 0.001*"scotch" + 0.001*"forev"'), (27, u'0.026*"ball" + 0.026*"hole" + 0.026*"cart" + 0.013*"get" + 0.013*"lot"'), (4, u'0.071*"wing" + 0.055*"sauc" + 0.040*"like" + 0.032*"place" + 0.024*"butter"'), (28, u'0.001*"hoagi" + 0.001*"sandwich" + 0.001*"side" + 0.001*"sauc" + 0.001*"hot"'), (35, u'0.033*"place" + 0.030*"fish" + 0.024*"sandwich" + 0.022*"stop" + 0.016*"year"'), (24, u'0.001*"ad" + 0.001*"polic" + 0.001*"street" + 0.001*"scotch" + 0.001*"forev"')]


In [28]:
ldamodel.print_topic(1)

u'0.001*"fish" + 0.001*"good" + 0.001*"da" + 0.001*"reuben" + 0.001*"tabl" + 0.001*"could" + 0.001*"burgh" + 0.001*"look" + 0.001*"see" + 0.001*"anoth"'

## Visualize

In [29]:
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [30]:
pyLDAvis.display(vis)

In [45]:
ldamodel.save('s.p')