# ATX-DAT-2 | Demo 14

# Gensim

Gensim (http://radimrehurek.com/gensim) is a library of language processing tools focused on latent variable models of text.

In [54]:
import os
import math
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from gensim import matutils, models

pd.set_option('display.max_rows', 30)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 30)


plt.style.use('ggplot')

The data is about sentiments on Amazon reviews.

In [75]:
reviews = []
sentiments = []

with open(os.path.join('..', 'datasets', 'amazon-reviews.txt')) as f:
    for line in f.readlines():
        line = line.strip('\n')
        review, sentiment = line.split('\t')
        sentiment = np.nan if sentiment == '' else int(sentiment)

        reviews.append(review.lower())
        sentiments.append(sentiment)

df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})

In [45]:
df.head()

Unnamed: 0,review,sentiment
0,i try not to adjust the volume setting to avoi...,
1,so there is no way for me to plug it in here i...,0.0
2,"good case, excellent value.",1.0
3,i thought motorola made reliable products!.,
4,battery for motorola razr.,


In [46]:
df.dropna(inplace = True) # Let's drop the NaN

In [47]:
df.head()

Unnamed: 0,review,sentiment
1,so there is no way for me to plug it in here i...,0.0
2,"good case, excellent value.",1.0
5,great for the jawbone.,1.0
10,tied to charger for conversations lasting more...,0.0
11,the mic is great.,1.0


## LDA with Gensim

### Let's first translate a set of documents (articles) into a matrix representation with a row per document and a column per feature (word or n-gram)

In [94]:
vectorizer = feature_extraction.text.CountVectorizer(stop_words = 'english')

vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [77]:
documents = vectorizer.fit_transform(df.review)

In [78]:
# Let's now build a mapping of numerical ID to word

id2word = dict(enumerate(vectorizer.get_feature_names()))

In [79]:
len(id2word)

8383

### We want to learn which columns are correlated (i.e., likely to come from the same topic).  This is the word distribution.  We can also determine what topics are in each document, the topic distribution.

In [80]:
# First we convert our word-matrix into gensim's format

corpus = matutils.Sparse2Corpus(documents, documents_columns = False)

(Check https://radimrehurek.com/gensim/matutils as needed)

In [81]:
corpus

<gensim.matutils.Sparse2Corpus at 0x11b4160d0>

(Check https://radimrehurek.com/gensim/models/ldamodel as needed)

In [83]:
# Then we fit an LDA model
model=models.ldamodel.LdaModel(corpus = corpus, num_topics = 10, id2word = id2word, passes = 10)

In this model, we need to explicitly specify the number of topic we want the model to uncover.  This is a critical parameter, but there isn't much guidance on how to choose it.  Try to use domain expertise where possible.

In [19]:
model

<gensim.models.ldamodel.LdaModel at 0x118c9f690>

### Goodness of fit

Now we need to assess the goodness of fit for our model.  Like other unsupervised learning techniques, our validation techniques are mostly about interpretation.

Use the following questions to guide you:
- Did we learn reasonable topics?
- Do the words that make up a topic make sense?
- Is this topic helpful towards our goal?

In [59]:
model.print_topics()

[(19,
  u'0.029*phone + 0.015*great + 0.015*belt + 0.010*make + 0.010*think + 0.010*plan + 0.010*trying + 0.010*plus + 0.010*sure + 0.010*disappointing'),
 (17,
  u'0.041*phone + 0.026*does + 0.022*work + 0.014*charge + 0.013*cool + 0.010*product + 0.010*better + 0.010*said + 0.007*worked + 0.007*time'),
 (3,
  u'0.088*battery + 0.028*life + 0.026*great + 0.022*long + 0.015*works + 0.012*case + 0.012*don + 0.009*lot + 0.007*using + 0.006*phone'),
 (6,
  u'0.037*phone + 0.032*piece + 0.020*junk + 0.019*love + 0.016*disappointment + 0.016*device + 0.014*just + 0.012*screen + 0.012*want + 0.008*features'),
 (8,
  u'0.075*phone + 0.042*great + 0.023*case + 0.020*doesn + 0.016*charger + 0.012*works + 0.011*pleased + 0.011*excellent + 0.010*using + 0.010*work'),
 (12,
  u'0.027*bad + 0.020*new + 0.013*car + 0.013*like + 0.013*love + 0.012*better + 0.011*pretty + 0.010*ear + 0.010*software + 0.010*work'),
 (9,
  u'0.043*waste + 0.034*money + 0.030*phone + 0.016*cell + 0.016*product + 0.015*do

Some topics will be clearer than others.  The following topics represent clear concepts:
- Cooking and Recipes: 0.009 \* cup + 0.009 \* recipe + 0.007 \* make + 0.007 \* food + 0.006 \* sugar
- Cooking and recipes: 0.013 \* butter + 0.010 \* baking + 0.010 \* dough + 0.009 \* cup + 0.009 \* sugar
- Fashion and Style: 0.013 \* fashion + 0.006 \* like + 0.006 \* dress + 0.005 \* style

In [85]:
num_topics = 20
n_words_per_topic = 100
for ti, topic in enumerate(model.show_topics(num_words=n_words_per_topic, num_topics=num_topics)):
    print("Topic: %d" % (ti))
    print (topic)    
    print()

Topic: 0
(0, u'0.064*buy + 0.021*don + 0.016*pay + 0.014*phone + 0.014*card + 0.013*good + 0.012*purchase + 0.011*seller + 0.011*sony + 0.010*piece + 0.010*value + 0.009*product + 0.008*junk + 0.008*decent + 0.008*sure + 0.007*try + 0.007*amazon + 0.007*pleased + 0.007*item + 0.007*ericsson + 0.006*pictures + 0.006*sim + 0.006*need + 0.006*thanks + 0.006*thought + 0.006*memory + 0.006*forget + 0.006*return + 0.005*care + 0.005*calling + 0.005*sucks + 0.005*tell + 0.005*100 + 0.005*sold + 0.005*plan + 0.004*definitely + 0.004*thinking + 0.004*hope + 0.004*data + 0.004*impossible + 0.004*satisfied + 0.004*loved + 0.004*technology + 0.004*crap + 0.004*kit + 0.004*fault + 0.004*research + 0.003*available + 0.003*tone + 0.003*expensive + 0.003*cingular + 0.003*plastic + 0.003*contacts + 0.003*friend + 0.003*skype + 0.003*earbuds + 0.003*breaks + 0.003*sd + 0.003*rating + 0.003*band + 0.003*ones + 0.003*ringtone + 0.003*thats + 0.003*run + 0.003*process + 0.003*50 + 0.003*ordering + 0.003*cl

In [67]:
model.show_topics??

## Word2Vec with Gensim

In [86]:
# Setup the body text
sentences = df.review.map(lambda review: review.split())

In [88]:
sentences

0        [i, try, not, to, adjust, the, volume, setting...
1        [so, there, is, no, way, for, me, to, plug, it...
2                         [good, case,, excellent, value.]
3        [i, thought, motorola, made, reliable, product...
4                          [battery, for, motorola, razr.]
5                              [great, for, the, jawbone.]
6        [when, i, got, this, item, it, was, larger, th...
7        [(i, looked, for, one, that, specifically, sai...
8        [the, first, time, it, was, turned, on, the, s...
9        [in, some, programs, clicking, it, is, the, sa...
10       [tied, to, charger, for, conversations, lastin...
11                                  [the, mic, is, great.]
12       [what, happened, was, that, i, only, had, like...
13       [i, have, to, jiggle, the, plug, to, get, it, ...
14       [i, bought, five, of, thes, for, less, than, f...
                               ...                        
14989                                  [not, spectacular

In [87]:
model = models.Word2Vec(sentences, size = 100, window = 5, min_count = 5, workers = 2)

`Word2Vec` has many arguments:
- `size` represents how many concepts or topics we should use
- `window` represents how many words surrounding a sentence we should use as our original feature
- `min_count` is the number of times that context or word must appear
- `workers` is the number of CPU cores to use to speed up model training

(Check http://radimrehurek.com/gensim/models/word2vec as needed)

In [89]:
model

<gensim.models.word2vec.Word2Vec at 0x11c089d10>

### Most similar words

The model has a `most_similar` function that helps find the words most similar to the one you queried.  This will return words that are most often used in the same context.

In [91]:
model.most_similar(positive = ['ear'])

[('volume', 0.9977969527244568),
 ('ear.', 0.9967939853668213),
 ('buttons', 0.9965591430664062),
 ('comfortable', 0.9965134263038635),
 ('video', 0.9961884021759033),
 ('side', 0.9961011409759521),
 ('loud,', 0.9959816932678223),
 ('ear,', 0.9959695339202881),
 ('quiet', 0.9959319829940796),
 ('button', 0.995930552482605)]

In [34]:
vectorizer.get_feature_names()

In [None]:
sentences

In [93]:
sentences = list(map(lambda sentence: list(filter(lambda word: word in vectorizer.get_feature_names(), sentence)), sentences))

KeyboardInterrupt: 

In [92]:
sentences

0        [i, try, not, to, adjust, the, volume, setting...
1        [so, there, is, no, way, for, me, to, plug, it...
2                         [good, case,, excellent, value.]
3        [i, thought, motorola, made, reliable, product...
4                          [battery, for, motorola, razr.]
5                              [great, for, the, jawbone.]
6        [when, i, got, this, item, it, was, larger, th...
7        [(i, looked, for, one, that, specifically, sai...
8        [the, first, time, it, was, turned, on, the, s...
9        [in, some, programs, clicking, it, is, the, sa...
10       [tied, to, charger, for, conversations, lastin...
11                                  [the, mic, is, great.]
12       [what, happened, was, that, i, only, had, like...
13       [i, have, to, jiggle, the, plug, to, get, it, ...
14       [i, bought, five, of, thes, for, less, than, f...
                               ...                        
14989                                  [not, spectacular

In [37]:
model = models.Word2Vec(sentences, size = 100, window = 5, min_count = 5, workers = 4)

In [41]:
model.most_similar(positive = ['phone'])

[('phones', 0.22747108340263367),
 ('piece', 0.22566647827625275),
 ('battery', 0.19638335704803467),
 ('does', 0.18861348927021027),
 ('hard', 0.18570223450660706),
 ('cheap', 0.18267108500003815),
 ('item', 0.17193655669689178),
 ('days', 0.17131055891513824),
 ('fits', 0.16991697251796722),
 ('broke', 0.16979514062404633)]

In [95]:
import detectEnglish
detectEnglish.isEnglish('chris')

ImportError: No module named detectEnglish

In [98]:
from spacy.en import English
nlp_toolkit = English()

parsed = nlp_toolkit(u'Chris is lost')

for (i, word) in enumerate(parsed): 
    print word
    print "\tParent: {}".format(word.head.lemma_)
    print "\tPhrase type: {}".format(word.dep_)
    print "\tKnown entity type: {}".format(word.ent_type_ if word.ent_type_ else 'n/a')
    print "\tLemma: {}".format(word.lemma_)


Chris
	Parent: lose
	Phrase type: nsubjpass
	Known entity type: PERSON
	Lemma: chris
is
	Parent: lose
	Phrase type: auxpass
	Known entity type: n/a
	Lemma: be
lost
	Parent: lose
	Phrase type: ROOT
	Known entity type: n/a
	Lemma: lose
