# Demo 13 Latent Variable Modeling with `gensim`

`gensim` (http://radimrehurek.com/gensim) is a library of language processing tools focused on latent variable models of text.

In [1]:
import os
from os import path
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from gensim.models.word2vec import Word2Vec

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')



The data is about sentiments on Amazon reviews.

In [2]:
reviews = []
sentiments = []

with open('amazon-reviews.txt') as f:
    for line in f.readlines():
        line = line.strip('\n')
        review, sentiment = line.split('\t')
        sentiment = np.nan if sentiment == '' else int(sentiment)

        reviews.append(review.lower())
        sentiments.append(sentiment)

df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,i try not to adjust the volume setting to avoi...,
1,so there is no way for me to plug it in here i...,0.0
2,"good case, excellent value.",1.0
3,i thought motorola made reliable products!.,
4,battery for motorola razr.,


In [4]:
df.dropna(inplace = True) # Let's drop the NaN

In [5]:
df.head()

Unnamed: 0,review,sentiment
1,so there is no way for me to plug it in here i...,0.0
2,"good case, excellent value.",1.0
5,great for the jawbone.,1.0
10,tied to charger for conversations lasting more...,0.0
11,the mic is great.,1.0


## LDA with `gensim`

### Let's first translate a set of documents (articles) into a matrix representation with a row per document and a column per feature (word or n-gram)

In [6]:
vect = CountVectorizer(stop_words = 'english')

In [7]:
documents = vect.fit_transform(df.review)

In [8]:
# Let's now build a mapping of numerical ID to word

id2word = dict(enumerate(vect.get_feature_names()))

In [9]:
id2word

{0: '10',
 1: '100',
 2: '11',
 3: '12',
 4: '13',
 5: '15',
 6: '15g',
 7: '18',
 8: '20',
 9: '2000',
 10: '2005',
 11: '2160',
 12: '24',
 13: '2mp',
 14: '325',
 15: '350',
 16: '375',
 17: '3o',
 18: '42',
 19: '44',
 20: '45',
 21: '4s',
 22: '50',
 23: '5020',
 24: '510',
 25: '5320',
 26: '680',
 27: '700w',
 28: '8125',
 29: '8525',
 30: '8530',
 31: 'abhor',
 32: 'ability',
 33: 'able',
 34: 'abound',
 35: 'absolutel',
 36: 'absolutely',
 37: 'ac',
 38: 'accept',
 39: 'acceptable',
 40: 'access',
 41: 'accessable',
 42: 'accessing',
 43: 'accessory',
 44: 'accessoryone',
 45: 'accidentally',
 46: 'accompanied',
 47: 'according',
 48: 'activate',
 49: 'activated',
 50: 'activesync',
 51: 'actually',
 52: 'ad',
 53: 'adapter',
 54: 'adapters',
 55: 'add',
 56: 'addition',
 57: 'additional',
 58: 'address',
 59: 'adhesive',
 60: 'adorable',
 61: 'advertised',
 62: 'advise',
 63: 'aggravating',
 64: 'ago',
 65: 'alarm',
 66: 'allot',
 67: 'allow',
 68: 'allowing',
 69: 'allows',


### We want to learn which columns are correlated (i.e., likely to come from the same topic).  This is the word distribution.  We can also determine what topics are in each document, the topic distribution.

In [10]:
# First we convert our word-matrix into gensim's format

corpus = matutils.Sparse2Corpus(documents, documents_columns = False)

(Check https://radimrehurek.com/gensim/matutils as needed)

(Check https://radimrehurek.com/gensim/models/ldamodel as needed)

In [11]:
# Then we fit an LDA model

lda = LdaModel(corpus = corpus, num_topics = 25, id2word = id2word, passes = 10)

In this model, we need to explicitly specify the number of topic we want the model to uncover.  This is a critical parameter, but there isn't much guidance on how to choose it.  Try to use domain expertise where possible.

### Goodness of fit

Now we need to assess the goodness of fit for our model.  Like other unsupervised learning techniques, our validation techniques are mostly about interpretation.

Use the following questions to guide you:
- Did we learn reasonable topics?
- Do the words that make up a topic make sense?
- Is this topic helpful towards our goal?

In [12]:
lda.print_topics()

[(9,
  '0.052*"phone" + 0.012*"worked" + 0.012*"ear" + 0.012*"color" + 0.012*"job" + 0.009*"nice" + 0.008*"battery" + 0.008*"don" + 0.008*"great" + 0.008*"impressed"'),
 (8,
  '0.178*"great" + 0.066*"phone" + 0.051*"works" + 0.030*"love" + 0.022*"waste" + 0.018*"product" + 0.017*"time" + 0.015*"ve" + 0.014*"device" + 0.012*"money"'),
 (23,
  '0.037*"nice" + 0.033*"phone" + 0.018*"work" + 0.015*"did" + 0.012*"weeks" + 0.012*"better" + 0.012*"problem" + 0.012*"mistake" + 0.011*"good" + 0.010*"make"'),
 (17,
  '0.021*"charger" + 0.018*"great" + 0.015*"works" + 0.013*"quality" + 0.012*"use" + 0.012*"phone" + 0.012*"people" + 0.011*"nice" + 0.008*"seriously" + 0.008*"case"'),
 (15,
  '0.028*"ve" + 0.027*"phone" + 0.027*"far" + 0.020*"good" + 0.017*"does" + 0.016*"sound" + 0.012*"quality" + 0.012*"like" + 0.012*"best" + 0.009*"ear"'),
 (6,
  '0.038*"buy" + 0.027*"don" + 0.020*"product" + 0.017*"phone" + 0.016*"makes" + 0.012*"calls" + 0.012*"easier" + 0.011*"working" + 0.011*"hold" + 0.010*"

In [13]:
# We can limit what is returned by specifying the number of topics we're interested in and the number of works to return
num_topics = 10
num_words = 5
for ti, topic in enumerate(lda.show_topics(num_topics = num_topics, num_words= num_words)):
    print("Topic: %d" % (ti))
    print (topic)
    print()


Topic: 0
(18, '0.050*"comfortable" + 0.025*"ear" + 0.019*"ve" + 0.018*"useless" + 0.018*"order"')

Topic: 1
(21, '0.040*"service" + 0.031*"phone" + 0.030*"does" + 0.024*"customer" + 0.022*"better"')

Topic: 2
(9, '0.052*"phone" + 0.012*"worked" + 0.012*"ear" + 0.012*"color" + 0.012*"job"')

Topic: 3
(4, '0.075*"good" + 0.072*"price" + 0.055*"product" + 0.031*"excellent" + 0.025*"purchase"')

Topic: 4
(0, '0.023*"good" + 0.023*"don" + 0.018*"design" + 0.015*"service" + 0.014*"problems"')

Topic: 5
(12, '0.086*"phone" + 0.028*"worst" + 0.020*"new" + 0.018*"sturdy" + 0.015*"great"')

Topic: 6
(17, '0.021*"charger" + 0.018*"great" + 0.015*"works" + 0.013*"quality" + 0.012*"use"')

Topic: 7
(5, '0.040*"happy" + 0.033*"horrible" + 0.024*"phone" + 0.018*"product" + 0.015*"stay"')

Topic: 8
(14, '0.049*"poor" + 0.037*"good" + 0.036*"quality" + 0.025*"phone" + 0.019*"just"')

Topic: 9
(10, '0.028*"headset" + 0.024*"time" + 0.022*"bluetooth" + 0.019*"long" + 0.016*"love"')



In [15]:
#Similar method but without formatting
lda.print_topics(num_topics=3, num_words=5)

[(18,
  '0.050*"comfortable" + 0.025*"ear" + 0.019*"ve" + 0.018*"useless" + 0.018*"order"'),
 (2,
  '0.044*"quality" + 0.028*"sound" + 0.027*"case" + 0.026*"piece" + 0.025*"use"'),
 (21,
  '0.040*"service" + 0.031*"phone" + 0.030*"does" + 0.024*"customer" + 0.022*"better"')]

Some topics will be clearer than others.  The following topics represent clear concepts:
- Cooking and Recipes: 0.009 \* cup + 0.009 \* recipe + 0.007 \* make + 0.007 \* food + 0.006 \* sugar
- Cooking and recipes: 0.013 \* butter + 0.010 \* baking + 0.010 \* dough + 0.009 \* cup + 0.009 \* sugar
- Fashion and Style: 0.013 \* fashion + 0.006 \* like + 0.006 \* dress + 0.005 \* style

## Word2Vec with `gensim`

### Preparing the Input
Starting from the beginning, gensim’s word2vec expects a sequence of sentences as its input. Each sentence a list of words (utf8 strings):

In [16]:
# Setup the body text
sentences = df.review.map(lambda review: review.split())

In [17]:
sentences

1       [so, there, is, no, way, for, me, to, plug, it...
2                        [good, case,, excellent, value.]
5                             [great, for, the, jawbone.]
10      [tied, to, charger, for, conversations, lastin...
11                                 [the, mic, is, great.]
                              ...                        
2925    [the, screen, does, get, smudged, easily, beca...
2930    [what, a, piece, of, junk.., i, lose, more, ca...
2934                   [item, does, not, match, picture.]
2935    [the, only, thing, that, disappoint, me, is, t...
2937    [you, can, not, answer, calls, with, the, unit...
Name: review, Length: 1000, dtype: object

### Training
Word2vec accepts several parameters that affect both training speed and quality.

One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them:

**1
model = Word2Vec(sentences, min_count=10)**  

Note: default value is 5
A reasonable value for **min_count** is between 0-100, depending on the size of your dataset.

Another parameter is the size of the **NN layers**, which correspond to the “degrees” of freedom the training algorithm has:

**1
model = Word2Vec(sentences, size=200)**  

Note: default value is 100
Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.

The last of the major parameters (full list here) is for **training parallelization**, to speed up training:

**1
model = Word2Vec(sentences, workers=4)** 

Note: default = 1 worker = no parallelization
The workers parameter has only effect if you have Cython installed. Without Cython, you’ll only be able to use one core because of the GIL (and word2vec training will be miserably slow).

In [18]:
# So lets train against our previously prepared data
model = Word2Vec(sentences, size = 100, window = 5, min_count = 5, workers = 4)

# I threw another parameter in there - of which there are many - what does it do?

## Evaluating
Word2vec training is an unsupervised task, there’s no good way to objectively evaluate the result. Evaluation depends on your end application.

Google have released their testing set of about 20,000 syntactic and semantic test examples, following the “A is to B as C is to D” task: https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt.

Gensim support the same evaluation set, in exactly the same format:

Once again, good performance on this test set doesn’t mean word2vec will work well in your application, or vice versa. It’s always best to evaluate directly on your intended task.

`Word2Vec` has many arguments:
- `size` represents how many concepts or topics we should use
- `window` represents how many words surrounding a sentence we should use as our original feature
- `min_count` is the number of times that context or word must appear
- `workers` is the number of CPU cores to use to speed up model training

(Check http://radimrehurek.com/gensim/models/word2vec as needed)

### Most similar words

The model has a `most_similar` function that helps find the words most similar to the one you queried.  This will return words that are most often used in the same context.

In [19]:
model.wv.most_similar(positive = ['great'])

[('and', 0.9992222785949707),
 ('not', 0.9991952776908875),
 ('is', 0.9991869330406189),
 ('of', 0.9991730451583862),
 ('with', 0.9991649389266968),
 ('my', 0.9991316199302673),
 ('it', 0.9991289377212524),
 ('a', 0.9991236925125122),
 ('the', 0.9991183280944824),
 ('i', 0.9990901947021484)]

In [20]:
# That doesn't seem to line up with what I had in my mind. Let's have a look at the features available

vect.get_feature_names()

# Golly... that list is long. Now we're remembering the idea of pre-processing from last class.

['10',
 '100',
 '11',
 '12',
 '13',
 '15',
 '15g',
 '18',
 '20',
 '2000',
 '2005',
 '2160',
 '24',
 '2mp',
 '325',
 '350',
 '375',
 '3o',
 '42',
 '44',
 '45',
 '4s',
 '50',
 '5020',
 '510',
 '5320',
 '680',
 '700w',
 '8125',
 '8525',
 '8530',
 'abhor',
 'ability',
 'able',
 'abound',
 'absolutel',
 'absolutely',
 'ac',
 'accept',
 'acceptable',
 'access',
 'accessable',
 'accessing',
 'accessory',
 'accessoryone',
 'accidentally',
 'accompanied',
 'according',
 'activate',
 'activated',
 'activesync',
 'actually',
 'ad',
 'adapter',
 'adapters',
 'add',
 'addition',
 'additional',
 'address',
 'adhesive',
 'adorable',
 'advertised',
 'advise',
 'aggravating',
 'ago',
 'alarm',
 'allot',
 'allow',
 'allowing',
 'allows',
 'alot',
 'aluminum',
 'amazed',
 'amazing',
 'amazon',
 'amp',
 'ample',
 'angeles',
 'angle',
 'answer',
 'ant',
 'antena',
 'anti',
 'apart',
 'apartment',
 'apparently',
 'appealing',
 'appearance',
 'appears',
 'applifies',
 'appointments',
 'area',
 'arguing',
 'a

In [21]:
# Lets remember what we have to work with
sentences

1       [so, there, is, no, way, for, me, to, plug, it...
2                        [good, case,, excellent, value.]
5                             [great, for, the, jawbone.]
10      [tied, to, charger, for, conversations, lastin...
11                                 [the, mic, is, great.]
                              ...                        
2925    [the, screen, does, get, smudged, easily, beca...
2930    [what, a, piece, of, junk.., i, lose, more, ca...
2934                   [item, does, not, match, picture.]
2935    [the, only, thing, that, disappoint, me, is, t...
2937    [you, can, not, answer, calls, with, the, unit...
Name: review, Length: 1000, dtype: object

In [22]:
# What if I built a way to filter out more pertinent words

sentences2 = list(map(lambda sentence: list(filter(lambda word: word in vect.get_feature_names(), sentence)), sentences))

In [23]:
# Let's visualize it again to see what happened to my sentences. 
sentences2

# That looks better - shorter lists of the words related to the feature_names

[['way', 'plug', 'unless'],
 ['good', 'excellent'],
 ['great'],
 ['tied', 'charger', 'conversations', 'lasting', '45'],
 ['mic'],
 ['jiggle', 'plug', 'line', 'right', 'decent'],
 ['dozen', 'imagine', 'fun', 'sending'],
 ['razr'],
 ['needless', 'wasted'],
 ['waste', 'money'],
 ['sound', 'quality'],
 ['impressed', 'going', 'original', 'battery', 'extended'],
 ['seperated',
  'mere',
  'ft',
  'started',
  'notice',
  'excessive',
  'static',
  'garbled',
  'sound'],
 ['good', 'quality'],
 ['design', 'ear', 'comfortable'],
 ['highly', 'recommend', 'blue', 'tooth'],
 ['advise'],
 ['far'],
 ['works'],
 ['clicks', 'place', 'way', 'makes', 'wonder', 'long', 'mechanism'],
 ['went', 'website', 'followed', 'pair'],
 ['bought', 'use', 'kindle', 'absolutely', 'loved'],
 ['commercials'],
 ['run', 'new', 'battery', 'bars', 'days'],
 ['bought', 'mother', 'problem'],
 ['great', 'pocket', 'pc', 'phone'],
 ['owned', 'phone', 'months', 'say', 'best', 'mobile', 'phone'],
 ['think', 'instructions', 'provid

In [24]:
#Time to drop it back into my model
model_new = Word2Vec(sentences2, size = 100, window = 5, min_count = 5, workers = 4)

In [25]:
#Run this little thing again
model_new.wv.most_similar(positive = ['great'])

#Ok - that looks much better. So.... what did we learn?

[('makes', 0.27968427538871765),
 ('talk', 0.27064287662506104),
 ('works', 0.25068968534469604),
 ('headsets', 0.24682296812534332),
 ('purchase', 0.21913033723831177),
 ('long', 0.18717731535434723),
 ('gets', 0.18573476374149323),
 ('did', 0.18376567959785461),
 ('fits', 0.1788528561592102),
 ('phones', 0.1679503619670868)]

## Preprocessing

Let's look at using pre-processing to my information before I kick it into once of these models - in this case LDA

In [28]:
# Not the best method so let's use some of the pre-processing tools we learned last class to setup a cleaning function.
# You'll need to make sure you have some resources downloaded. Run the below then comment it out for the future
#import nltk
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slongstreet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# What am I grabbing? Stopwords, lemmatizer and string (to deal with punctuation)
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string


In [29]:
# I hate typing the whole thing and I like to set these against key values like with my stopwords
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [26]:
# Time to package these into a function
def clean(doc):
    #Lets create a stopword free list of lowercase letters that do not include my stopwords
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop]) 
    #Now take that list and remove any punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    #Now to lemmatize each word so I have a fully cleaned list
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [30]:
#I'm creating doc_clean to received my cleaned list then saying I want individual words for each document in df.review
doc_clean = [clean(doc).split() for doc in df.review]   

In [31]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [32]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

In [34]:
# Showing Results
ldamodel.print_topics(num_topics=3, num_words=5)

[(0,
  '0.036*"phone" + 0.035*"great" + 0.026*"work" + 0.019*"product" + 0.018*"good"'),
 (1,
  '0.017*"phone" + 0.014*"headset" + 0.012*"charger" + 0.011*"it" + 0.009*"im"'),
 (2,
  '0.018*"phone" + 0.012*"one" + 0.012*"ear" + 0.011*"headset" + 0.011*"ive"')]

## What about Word2vec?

In [71]:
# Can we use any of the word we've done so far? Let's look at doc_clean
doc_clean

[['way', 'plug', 'u', 'unless', 'go', 'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'jawbone'],
 ['tied',
  'charger',
  'conversation',
  'lasting',
  '45',
  'minutesmajor',
  'problem'],
 ['mic', 'great'],
 ['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume'],
 ['several',
  'dozen',
  'several',
  'hundred',
  'contact',
  'imagine',
  'fun',
  'sending',
  'one',
  'one'],
 ['razr', 'owneryou', 'must', 'this'],
 ['needle', 'say', 'wasted', 'money'],
 ['waste', 'money', 'time'],
 ['sound', 'quality', 'great'],
 ['impressed', 'going', 'original', 'battery', 'extended', 'battery'],
 ['two',
  'seperated',
  'mere',
  '5',
  'ft',
  'started',
  'notice',
  'excessive',
  'static',
  'garbled',
  'sound',
  'headset'],
 ['good', 'quality', 'though'],
 ['design', 'odd', 'ear', 'clip', 'comfortable', 'all'],
 ['highly', 'recommend', 'one', 'blue', 'tooth', 'phone'],
 ['advise', 'everyone', 'fooled'],
 ['far', 'good'],
 ['work', 'great'],
 ['click',
 

In [35]:
# It looks right to me. Let's put it back into the model

model_clean = Word2Vec(doc_clean, size = 100, window = 5, min_count = 5, workers = 4)

In [36]:
# How did it do? Let's try this function again
model_new.wv.most_similar(positive = ['great'])

[('makes', 0.27968427538871765),
 ('talk', 0.27064287662506104),
 ('works', 0.25068968534469604),
 ('headsets', 0.24682296812534332),
 ('purchase', 0.21913033723831177),
 ('long', 0.18717731535434723),
 ('gets', 0.18573476374149323),
 ('did', 0.18376567959785461),
 ('fits', 0.1788528561592102),
 ('phones', 0.1679503619670868)]