In [31]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)
import string

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# For text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import spacy
from spacy.lang.en import English

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Importing Gensim
import gensim

# Gensim

Gensim is known as a Natural Language Processing package that does ‘Topic Modeling’. But it is practically much more than that.

It is a great package for processing texts, 
- working with word vector models (such as Word2Vec, FastText etc) 
- and for building topic models.

The core concepts of gensim are:

    Document: some text.
    Corpus: a collection of documents.
    Dictionary: contains a mapping from each word (token) to its unique id.
    Vector: a mathematically convenient representation of a document.
    Model: an algorithm for transforming vectors from one representation to another.
    
## 1. Document

In Gensim, a document is an object of the text sequence type (commonly known as str in Python 3). 

A document could be anything from a short 140 character tweet, a single paragraph (e.g., a journal article abstract), a news article, or a book.

In [64]:
# A single document: one plain Python string.
document = "Human machine interface for lab abc computer applications"

## 2. Corpus

A corpus is a collection of Document objects. Corpora serve two roles in Gensim:

(1) Input for training a Model. During training, the models use this training corpus to look for common themes and topics, initializing their internal model parameters.

Gensim focuses on unsupervised models so that no human intervention, such as costly annotations or tagging documents by hand, is required.

(2) Documents to organize. After training, a topic model can be used to extract topics from new documents (documents not seen in the training corpus).

Such corpora can be indexed for __Similarity Queries__, queried by semantic similarity, clustered etc.

Here is an example corpus. 

It consists of 9 documents, where each document is a string consisting of a single sentence.

In [65]:
# Toy corpus: nine single-sentence documents, each a plain string.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

Another example could be a 
- list of all the plays written by Shakespeare, 
- list of all wikipedia articles, or 
- all tweets by a particular person/object of interest.

After collecting our corpus, there are typically __a number of preprocessing steps__ we want to undertake ...

In [67]:
# set() collapses the repeated 'in' tokens, so each word appears only once.
set('for a of the and to in in in'.split(' '))

{'a', 'and', 'for', 'in', 'of', 'the', 'to'}

In [69]:
# Small hand-picked stopword set for this toy corpus
stoplist = set('for a of the and to in'.split(' '))

# Tokenize each document: lowercase it, split on whitespace, drop stopwords
texts = [
    [token for token in sentence.lower().split() if token not in stoplist]
    for sentence in text_corpus
]

In [70]:
# Inspect the tokenized, stopword-filtered documents.
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [71]:
# Tally how often each token occurs across the whole tokenized corpus
from collections import defaultdict

frequency = defaultdict(int)  # unseen tokens start at 0

for doc_tokens in texts:
    for word in doc_tokens:
        frequency[word] = frequency[word] + 1

In [72]:
# Inspect the corpus-wide token counts.
frequency

defaultdict(int,
            {'human': 2,
             'machine': 1,
             'interface': 2,
             'lab': 1,
             'abc': 1,
             'computer': 2,
             'applications': 1,
             'survey': 2,
             'user': 3,
             'opinion': 1,
             'system': 4,
             'response': 2,
             'time': 2,
             'eps': 2,
             'management': 1,
             'engineering': 1,
             'testing': 1,
             'relation': 1,
             'perceived': 1,
             'error': 1,
             'measurement': 1,
             'generation': 1,
             'random': 1,
             'binary': 1,
             'unordered': 1,
             'trees': 3,
             'intersection': 1,
             'graph': 3,
             'paths': 1,
             'minors': 2,
             'iv': 1,
             'widths': 1,
             'well': 1,
             'quasi': 1,
             'ordering': 1})

In [74]:
# Only keep words that appear more than once in the whole corpus.
# `frequency` maps each token to its corpus-wide count; tokens seen only once
# are dropped, and the order of the remaining tokens in each document is kept.
processed_corpus = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
print(processed_corpus)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


## 3. Dictionary

In gensim, the dictionary contains a mapping from each word (token) to its unique id.

You can create a dictionary from a paragraph of sentences, from a text file that contains multiple lines of text and from multiple such text files contained in a directory. 

In [75]:
from gensim import corpora

In [76]:
# Build the token -> integer id mapping from the filtered, tokenized corpus.
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


## 4. Vector

Our processed corpus has 12 unique words in it, which means that each document will be represented by a 12-dimensional vector under the __bag-of-words__ model. 

We can use the dictionary to turn tokenized documents into these 12-dimensional vectors. We can see what these IDs correspond to:

In [77]:
# Show which integer id was assigned to each token.
dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

### Bag of words

#### doc2bow(document, allow_update=False, return_missing=False)
Convert document (a __list__ of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. 

Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

If _allow_update_ is set, then also update dictionary in the process: create ids for new words. At the same time, update document frequencies – for each word appearing in this document, increase its document frequency (self.dfs) by one.

If allow_update is not set, this function is const, aka read-only.

In [81]:
# Vectorize an unseen document: lowercase, whitespace-tokenize, then map the
# tokens to (token_id, count) pairs using the existing dictionary.
new_doc = "Human computer survey survey survey interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1), (4, 3)]


- The __first entry__ in each tuple corresponds to the ID of the token in the dictionary, 
- the __second__ corresponds to the count of this token.

Note that “interaction” did not occur in the original corpus and so it was not included in the vectorization. 

Also note that this vector only contains entries for words that actually appeared in the document. 

Because any given document will only contain a few words out of the many words in the dictionary, words that do not appear in the vectorization are represented as implicitly zero as a space saving measure.

We can convert our entire original corpus to a list of vectors:

In [82]:
# Recap of the filtered, tokenized corpus that is vectorized next.
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [83]:
# Convert every tokenized document into its sparse bag-of-words vector.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_corpus]

In [84]:
# Inspect the bag-of-words vectors: one list of (token_id, count) pairs per document.
bow_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

## 5. Model
Now that we have vectorized our corpus we can begin to transform it using models. 

We use model as an abstract term referring to a transformation from one document representation to another. 

In gensim, documents are represented as vectors, so a model can be thought of as a transformation between two vector spaces. 

The model learns the details of this transformation during training, when it reads the training Corpus.

_One simple example of a model is tf-idf_. 

The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus.

In [60]:
from gensim import models

# train the model: fit tf-idf weights on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string: tokenize it, map the tokens to
# dictionary ids with doc2bow, then apply the trained tf-idf weighting
words = "system minors".lower().split()

print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


The tfidf model again returns a list of tuples, where the first entry is the token ID and the second entry is the tf-idf weighting. 

_Note that the ID corresponding to “system” (which occurred 4 times in the original corpus) has been weighted lower than the ID corresponding to “minors” (which only occurred twice)_.

In [61]:
from gensim import similarities

# Build a similarity index over the tf-idf-weighted corpus.
# num_features must equal the vocabulary size, so derive it from the dictionary
# instead of hard-coding 12; the cell then keeps working if the corpus changes.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

Next, we query the similarity of our query document `query_document` against every document in the corpus:

In [85]:
# Tokenize the query, map it to bag-of-words ids, then score it against every
# indexed document. 'engineering' is not in the dictionary, so only 'system'
# contributes to the query vector.
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]

print(list(enumerate(sims)))

[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


Document 3 has a similarity score of 0.718=72%, document 2 has a similarity score of 42% etc. 

We can make this slightly more readable by sorting:

In [86]:
# Rank documents from most to least similar, then print "<doc index> <score>".
ranked = sorted(enumerate(sims), key=operator.itemgetter(1), reverse=True)
for doc_id, similarity in ranked:
    print(doc_id, similarity)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


In [87]:
# Same similarity query as before, this time for the single token 'trees'.
query_document = 'trees'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]

print(list(enumerate(sims)))

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 1.0), (6, 0.70710677), (7, 0.5080429), (8, 0.0)]


In [88]:
# Rank documents from most to least similar, then print "<doc index> <score>".
ranked = sorted(enumerate(sims), key=operator.itemgetter(1), reverse=True)
for doc_id, similarity in ranked:
    print(doc_id, similarity)

5 1.0
6 0.70710677
7 0.5080429
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
8 0.0


In [2]:
# install microsoft C++ build tools 
# as admin
# - pip install spacy
# - python -m spacy download en
# - pip install gensim