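# Corpora and Vector Space

Tokenize a small toy corpus, remove stop words and words that occur only once, map the remaining tokens to integer ids with a gensim `Dictionary`, and store the resulting bag-of-words vectors in Matrix Market format.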

In [1]:
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from pprint import pprint



In [2]:
# Toy corpus: nine short document titles used throughout the notebook.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# English stop words from NLTK, held in a set so the membership test in the
# filtering step below is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm on a fresh environment.
stop_words = set(stopwords.words('english'))

In [4]:
# Lower-case and tokenize each document, keeping only the tokens that are not
# English stop words. Result: one list of tokens per document.
doc_without_stopwords = [
    [token for token in word_tokenize(text.lower()) if token not in stop_words]
    for text in documents
]

In [5]:
# Display the tokenized documents after stop-word removal.
doc_without_stopwords

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [6]:
# Corpus-wide token counter; defaultdict(int) makes unseen tokens start at 0.
freq = defaultdict(int)

In [7]:
# Count how many times each token occurs across the whole corpus.
# The counter is rebuilt here so that re-running this cell on its own does not
# add to counts left over from a previous execution (keeps the cell idempotent).
freq = defaultdict(int)
for doc in doc_without_stopwords:
    for word in doc:
        freq[word] += 1

In [8]:
# Display the token -> occurrence-count mapping.
freq

defaultdict(int,
            {'abc': 1,
             'applications': 1,
             'binary': 1,
             'computer': 2,
             'engineering': 1,
             'eps': 2,
             'error': 1,
             'generation': 1,
             'graph': 3,
             'human': 2,
             'interface': 2,
             'intersection': 1,
             'iv': 1,
             'lab': 1,
             'machine': 1,
             'management': 1,
             'measurement': 1,
             'minors': 2,
             'opinion': 1,
             'ordering': 1,
             'paths': 1,
             'perceived': 1,
             'quasi': 1,
             'random': 1,
             'relation': 1,
             'response': 2,
             'survey': 2,
             'system': 4,
             'testing': 1,
             'time': 2,
             'trees': 3,
             'unordered': 1,
             'user': 3,
             'well': 1,
             'widths': 1})

In [9]:
# Drop tokens that occur only once in the whole corpus; despite the variable
# name, the tokens kept are the ones seen at least twice.
doc_without_stopwords_lessfrequent = [
    [token for token in tokens if freq[token] >= 2]
    for tokens in doc_without_stopwords
]

In [10]:
# Display the documents after removing tokens that occur only once.
doc_without_stopwords_lessfrequent

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [11]:
# Build the token <-> integer-id mapping from the filtered documents.
dictionary = corpora.Dictionary(doc_without_stopwords_lessfrequent)

In [12]:
# List every (id, token) pair held by the dictionary.
# items() is the standard mapping API; iteritems() is a Python 2-style method name.
for token_id, token in dictionary.items():
    print((token_id, token))

(0, 'human')
(1, 'interface')
(2, 'computer')
(3, 'survey')
(4, 'user')
(5, 'system')
(6, 'response')
(7, 'time')
(8, 'eps')
(9, 'trees')
(10, 'graph')
(11, 'minors')


In [13]:
# Persist the dictionary to disk.
# NOTE(review): absolute, user-specific Windows path — this cell will presumably
# fail on a machine without this directory; consider a configurable base directory.
dictionary.save('C:/Users/Sanjit/Python Practice/temp/corpora_and_vector_space/prac.dict')

In [14]:
# Print the dictionary's short string summary.
print(dictionary)

Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)


In [15]:
# Print the token -> id mapping.
print(dictionary.token2id)

{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [16]:
# Turn a new sentence into a bag-of-words vector using the existing dictionary.
# Tokens absent from the dictionary (here 'interaction') do not appear in the
# result, as the output of the next cell shows.
sent = "Human Computer Interaction"
sent_tokens = word_tokenize(sent.lower())
sent_bow = dictionary.doc2bow(sent_tokens)

In [17]:
# Display the sentence's sparse (token_id, count) vector.
sent_bow

[(0, 1), (2, 1)]

In [18]:
# Convert every filtered document into its sparse (token_id, count) vector.
corpus_doc_to_bow = [
    dictionary.doc2bow(tokens)
    for tokens in doc_without_stopwords_lessfrequent
]

In [19]:
# Pretty-print the bag-of-words corpus.
pprint(corpus_doc_to_bow)

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(1, 1), (4, 1), (5, 1), (8, 1)],
 [(0, 1), (5, 2), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(3, 1), (10, 1), (11, 1)]]


In [20]:
# Write the bag-of-words corpus to disk in Matrix Market (.mm) format.
# NOTE(review): hard-coded, user-specific absolute path; the cells that read
# this file back use the same literal, so any change must be made in all of them.
corpora.MmCorpus.serialize('C:/Users/Sanjit/Python Practice/temp/corpora_and_vector_space/prac.mm',corpus_doc_to_bow)

In [21]:
# Re-open the serialized corpus from disk.
mm = corpora.MmCorpus('C:/Users/Sanjit/Python Practice/temp/corpora_and_vector_space/prac.mm')

In [22]:
# First document as read back from the .mm file (counts come back as floats).
print(mm[0])

[(0, 1.0), (1, 1.0), (2, 1.0)]


In [23]:
# Second document as read back from the .mm file.
print(mm[1])

[(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]


In [24]:
# Third document as read back from the .mm file.
print(mm[2])

[(1, 1.0), (4, 1.0), (5, 1.0), (8, 1.0)]


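# TF-IDF

Re-weight the bag-of-words vectors with TF-IDF, then query a similarity index built from the weighted corpus.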

In [25]:
from gensim import models
from gensim import similarities

In [26]:
# Load the serialized corpus again for the TF-IDF section.
# NOTE(review): reads the same file as `mm` above — the two could presumably
# share one object; the path is also hard-coded and user-specific.
doc_corpus = corpora.MmCorpus('C:/Users/Sanjit/Python Practice/temp/corpora_and_vector_space/prac.mm')

In [27]:
# Print every document vector in the corpus.
# Iterating the corpus directly replaces the manual index loop over num_docs.
for doc_vector in doc_corpus:
    print(doc_vector)

[(0, 1.0), (1, 1.0), (2, 1.0)]
[(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]
[(1, 1.0), (4, 1.0), (5, 1.0), (8, 1.0)]
[(0, 1.0), (5, 2.0), (8, 1.0)]
[(4, 1.0), (6, 1.0), (7, 1.0)]
[(9, 1.0)]
[(9, 1.0), (10, 1.0)]
[(9, 1.0), (10, 1.0), (11, 1.0)]
[(3, 1.0), (10, 1.0), (11, 1.0)]


In [28]:
# Build a TF-IDF model from the bag-of-words corpus.
tfidf_corpus_model = models.TfidfModel(doc_corpus)

In [29]:
# Hand-written bag-of-words vector: token ids 0 and 1, each with count 1.
# NOTE(review): this rebinds sent_bow, which earlier held the doc2bow result
# for "Human Computer Interaction" — a distinct name would avoid confusion.
sent_bow = [(0,1),(1,1)]

In [30]:
# Apply the TF-IDF model to the single vector and print the weighted result.
print(tfidf_corpus_model[sent_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [31]:
# Apply the TF-IDF model to the whole corpus.
tfidf_corpus = tfidf_corpus_model[doc_corpus]

In [32]:
# Show the TF-IDF weighted vector of each document in turn.
for weighted_doc in tfidf_corpus:
    print(weighted_doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.3244870206138555), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.44424552527467476)]
[(1, 0.5710059809418182), (4, 0.4170757362022777), (5, 0.4170757362022777), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(4, 0.45889394536615247), (6, 0.6282580468670046), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(3, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [33]:
# Build a sparse similarity index over the TF-IDF corpus, sized from the
# corpus's own document and term counts.
sim_model = similarities.SparseMatrixSimilarity(
    tfidf_corpus,
    num_docs=doc_corpus.num_docs,
    num_features=doc_corpus.num_terms,
)

In [34]:
# Query as a bag-of-words vector: token ids 0, 1, 2 and 5, each with count 1.
query_bow = [(0,1),(1,1),(2,1),(5,1)]

In [35]:
# Similarity of the TF-IDF weighted query against every indexed document.
# The index is queried with [] rather than get_similarities(), which gensim's
# documentation says should not be called directly.
sim_model[tfidf_corpus_model[query_bow]]

array([ 0.92141873,  0.36241665,  0.46582818,  0.54082316,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ], dtype=float32)

In [36]:
# Similarity index over the TF-IDF corpus, built with the same arguments as
# sim_model above.
index = similarities.SparseMatrixSimilarity(
    tfidf_corpus,
    num_docs=doc_corpus.num_docs,
    num_features=doc_corpus.num_terms,
)

In [37]:
# Query the index with the TF-IDF weighted query vector.
sim = index[tfidf_corpus_model[query_bow]]

In [38]:
# Pair each similarity score with its document position in the corpus.
list(enumerate(sim))

[(0, 0.92141873),
 (1, 0.36241665),
 (2, 0.46582818),
 (3, 0.54082316),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0)]