# Gensim = "Generate Similar"

In [1]:
# Toy corpus: five short document titles used by the cells below.
corpus = [
    "A survey of user opinion of computer system response time.",
    "Relation of user perceived response time to error measurement.",
    "The generation of random binary unordered trees.",
    "The intersection graph of paths in trees.",
    "Graph minors IV Widths of trees and well quasi ordering.",
]

In [2]:
import nltk
from nltk.corpus import stopwords

# The stop-word lists are distributed separately from the nltk package.
# On a fresh environment the first access raises LookupError, so fetch the
# resource once and retry; this keeps "Restart & Run All" working.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [3]:
from nltk.tokenize import word_tokenize

# Lowercase and tokenize every document, then keep only the alphanumeric
# tokens that are not English stop words.
processed_corpus = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    for text in corpus
]

processed_corpus

[['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering']]

In [4]:
from gensim import corpora

# Build the vocabulary from the tokenized documents.
dictionary = corpora.Dictionary(documents=processed_corpus)

# print() shows the short summary string (token count and a few sample tokens).
print(dictionary)

Dictionary(25 unique tokens: ['computer', 'opinion', 'response', 'survey', 'system']...)


In [5]:
# Show the token -> integer id mapping held by the dictionary.
dictionary.token2id

{'computer': 0,
 'opinion': 1,
 'response': 2,
 'survey': 3,
 'system': 4,
 'time': 5,
 'user': 6,
 'error': 7,
 'measurement': 8,
 'perceived': 9,
 'relation': 10,
 'binary': 11,
 'generation': 12,
 'random': 13,
 'trees': 14,
 'unordered': 15,
 'graph': 16,
 'intersection': 17,
 'paths': 18,
 'iv': 19,
 'minors': 20,
 'ordering': 21,
 'quasi': 22,
 'well': 23,
 'widths': 24}

In [6]:
# Bag of words: convert each tokenized document into a list of
# (token_id, count) pairs using the dictionary built above.
bow = list(map(dictionary.doc2bow, processed_corpus))
bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)],
 [(14, 1), (16, 1), (17, 1), (18, 1)],
 [(14, 1), (16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]

In [7]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow)

# Weight a small query: tokenize it, map it to (token_id, count) pairs,
# then apply the fitted model to get (token_id, weight) pairs.
words = "trees graph".lower().split()
query_bow = dictionary.doc2bow(words)
tfidf[query_bow]

[(14, 0.4869354917707381), (16, 0.8734379353188121)]

In [8]:
from gensim.utils import simple_preprocess

# Second toy example: three short sentences with repeated words.
doc_list = [
    "Hello, how are you?",
    "How do you do?",
    "Hey what are you doing? yes you What are you doing?",
]

# simple_preprocess turns each sentence into a list of lowercase tokens
# with the punctuation removed.
doc_tokenized = list(map(simple_preprocess, doc_list))
doc_tokenized

[['hello', 'how', 'are', 'you'],
 ['how', 'do', 'you', 'do'],
 ['hey',
  'what',
  'are',
  'you',
  'doing',
  'yes',
  'you',
  'what',
  'are',
  'you',
  'doing']]

In [9]:
# Start from an empty dictionary and let doc2bow grow it on the fly:
# with allow_update=True each call registers any tokens it has not seen yet,
# so the vocabulary and the bag-of-words vectors are built in a single pass.
doc_dictionary = corpora.Dictionary()
doc_bow = [
    doc_dictionary.doc2bow(tokens, allow_update=True)
    for tokens in doc_tokenized
]
doc_bow

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1), (3, 1), (4, 2)],
 [(0, 2), (3, 3), (5, 2), (6, 1), (7, 2), (8, 1)]]

In [10]:
# Translate every (token_id, count) pair back to (token, count) so the
# bag-of-words vectors are human-readable.
id_words = [
    [(doc_dictionary[token_id], count) for token_id, count in doc_counts]
    for doc_counts in doc_bow
]
print(id_words)

[[('are', 1), ('hello', 1), ('how', 1), ('you', 1)], [('how', 1), ('you', 1), ('do', 2)], [('are', 2), ('you', 3), ('doing', 2), ('hey', 1), ('what', 2), ('yes', 1)]]
