In [1]:
import gensim
from gensim import models
import numpy as np

# Toy corpus: three short documents (the last one keeps its trailing space).
documents = [
    "This is the first line",
    "This is the second sentence",
    "This third document third ",
]

In [2]:
# Split every document into word tokens with NLTK, lower-casing each token.
from nltk.tokenize import word_tokenize

gen_docs = [list(map(str.lower, word_tokenize(text))) for text in documents]
print(gen_docs)

[['this', 'is', 'the', 'first', 'line'], ['this', 'is', 'the', 'second', 'sentence'], ['this', 'third', 'document', 'third']]


In [3]:
# Build a gensim Dictionary from the tokenised documents, then display its
# token -> integer id mapping.
dictionary = gensim.corpora.Dictionary(gen_docs)
dictionary.token2id

# The dictionary only assigns an integer id to each unique token; the bag-of-words
# counts themselves are produced later with dictionary.doc2bow().

{'first': 0,
 'is': 1,
 'line': 2,
 'the': 3,
 'this': 4,
 'second': 5,
 'sentence': 6,
 'document': 7,
 'third': 8}

In [4]:
# Extended slice: start at index 0, stop before index 3, step 2 -> items 0 and 2.
documents[0:3:2]

['This is the first line', 'This third document third ']

In [5]:
# Convert each tokenised document into its bag-of-words form:
# a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)
print(gen_docs)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(4, 1), (7, 1), (8, 2)]]
[['this', 'is', 'the', 'first', 'line'], ['this', 'is', 'the', 'second', 'sentence'], ['this', 'third', 'document', 'third']]


In [6]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

# Show the TF-IDF weights: one [token, weight] pair per term, rounded to 2 decimals.
mydict = dictionary
for doc in tfidf[corpus]:
    weighted_terms = [
        [mydict[token_id], np.around(weight, decimals=2)]
        for token_id, weight in doc
    ]
    print(weighted_terms)

[['first', 0.66], ['is', 0.24], ['line', 0.66], ['the', 0.24]]
[['is', 0.24], ['the', 0.24], ['second', 0.66], ['sentence', 0.66]]
[['document', 0.45], ['third', 0.89]]


The words 'is' and 'the' occur in two of the three documents, so they are weighted down. The word 'this' appears in all three documents, so its weight drops to zero and it is removed altogether.
In simple terms, words that occur in more documents receive lower weights.

In [7]:
# Define the documents: four short snippets whose similarity is compared below.
# NOTE: `documents` is rebound here; it previously held the three-line toy corpus.
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support"
doc_election = "President Trump says Putin had no political interference is the election outcome. "
doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served "
doc_leo = "Post elections, Leo became President of Russia.President Leo had served"

documents = [
    doc_trump,
    doc_election,
    doc_putin,
    doc_leo,
]

In [8]:
# Show the raw text of the Putin document.
print(doc_putin)

Post elections, Vladimir Putin became President of Russia. President Putin had served 


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Build the document-term count matrix over the full vocabulary.
# The earlier `CountVectorizer(stop_words='english')` assignment was overwritten
# on the very next line and never took effect, so it has been removed.
# To drop English stop words, pass stop_words='english' here instead.
count_vectorizer = CountVectorizer()

# Learn the vocabulary from `documents` and count term occurrences per document;
# the result is a SciPy sparse matrix (rows = documents, columns = terms).
sparse_matrix = count_vectorizer.fit_transform(documents)
sparse_matrix

<4x27 sparse matrix of type '<class 'numpy.int64'>'
	with 44 stored elements in Compressed Sparse Row format>

In [10]:
# Expand the sparse counts into a dense matrix so they can be shown as a table.
doc_term_matrix = sparse_matrix.todense()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# One row per document (labels follow the order of `documents`),
# one column per vocabulary term.
df = pd.DataFrame(doc_term_matrix,
                  columns=feature_names,
                  index=['doc_trump', 'doc_election', 'doc_putin', 'doc_leo'])
df

Unnamed: 0,after,became,election,elections,had,he,interference,is,leo,lost,...,putin,russia,says,served,support,the,though,trump,vladimir,winning
doc_trump,1,1,1,0,0,1,0,0,0,1,...,0,0,0,0,1,2,1,1,0,1
doc_election,0,0,1,0,1,0,1,1,0,0,...,1,0,1,0,0,1,0,1,0,0
doc_putin,0,1,0,1,1,0,0,0,0,0,...,2,1,0,1,0,0,0,0,1,0
doc_leo,0,1,0,1,1,0,0,0,2,0,...,0,1,0,1,0,0,0,0,0,0


In [11]:
# Compute the pairwise cosine similarity between the rows (documents) of df.
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(df, df)
print(similarity_matrix)

[[1.         0.4330127  0.1875     0.19364917]
 [0.4330127  1.         0.36084392 0.2236068 ]
 [0.1875     0.36084392 1.         0.71004695]
 [0.19364917 0.2236068  0.71004695 1.        ]]


In [12]:
import nltk
# Download the NLTK 'stopwords' package; the call's return value is shown as
# the cell result.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leotu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list and display it.
# The entries shown in the output are all lowercase.
stop = stopwords.words('english')
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
print(doc_trump)

# The NLTK stop-word list is lowercase, so compare case-insensitively; otherwise
# capitalised stop words (e.g. a sentence-initial "The") would be kept.
# A set makes each membership check O(1) instead of scanning the list.
stop_words = set(stop)
kept_words = [x for x in doc_trump.split() if x.lower() not in stop_words]

# Preserve the original layout of `j`: one leading space, then every kept word
# followed by a single space (an empty input still yields " ").
j = " " + "".join(word + " " for word in kept_words)
print('new output is:',j)

Mr. Trump became president after winning the political election. Though he lost the support
new output is:  Mr. Trump became president winning political election. Though lost support 
