In [1]:
import nltk
from nltk.corpus import stopwords
import pandas as pd

In [2]:
# Toy corpus: three short texts that share a handful of terms
# (Trump, Putin, president, election, political).
# Each literal is split across lines with implicit string concatenation;
# the resulting values are single-line strings with no added characters.
doc_trump = (
    "Mr. Trump became president after winning the political election. "
    "Though he lost the support of some republican friends, "
    "Trump is friends with President Putin"
)

doc_election = (
    "President Trump says Putin had no political interference is the election outcome. "
    "He says it was a witchhunt by political parties. "
    "He claimed President Putin is a friend who had nothing to do with the election"
)

doc_putin = (
    "Post elections, Vladimir Putin became President of Russia. "
    "President Putin had served as the Prime Minister earlier in his political career"
)

# Order matters: row i of every matrix built from this list refers to documents[i].
documents = [doc_trump, doc_election, doc_putin]

# Feature extraction using TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus, filtering scikit-learn's built-in English
# stop-word list, and keep the resulting (documents x terms) sparse matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tfvector = vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed result a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['career', 'claimed', 'earlier', 'election', 'elections', 'friend', 'friends', 'interference', 'lost', 'minister', 'mr', 'outcome', 'parties', 'political', 'post', 'president', 'prime', 'putin', 'republican', 'russia', 'says', 'served', 'support', 'trump', 'vladimir', 'winning', 'witchhunt']


In [6]:
# Shape of the TF-IDF matrix: (number of documents, number of vocabulary terms)
n_docs, n_terms = tfvector.shape
print((n_docs, n_terms))

(3, 27)


In [7]:
# Densify the sparse TF-IDF matrix so every weight (zeros included) is displayed;
# rows follow the order of `documents`, columns follow the vectorizer's vocabulary.
tfvector.toarray()

array([[0.        , 0.        , 0.        , 0.20336832, 0.        ,
        0.        , 0.5348098 , 0.        , 0.2674049 , 0.        ,
        0.2674049 , 0.        , 0.        , 0.15793364, 0.        ,
        0.31586728, 0.        , 0.15793364, 0.2674049 , 0.        ,
        0.        , 0.        , 0.2674049 , 0.40673664, 0.        ,
        0.2674049 , 0.        ],
       [0.        , 0.24198165, 0.        , 0.36806657, 0.        ,
        0.24198165, 0.        , 0.24198165, 0.        , 0.        ,
        0.        , 0.24198165, 0.24198165, 0.28583652, 0.        ,
        0.28583652, 0.        , 0.28583652, 0.        , 0.        ,
        0.4839633 , 0.        , 0.        , 0.18403328, 0.        ,
        0.        , 0.24198165],
       [0.28701233, 0.        , 0.28701233, 0.        , 0.28701233,
        0.        , 0.        , 0.        , 0.        , 0.28701233,
        0.        , 0.        , 0.        , 0.16951411, 0.28701233,
        0.33902821, 0.28701233, 0.33902821, 0.    

In [9]:
# Wrap the dense TF-IDF weights in a labelled DataFrame: one row per document,
# one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
tfdf = pd.DataFrame(
    tfvector.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=['doc_trump', 'doc_election', 'doc_putin'],
)
tfdf

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
doc_trump,0.0,0.0,0.0,0.203368,0.0,0.0,0.53481,0.0,0.267405,0.0,...,0.157934,0.267405,0.0,0.0,0.0,0.267405,0.406737,0.0,0.267405,0.0
doc_election,0.0,0.241982,0.0,0.368067,0.0,0.241982,0.0,0.241982,0.0,0.0,...,0.285837,0.0,0.0,0.483963,0.0,0.0,0.184033,0.0,0.0,0.241982
doc_putin,0.287012,0.0,0.287012,0.0,0.287012,0.0,0.0,0.0,0.0,0.287012,...,0.339028,0.0,0.287012,0.0,0.287012,0.0,0.0,0.287012,0.0,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows: entry [i, j] compares
# document i with document j, so the diagonal is each document against itself.
tfidf_similarity = cosine_similarity(tfdf, tfdf)
print(tfidf_similarity)

[[1.         0.33027897 0.18740386]
 [0.33027897 1.         0.24226661]
 [0.18740386 0.24226661 1.        ]]


# Feature extraction using raw term counts

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the Document Term Matrix of raw term counts.
# No stop_words argument is passed, so common words are kept in the
# vocabulary (the recorded output below includes columns such as 'the' and
# 'by'). The TF-IDF section filters English stop words, so the two similarity
# matrices are computed over different vocabularies.
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
# toarray() yields a plain ndarray (as used for the TF-IDF matrix) rather than
# the np.matrix returned by todense().
doc_term_matrix = sparse_matrix.toarray()

# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
df = pd.DataFrame(
    doc_term_matrix,
    columns=count_vectorizer.get_feature_names_out(),
    index=['doc_trump', 'doc_election', 'doc_putin'],
)
df

Unnamed: 0,after,as,became,by,career,claimed,do,earlier,election,elections,...,the,though,to,trump,vladimir,was,who,winning,witchhunt,with
doc_trump,1,0,1,0,0,0,0,0,1,0,...,2,1,0,2,0,0,0,1,0,1
doc_election,0,0,0,1,0,1,1,0,2,0,...,2,0,1,1,0,1,1,0,1,1
doc_putin,0,1,1,0,1,0,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the raw term-count rows: entry [i, j]
# compares document i with document j.
count_similarity = cosine_similarity(df, df)
print(count_similarity)

[[1.         0.51480485 0.38890873]
 [0.51480485 1.         0.38829014]
 [0.38890873 0.38829014 1.        ]]
