In [1]:
from __future__ import division
import string
import pandas as pd

def tokenize(text):
    """Split ``text`` into lower-cased, whitespace-separated tokens.

    Full stops and commas are stripped before splitting; every other
    character (including other punctuation) is left untouched.
    """
    for punctuation_mark in (".", ","):
        text = text.replace(punctuation_mark, "")
    lowered = text.lower()
    return lowered.split()

def jaccard(first_document, second_document):
    """Return the Jaccard similarity of two token collections.

    The similarity is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the
    sets of distinct tokens of each document.

    Args:
        first_document: iterable of hashable tokens.
        second_document: iterable of hashable tokens.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty documents are treated as
        identical and yield ``1.0`` instead of raising ZeroDivisionError.
    """
    # Build each set once instead of once per operation.
    first_tokens = set(first_document)
    second_tokens = set(second_document)
    union = first_tokens | second_tokens
    if not union:
        # Both documents are empty: nothing to divide by, and nothing differs.
        return 1.0
    intersection = first_tokens & second_tokens
    # Explicit float so the result does not depend on `from __future__ import division`.
    return float(len(intersection)) / len(union)

d1 = "I am Sam."
d2 = "Sam I am."
d3 = "I do not like green eggs and ham."
d4 = "I do not like them, Sam I am."

documents_list = [d1, d2, d3, d4]
tokenized_documents = [tokenize(doc) for doc in documents_list] # tokenize all the documents

# Nested dictionary holding the jaccard similarity of every document pair,
# keyed by document index on both levels.
# ie. {0 : {0: ..., 1: ..., 2:..., 3:...}, 1: {0: ..., 1: ..., 2:..., 3:...} and so on}
jaccard_list = {}
for first_doc_index, first_tokens in enumerate(tokenized_documents):
    # One inner dictionary per document; created exactly once here instead of
    # in a separate (and needlessly nested) initialisation loop.
    jaccard_list[first_doc_index] = {}
    for second_doc_index, second_tokens in enumerate(tokenized_documents):
        jaccard_list[first_doc_index][second_doc_index] = jaccard(first_tokens, second_tokens)


# show the result in a table
# Labels are derived from the number of documents (d1, d2, ...) so the table
# keeps working if documents_list grows or shrinks.
document_labels = ['d%d' % (position + 1) for position in range(len(documents_list))]
overall_matrix = pd.DataFrame(jaccard_list)
overall_matrix.columns = document_labels
overall_matrix.index = document_labels
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print("Jaccard Similarity Matrix")
overall_matrix

Jaccard Similarity Matrix


Unnamed: 0,d1,d2,d3,d4
d1,1.0,1.0,0.1,0.428571
d2,1.0,1.0,0.1,0.428571
d3,0.1,0.1,1.0,0.363636
d4,0.428571,0.428571,0.363636,1.0


In [2]:
import math
from numpy import dot
from numpy.linalg import norm

def tf(term, document):
    """Return the raw term frequency: how many times ``term`` occurs in ``document``.

    The count is delegated to ``document.count``, so ``document`` must
    provide that method (the notebook passes lists of tokens).
    """
    occurrences = document.count(term)
    return occurrences

def idf(documents):
    """Compute the inverse document frequency of every token in ``documents``.

    Args:
        documents: list of tokenized documents, each an iterable of
            hashable tokens.

    Returns:
        dict mapping each distinct token to
        ``1 + log(N / df)`` where ``N`` is the number of documents and
        ``df`` is the number of documents containing the token. An empty
        ``documents`` list yields an empty dict.
    """
    # Use the argument, not a module-level variable, so the function works on
    # whatever corpus the caller passes in.
    document_sets = [set(document) for document in documents]
    total_documents = len(document_sets)

    vocabulary = set()
    for token_set in document_sets:
        vocabulary.update(token_set)

    idf_value_list = {}
    # Sorted iteration gives a reproducible insertion order for the result.
    for token in sorted(vocabulary):
        # Number of documents in which the token appears (always >= 1 here,
        # because every vocabulary token came from some document).
        document_frequency = sum(1 for token_set in document_sets if token in token_set)
        # Explicit float so the ratio does not depend on `from __future__ import division`.
        idf_value_list[token] = 1 + math.log(float(total_documents) / document_frequency)
    return idf_value_list


def tfidf(documents):
    """Build one tf-idf vector per raw document string in ``documents``.

    Each document is tokenized with ``tokenize``; the token lists are handed
    to ``idf`` to obtain a term -> idf weight mapping. Every returned vector
    has one entry per term of that mapping (in the mapping's iteration
    order), equal to ``tf(term, tokens) * idf_weight``.

    Returns:
        list of lists of numbers, one inner list per input document.
    """
    token_lists = [tokenize(raw_text) for raw_text in documents]
    idf_by_term = idf(token_lists)
    return [
        [tf(term, tokens) * weight for term, weight in idf_by_term.items()]
        for tokens in token_lists
    ]


def cosine_similarity(first_word_vector, second_word_vector):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        first_word_vector: sequence of numbers (e.g. a tf-idf vector).
        second_word_vector: sequence of numbers of the same length.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a Python float. If either vector has
        zero norm (e.g. the vector of an empty document) the similarity is
        undefined; ``0.0`` is returned instead of ``nan``.
    """
    norm_product = norm(first_word_vector) * norm(second_word_vector)
    if norm_product == 0:
        # Avoid a 0/0 division, which numpy turns into nan plus a RuntimeWarning.
        return 0.0
    return float(dot(first_word_vector, second_word_vector) / norm_product)
   
    
# tfidf calculation of all documents
tfidf_docs = tfidf(documents_list)


# now, same as earlier with jaccard similarity, we are representing all the similarities in a table.

# Nested dictionary holding the cosine similarity of every document pair,
# keyed by document index on both levels.
cosine_similarity_list = {}
for first_doc_index, first_vector in enumerate(tfidf_docs):
    # One inner dictionary per document; created exactly once here instead of
    # in a separate (and needlessly nested) initialisation loop.
    cosine_similarity_list[first_doc_index] = {}
    for second_doc_index, second_vector in enumerate(tfidf_docs):
        cosine_similarity_list[first_doc_index][second_doc_index] = cosine_similarity(first_vector, second_vector)

# show the result in a table
# Labels are derived from the number of documents (d1, d2, ...) so the table
# keeps working if documents_list grows or shrinks.
document_labels = ['d%d' % (position + 1) for position in range(len(documents_list))]
overall_matrix = pd.DataFrame(cosine_similarity_list)
overall_matrix.columns = document_labels
overall_matrix.index = document_labels
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print("Cosine Similarity Matrix")
overall_matrix


Cosine Similarity Matrix


Unnamed: 0,d1,d2,d3,d4
d1,1.0,1.0,0.084591,0.550447
d2,1.0,1.0,0.084591,0.550447
d3,0.084591,0.084591,1.0,0.400733
d4,0.550447,0.550447,0.400733,1.0
