In [4]:
#importing necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
stopwords_en = stopwords.words("english")
import import_ipynb
import math

In [5]:
#another way to calculate similarities
def similarity_check():
    
    #data preprocessing
    def preprocessing(raw):
        #tokenizing the document
        wordlist = nltk.word_tokenize(raw) 
        #transforming each word to lower case and removing stopwords
        text = [w.lower() for w in wordlist if w not in stopwords_en]
        return text
    
    #preprocessing the first document
    f1 = open('result_text1.txt','r',encoding = "utf8")
    text1 = preprocessing(f1.read())

    #preprocessing the second document
    f2 = open('result_text2.txt','r',encoding = "utf8")
    text2 = preprocessing(f2.read())
    
    #tf-idf vectorizer
    
    word_set = set(text1).union(set(text2))

    #step 1: calculate TF
    freqd_text1 = FreqDist(text1)  
    text1_length = len(text1)
    text1_tf_dict = dict.fromkeys(word_set,0)
    #to store word-count pairs of all the words from common word_set along with their individual count
    for word in text1:
        text1_tf_dict[word] = freqd_text1[word]/text1_length
        
    #calculate term frequency for 2nd text
    freqd_text2 = FreqDist(text2)
    text2_length = len(text2)
    text2_tf_dict = dict.fromkeys(word_set,0)
    #to store word-count pairs of all the words from common word_set along with their individual count
    for word in text2:
        text2_tf_dict[word] = freqd_text2[word]/text2_length

    #step 2: calculating IDF 
    text12_idf_dict = dict.fromkeys(word_set,0)
    text12_length = 2
    for word in text12_idf_dict.keys():
        if word in text1:
            text12_idf_dict[word]+=1
        if word in text2:
            text12_idf_dict[word]+=1

    
    for word,val in text12_idf_dict.items():
        text12_idf_dict[word] = 1+math.log(text12_length/(float(val)))
    
    #calculating TF-IDF
    text1_tfidf_dict = dict.fromkeys(word_set,0)
    for word in text1:
        text1_tfidf_dict[word] =(text1_tf_dict[word])*(text12_idf_dict[word])

    text2_tfidf_dict = dict.fromkeys(word_set,0)
    for word in text2:
        text2_tfidf_dict[word] = (text2_tf_dict[word])*(text12_idf_dict[word])

    #finding the similarity- using cosine similarity
    v1 = list(text1_tfidf_dict.values())
    v2 = list(text2_tfidf_dict.values())
    similarity1 = 1-nltk.cluster.cosine_distance(v1,v2)
    print("similarity index :{:4.2f} %".format(similarity1*100))
    

    #Similarity computation using doc2vec 
    from gensim.models.doc2vec import Doc2Vec,TaggedDocument
    taggeddocs = []
    doc1 = TaggedDocument(words = text1,tags = [u'file1'])
    taggeddocs.append(doc1)
    doc2 = TaggedDocument(words = text2,tags = [u'file2'])
    taggeddocs.append(doc2)

    #build the model
    model = Doc2Vec(taggeddocs,dm=0,alpha=0.025,size=20,min_aclpha = 0.025,min_count=0)


    #training
    for epoch in range(80):
        if epoch%20 ==0:
            print('Now training epoch % epoch')
        model.train(taggeddocs,total_examples=model.corpus_count,epochs=model.iter)
        model.alpha -=0.002
        model.min_alpha = model.alpha

    similarity = model.n_similarity(text1,text2)
    print(similarity)
    return similarity

In [6]:
print(similarity_check())

similarity index :80.72 %
Now training epoch % epoch
Now training epoch % epoch




Now training epoch % epoch
Now training epoch % epoch
0.8213873
0.8213873




word_set = set(text1).union(set(text2))

freqd_text1 = FreqDist(text1)

text1_count_dict = dict.fromkeys(word_set,0)
for word in text1:
    text1_count_dict[word] = freqd_text1[word]
    

freqd_text2 = FreqDist(text2)
text2_count_dict = dict.fromkeys(word_set,0)
for word in text2:
    text2_count_dict[word] = freqd_text2[word]

freqd_text1 = FreqDist(text1)
text1_length = len(text1)
text1_tf_dict = dict.fromkeys(word_set,0)
for word in text1:
    text1_tf_dict[word] = freqd_text1[word]/text1_length
    
freqd_text2 = FreqDist(text2)
text2_length = len(text2)
text2_tf_dict = dict.fromkeys(word_set,0)
for word in text2:
    text2_tf_dict[word] = freqd_text2[word]/text2_length

text12_idf_dict = dict.fromkeys(word_set,0)
text12_length = 2
for word in text12_idf_dict.keys():
    if word in text1:
        text12_idf_dict[word]+=1
    if word in text2:
        text12_idf_dict[word]+=1

import math
for word,val in text12_idf_dict.items():
    text12_idf_dict[word] = 1+math.log(text12_length/(float(val)))


text1_tfidf_dict = dict.fromkeys(word_set,0)
for word in text1:
    text1_tfidf_dict[word] =(text1_tf_dict[word])*(text12_idf_dict[word])
    
text2_tfidf_dict = dict.fromkeys(word_set,0)
for word in text2:
    text2_tfidf_dict[word] = (text2_tf_dict[word])*(text12_idf_dict[word])

v1 = list(text1_tfidf_dict.values())
v2 = list(text2_tfidf_dict.values())
similarity = 1-nltk.cluster.cosine_distance(v1,v2)
print("similarity index :{:4.2f} %".format(similarity*100))

from gensim.models.doc2vec import Doc2Vec,TaggedDocument
taggeddocs = []
doc1 = TaggedDocument(words = text1,tags = [u'file1'])
taggeddocs.append(doc1)
doc2 = TaggedDocument(words = text2,tags = [u'file2'])
taggeddocs.append(doc2)

#build the model
model = Doc2Vec(taggeddocs,dm=0,alpha=0.025,size=20,min_alpha = 0.025,min_count=0)


#training
for epoch in range(80):
    if epoch%20 ==0:
        print('Now training epoch % epoch')
    model.train(taggeddocs,total_examples=model.corpus_count,epochs=model.iter)
    model.alpha -=0.002
    model.min_alpha = model.alpha

similarity = model.n_similarity(text1,text2)
print("similarity index : {:4.2f} %".format(similarity*100))