In [1]:
import math
import nltk 
from  nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

stopwords_en = stopwords.words('english')
corpus = []
word_set = set()

def preprocessing(raw):
    wordlist = nltk.word_tokenize(raw)
    text = [w.lower() for w in wordlist if w not in stopwords_en]
    return text

In [2]:
# TF-IDF calculations
def custom_tfidf(texts):
    # TF calculations
    text_tf_dict = {}
    for idx, text in enumerate(texts):
        freqd_text = FreqDist(text)
        text_length = len(text)
        text_tf_dict[idx] = dict.fromkeys(word_set,0)
        for word in text:
            text_tf_dict[idx][word] = freqd_text[word]/text_length

    # IDF calculations
    text_idf_dict = dict.fromkeys(word_set,0)
    for word in text_idf_dict.keys():
        for text in texts:
            if word in text:
                text_idf_dict[word] += 1
        
    text_length = 3 # UPDATE: 2 documents for now
    for word, val in text_idf_dict.items():
        text_idf_dict[word] = 1 + math.log(text_length/(float(val)))
    
    # TF-IDF calculations = (TF of the word) * (IDF of the word)
    text_tfidf_dict = {}
    for idx, text in enumerate(texts):
        text_tfidf_dict[idx] = dict.fromkeys(word_set,0)
        for word in text:
            text_tfidf_dict[idx][word] = (text_tf_dict[idx][word])*(text_idf_dict[word])
    return text_tfidf_dict

In [9]:
# Doc2Vec
def custom_Doc2Vec(taggeddocs):
    # Building the model
    model = Doc2Vec(taggeddocs, dm=0, alpha=0.025, size=20, min_alpha=0.025, min_count=0)

    # Training
    total_epochs = 80
    total_docs = len(taggeddocs)
    for epoch in range(total_epochs):
        if epoch % 20 == 0:
            print('Now training epoch %s' % epoch)
        model.train(taggeddocs, total_examples=total_epochs, epochs=total_epochs)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
    
    return model

In [45]:
f1 = open('./input/buscrash_1.txt', 'r', encoding='utf8')
text1 = preprocessing(f1.read())
corpus.append(text1)

f2 = open('./input/buscrash_2.txt', 'r', encoding='utf8')
text2 = preprocessing(f2.read())
corpus.append(text2)

f3 = open('./input/buscrash_3.txt', 'r', encoding='utf8')
text3 = preprocessing(f3.read())
corpus.append(text3)

for text in corpus:
    word_set = word_set.union(set(text))
    
text_tfidf_dict = custom_tfidf(corpus)
v1 = list(text_tfidf_dict[0].values())
v2 = list(text_tfidf_dict[1].values())
v3 = list(text_tfidf_dict[2].values())

taggeddocs = []
doc1 = TaggedDocument(words=text1, tags=[u'NEWS_1'])
doc2 = TaggedDocument(words=text2, tags=[u'NEWS_2'])
doc3 = TaggedDocument(words=text3, tags=[u'NEWS_3'])
taggeddocs.append(doc1)
taggeddocs.append(doc2)
taggeddocs.append(doc3)
model = custom_Doc2Vec(taggeddocs)

# Comput Cosine distances
similarity_tfidf = 1 - nltk.cluster.cosine_distance(v1, v2)
print('Similarity Index: {:4.2f} %'.format(similarity_tfidf*100))
similarity_vec = model.n_similarity(text1, text2)
print('Similarity Index: {:4.2f} %'.format(similarity_vec*100))

Now training epoch 0
Now training epoch 20
Now training epoch 40
Now training epoch 60
Similarity Index: 95.96 %
Similarity Index: 79.81 %


In [52]:
similarity_vec = model.n_similarity(text1, text3)
print('Similarity Index: {:4.2f} %'.format(similarity_vec*100))

Similarity Index: 47.88 %
