In [118]:
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sparse_dot_topn import awesome_cossim_topn



In [119]:
documents = [
"The sky is blue",
"The sun is bright",
"The sun in the sky is bright",
"We can see the shining sun, the bright sun"]

In [129]:
test_text = "The sun in the sky is bright"

In [130]:
count_vectorizer = CountVectorizer()

In [131]:
vocabulary = count_vectorizer.fit(documents + [test_text]).vocabulary_
print(vocabulary)

{'the': 9, 'sky': 7, 'is': 4, 'blue': 0, 'sun': 8, 'bright': 1, 'in': 3, 'we': 10, 'can': 2, 'see': 5, 'shining': 6}


In [132]:
# Convert a collection of raw documents to a matrix of TF-IDF features.
tfidf_vectorizer = TfidfVectorizer(vocabulary=vocabulary)
# Learn vocabulary and idf, return term-document matrix.
test_tfidf_vector = tfidf_vectorizer.fit_transform([test_text])

In [133]:
# this needs to be transposed before multiplying to achieve Cosine Similarity
tfidf_vector = tfidf_vectorizer.fit_transform(documents).transpose()

In [136]:
results = awesome_cossim_topn(test_tfidf_vector, tfidf_vector, 2, 0)


In [137]:
print(test_text)
print('-------------------')

for index, i in enumerate(results.indices):
    print('{}: {}'.format(documents[i], results.data[index]))

The sun in the sky is bright
-------------------
The sun in the sky is bright: 0.9732800618172203
The sun is bright: 0.8066806285173703


######################  Testing      #############################

In [186]:
documents = [
"The sky is blue",
"The sun is bright",
"The sun in the sky is bright",
"We can see the shining sun, the bright sun"]


In [187]:
count_vectorizer = CountVectorizer(stop_words="english")
vocabulary = count_vectorizer.fit(documents).vocabulary_
model = TfidfVectorizer(vocabulary=vocabulary)
tfidf_vector = model.fit_transform(documents).transpose()

In [188]:
test_text = documents[0]
# Learn vocabulary and idf, return term-document matrix.
test_tfidf_vector = model.fit_transform([test_text])
results = awesome_cossim_topn(test_tfidf_vector, tfidf_vendor, 2, 0)
print(test_text)
print("*************************")
for index, i in enumerate(results.indices):
    print('{}: {}'.format(documents[i], results.data[index]))

The sky is blue
*************************
The sky is blue: 0.9930738960272274
The sun in the sky is bright: 0.46515556829761573


##################### Try out #########################

In [191]:
documents = [
"The sky is blue",
"The sun is bright",
"The sun in the sky is bright",
"We can see the shining sun, the bright sun"]

In [192]:
count_vectorizer = CountVectorizer(stop_words="english")
vocabulary = count_vectorizer.fit(documents).vocabulary_
model = TfidfVectorizer(vocabulary=vocabulary)
tfidf_vector = model.fit_transform(documents).transpose()

In [222]:
from decimal import Decimal
expected_percentage = round(Decimal(0.76),2)
for i, test_text in  enumerate(documents):
    # Learn vocabulary and idf, return term-document matrix.
    test_tfidf_vector = model.fit_transform([test_text])
    results = awesome_cossim_topn(test_tfidf_vector, tfidf_vendor, 2, 0)
    
#     print(results.indices)
    for index, j in enumerate(results.indices):
        if j != i:
            score = round(Decimal(results.data[index]), 2)
           
            if score >= expected_percentage:
                print(test_text, end='\n')
                print("*************************")
                print('{}: {}'.format(documents[j], score))

The sun is bright
*************************
We can see the shining sun, the bright sun: 0.78
The sun in the sky is bright
*************************
The sun is bright: 0.82
We can see the shining sun, the bright sun
*************************
The sun is bright: 0.87
