# Document recogntion and text based search using tfidf vectors

# 1. Dataset formulation

In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
docs=["the house had a tiny little mouse", 
"the cat saw the mouse", 
"the mouse ran away from the house", 
"the cat finally ate the mouse", 
"the end of the mouse story"
]

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)
type(tfidf_vectorizer_vectors)
scipy.sparse.csr.csr_matrix.toarray(tfidf_vectorizer_vectors)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.49356209, 0.39820278, 0.49356209, 0.23518498,
        0.        , 0.        , 0.        , 0.        , 0.23518498,
        0.49356209],
       [0.        , 0.        , 0.48334378, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28547062,
        0.        , 0.        , 0.59909216, 0.        , 0.57094124,
        0.        ],
       [0.        , 0.45709287, 0.        , 0.        , 0.        ,
        0.45709287, 0.        , 0.36877965, 0.        , 0.2178072 ,
        0.        , 0.45709287, 0.        , 0.        , 0.43561441,
        0.        ],
       [0.51392301, 0.        , 0.41462985, 0.        , 0.51392301,
        0.        , 0.        , 0.        , 0.        , 0.24488707,
        0.        , 0.        , 0.        , 0.        , 0.48977413,
        0.        ],
       [0.        , 0.        , 0.        , 0.49175319, 0.        ,
        0.        , 0.        , 

In [30]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), 
                  index=tfidf_vectorizer.get_feature_names(), 
                  columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


##### Notice that the words ‘mouse’ and ‘the’ have the lowest IDF values. This is expected as these words appear in each and every document in our collection. The lower the IDF value of a word, the less unique it is to any particular document.

In [19]:
words_to_find = ['away', 'house', 'mouse']

In [20]:
vectorizer_tf_idf = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, 
                                    use_idf=True, norm=None, vocabulary=words_to_find)

In [27]:
type(vectorizer_tf_idf)

sklearn.feature_extraction.text.TfidfVectorizer

In [23]:
vectorizer_cnt = CountVectorizer(stop_words=None, vocabulary=words_to_find, ngram_range=(1,2))
vectorizer_cnt

CountVectorizer(ngram_range=(1, 2), vocabulary=['away', 'house', 'mouse'])

In [24]:
x = vectorizer_cnt.fit_transform(docs)

In [25]:
print(vectorizer_cnt.get_feature_names())
print(x.toarray())

['away', 'house', 'mouse']
[[0 1 1]
 [0 0 1]
 [1 1 1]
 [0 0 1]
 [0 0 1]]


In [41]:
tfidf_vectorizer_vectors_1=vectorizer_tf_idf.fit_transform(words_to_find)
type(tfidf_vectorizer_vectors_1)
Xf = scipy.sparse.csr.csr_matrix.toarray(tfidf_vectorizer_vectors_1)
Xf

array([[1.69314718, 0.        , 0.        ],
       [0.        , 1.69314718, 0.        ],
       [0.        , 0.        , 1.69314718]])

In [37]:
tfidf_vectorizer_vectors.shape

(5, 16)

In [43]:
Xd = scipy.sparse.csr.csr_matrix.toarray(tfidf_vectorizer_vectors)
Xd

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.49356209, 0.39820278, 0.49356209, 0.23518498,
        0.        , 0.        , 0.        , 0.        , 0.23518498,
        0.49356209],
       [0.        , 0.        , 0.48334378, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28547062,
        0.        , 0.        , 0.59909216, 0.        , 0.57094124,
        0.        ],
       [0.        , 0.45709287, 0.        , 0.        , 0.        ,
        0.45709287, 0.        , 0.36877965, 0.        , 0.2178072 ,
        0.        , 0.45709287, 0.        , 0.        , 0.43561441,
        0.        ],
       [0.51392301, 0.        , 0.41462985, 0.        , 0.51392301,
        0.        , 0.        , 0.        , 0.        , 0.24488707,
        0.        , 0.        , 0.        , 0.        , 0.48977413,
        0.        ],
       [0.        , 0.        , 0.        , 0.49175319, 0.        ,
        0.        , 0.        , 

In [55]:
import numpy as np
single_point = Xf[0,0]
points = Xd

dist = (points - single_point)**2
dist = np.sum(dist, axis=1)
dist = np.sqrt(dist)
dist

array([6.23800253, 6.34842096, 6.22598832, 6.284547  , 6.28425291])

In [47]:
Xf[0,0]

1.6931471805599454