In [82]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def preprocess(string):
    _EXTRA_SPACE_REGEX = re.compile(r'\s+', re.IGNORECASE)
    _SPECIAL_CHAR_REGEX = re.compile(
        r"(?P<p>(\.+)|(\?+)|(!+)|(:+)|(;+)|(-+)|"
        r"(\(+)|(\)+)|(\}+)|(\{+)|('+)|(-+)|(\[+)|(\]+)|"
        r"(?<!\d)(,+)(?!=\d)|(\$+))")
    string = _EXTRA_SPACE_REGEX.sub(" ",string)
    string = _SPECIAL_CHAR_REGEX.sub(" ",string)
    string = _EXTRA_SPACE_REGEX.sub(" ",string)
    return string.lower()

In [22]:
text = "hi :how   ar-e y$ou!! @@ ? . : jaksfd ; \ { } ( ) [ ] / ,"
print(preprocess(text))

hi how ar e y ou @@ jaksfd \ / 


In [23]:
text = ["In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency",
       "It is often used as a weighting factor in searches of information retrieval, mining, modeling.",
       "Variations of the tf–idf weighting scheme are often used by search engines as a central tool",
       "in scoring and be successfully used for stop-words filtering ranking a document's relevance"]

In [24]:
for index,string in enumerate(text):
    text[index] = preprocess(string)

In [25]:
print(text)

['in information retrieval tf–idf or tfidf short for term frequency–inverse document frequency', 'it is often used as a weighting factor in searches of information retrieval mining modeling ', 'variations of the tf–idf weighting scheme are often used by search engines as a central tool', 'in scoring and be successfully used for stop words filtering ranking a document s relevance']


In [39]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(text)

In [70]:
features = vectorizer.get_feature_names()
print(features)

['and', 'are', 'as', 'be', 'by', 'central', 'document', 'engines', 'factor', 'filtering', 'for', 'frequency', 'idf', 'in', 'information', 'inverse', 'is', 'it', 'mining', 'modeling', 'of', 'often', 'or', 'ranking', 'relevance', 'retrieval', 'scheme', 'scoring', 'search', 'searches', 'short', 'stop', 'successfully', 'term', 'tf', 'tfidf', 'the', 'tool', 'used', 'variations', 'weighting', 'words']


In [66]:
tfidf_matrix.shape

(4, 42)

In [40]:
doc = 0
feature_names = vectorizer.get_feature_names()
feature_index = tfidf_matrix[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
  print (w, s)

in 0.176103702353
information 0.217523107299
retrieval 0.217523107299
tf 0.217523107299
idf 0.217523107299
or 0.275900479796
tfidf 0.275900479796
short 0.275900479796
for 0.217523107299
term 0.275900479796
frequency 0.551800959592
inverse 0.275900479796
document 0.217523107299


In [79]:
rows = tfidf_matrix.shape[0]
cols = tfidf_matrix.shape[1]
words = {}
for row in range(rows):
    for col in range(cols):
        if tfidf_matrix[row,col] > 0:
            if features[col] not in words:
                words[features[col]] = {}
            words[features[col]][row] = tfidf_matrix[row,col]
                

In [87]:
def search(query):
    query_list = preprocess(query).split(" ")
    result_file_dict = {}
    for q in query_list:
        d = words.get(q,0) 
        if d!=0:
            for file_index in d.keys():
                result_file_dict[file_index] = result_file_dict.get(file_index,0)
                result_file_dict[file_index]+=d[file_index]
    return result_file_dict


In [88]:
search("is often used")

{1: 0.74731787528171112, 2: 0.39362680965121388, 3: 0.19194536047103244}

In [89]:
print(features["is"],features["often",features["used"]])

TypeError: list indices must be integers or slices, not str