In [56]:
import nltk
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import snowball
from natsort import natsorted
import os

In [57]:
doc_list = []
file_names = natsorted(os.listdir("./files"))
for file in file_names:
    with open(f"./files/{file}","r") as f:
        text = f.read()
        doc_list.append(text)
    f.close()

doc_list

['This is the first word.\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.',
 'This is the first word.\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.',
 'This is the first word.\nmy name is shayanTH\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.\n\n',
 'This is the first word.\nmy name is shayanTH\nThis is the second text, Hello! How are you?\nThis is the third, this working is it now.']

In [63]:
docs_words = []

# Tokenize a paragraph into sentences and each sentence in to
# words
for c in doc_list:
    for sent in sent_tokenize(c):
        word_tokens = word_tokenize(sent)
        docs_words += word_tokens


In [64]:
word_list = []
for word in text:
    for sent in sent_tokenize(word):
        word_tokens = word_tokenize(sent)
        word_list += word_tokens

print(len(word_list))


105


In [67]:
lower_docs_words = set([ x.lower() for x in docs_words ])
len(lower_docs_words)

22

In [68]:
stwords = set(stopwords.words('english'))

# Using set difference to eliminate stopwords from our words
stopfree_words = lower_docs_words - stwords
len(stopfree_words)

13

In [69]:
stemmer = snowball.SnowballStemmer('english')
stemmed_words = set([stemmer.stem(x) for x in stopfree_words])
len(stemmed_words)

13

In [73]:
inverted_index = defaultdict(set)

# We maintain the reference to the document by its index in the corpus list
for docid, c in enumerate(doc_list):
    for sent in sent_tokenize(c):
        for word in word_tokenize(sent):
            word_lower = word.lower()
            if word_lower not in stwords:
                word_stem = stemmer.stem(word_lower)
                # We add the document to the set againt the word in our
                # index
                inverted_index[word_stem].add(docid)

inverted_index

defaultdict(set,
            {'first': {0, 1, 2, 3},
             'word': {0, 1, 2, 3},
             '.': {0, 1, 2, 3},
             'second': {0, 1, 2, 3},
             'text': {0, 1, 2, 3},
             ',': {0, 1, 2, 3},
             'hello': {0, 1, 2, 3},
             '!': {0, 1, 2, 3},
             '?': {0, 1, 2, 3},
             'third': {0, 1, 2, 3},
             'name': {2, 3},
             'shayanth': {2, 3},
             'work': {3}})

In [74]:
def process_and_search(query):
    matched_documents = set()
    for word in word_tokenize(query):
        word_lower = word.lower()
        if word_lower not in stwords:
            word_stem = stemmer.stem(word_lower)
            matches = inverted_index.get(word_stem)
            if matches:
                # The operator |= is a short hand for set union
                matched_documents |= matches
    return matched_documents

In [76]:
process_and_search("shayanTH")

{2, 3}