In [38]:
import nltk
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import snowball
from natsort import natsorted
import os

In [39]:
doc_list = []
file_names = natsorted(os.listdir("./files"))
for file in file_names:
    with open(f"./files/{file}","r") as f:
        text = f.read()
        doc_list.append(text)
    f.close()

doc_list

['This is the first word.\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.',
 'This is the first word.\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.',
 'This is the first word.\nmy name is shayanTH\nThis is the second text, Hello! How are you?\nThis is the third, this is it now.\n\n',
 'This is the first word.\nmy name is shayanTH\nThis is the second text, Hello! How are you?\nThis is the third, this working is it now.']

In [40]:
docs_words = []

# Tokenize a paragraph into sentences and each sentence in to
# words
for c in doc_list:
    for sent in sent_tokenize(c):
        word_tokens = word_tokenize(sent)
        docs_words += word_tokens
docs_words

['This',
 'is',
 'the',
 'first',
 'word',
 '.',
 'This',
 'is',
 'the',
 'second',
 'text',
 ',',
 'Hello',
 '!',
 'How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'the',
 'third',
 ',',
 'this',
 'is',
 'it',
 'now',
 '.',
 'This',
 'is',
 'the',
 'first',
 'word',
 '.',
 'This',
 'is',
 'the',
 'second',
 'text',
 ',',
 'Hello',
 '!',
 'How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'the',
 'third',
 ',',
 'this',
 'is',
 'it',
 'now',
 '.',
 'This',
 'is',
 'the',
 'first',
 'word',
 '.',
 'my',
 'name',
 'is',
 'shayanTH',
 'This',
 'is',
 'the',
 'second',
 'text',
 ',',
 'Hello',
 '!',
 'How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'the',
 'third',
 ',',
 'this',
 'is',
 'it',
 'now',
 '.',
 'This',
 'is',
 'the',
 'first',
 'word',
 '.',
 'my',
 'name',
 'is',
 'shayanTH',
 'This',
 'is',
 'the',
 'second',
 'text',
 ',',
 'Hello',
 '!',
 'How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'the',
 'third',
 ',',
 'this',
 'working',
 'is',
 'it',
 'now',
 '.']

In [41]:
word_list = []
for word in text:
    for sent in sent_tokenize(word):
        word_tokens = word_tokenize(sent)
        word_list += word_tokens

word_list


['T',
 'h',
 'i',
 's',
 'i',
 's',
 't',
 'h',
 'e',
 'f',
 'i',
 'r',
 's',
 't',
 'w',
 'o',
 'r',
 'd',
 '.',
 'm',
 'y',
 'n',
 'a',
 'm',
 'e',
 'i',
 's',
 's',
 'h',
 'a',
 'y',
 'a',
 'n',
 'T',
 'H',
 'T',
 'h',
 'i',
 's',
 'i',
 's',
 't',
 'h',
 'e',
 's',
 'e',
 'c',
 'o',
 'n',
 'd',
 't',
 'e',
 'x',
 't',
 ',',
 'H',
 'e',
 'l',
 'l',
 'o',
 '!',
 'H',
 'o',
 'w',
 'a',
 'r',
 'e',
 'y',
 'o',
 'u',
 '?',
 'T',
 'h',
 'i',
 's',
 'i',
 's',
 't',
 'h',
 'e',
 't',
 'h',
 'i',
 'r',
 'd',
 ',',
 't',
 'h',
 'i',
 's',
 'w',
 'o',
 'r',
 'k',
 'i',
 'n',
 'g',
 'i',
 's',
 'i',
 't',
 'n',
 'o',
 'w',
 '.']

In [42]:
def remove_punc(string):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for ele in string:  
        if ele in punc:  
            string = string.replace(ele, "") 
    return string


In [49]:
lower_docs_words = [ x.lower() for x in docs_words ]
lower_docs_words = set([remove_punc(i) for i in docs_words])
lower_docs_words = set(list(filter(None, lower_docs_words)))


In [50]:
stwords = set(stopwords.words('english'))

# Using set difference to eliminate stopwords from our words
stopfree_words = lower_docs_words - stwords
stopfree_words

{'Hello',
 'How',
 'This',
 'first',
 'name',
 'second',
 'shayanTH',
 'text',
 'third',
 'word',
 'working'}

In [51]:
stemmer = snowball.SnowballStemmer('english')
stemmed_words = set([stemmer.stem(x) for x in stopfree_words])
stemmed_words

{'first',
 'hello',
 'how',
 'name',
 'second',
 'shayanth',
 'text',
 'third',
 'this',
 'word',
 'work'}

In [53]:
inverted_index = defaultdict(set)

# We maintain the reference to the document by its index in the Doc list
for docid, c in enumerate(doc_list):
    for sent in sent_tokenize(c):
        for word in word_tokenize(sent):
            word_lower = word.lower()
            if word_lower not in stwords:
                word_stem = stemmer.stem(word_lower)
                # We add the document to the set againt the word in our
                # index
                inverted_index[word_stem].add(docid)

inverted_index 

defaultdict(set,
            {'first': {0, 1, 2, 3},
             'word': {0, 1, 2, 3},
             '.': {0, 1, 2, 3},
             'second': {0, 1, 2, 3},
             'text': {0, 1, 2, 3},
             ',': {0, 1, 2, 3},
             'hello': {0, 1, 2, 3},
             '!': {0, 1, 2, 3},
             '?': {0, 1, 2, 3},
             'third': {0, 1, 2, 3},
             'name': {2, 3},
             'shayanth': {2, 3},
             'work': {3}})

In [54]:
def process_and_search(query):
    matched_documents = set()
    for word in word_tokenize(query):
        word_lower = word.lower()
        if word_lower not in stwords:
            word_stem = stemmer.stem(word_lower)
            matches = inverted_index.get(word_stem)
            if matches:
                # The operator |= is a short hand for set union
                matched_documents |= matches
    return matched_documents

In [64]:
def intersection(query1,query2):
    if query1 and query2 in inverted_index.keys():
        print(inverted_index[query1].intersection(inverted_index[query2]))
    

In [65]:
intersection("shayanth","work")


{3}
