In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import string
import numpy as np

In [None]:
# Directory holding the corpus; every regular file inside it is treated as one document.
data_dir = '../Humor,Hist,Media,Food'
# os.listdir returns entries in arbitrary order. A file's position in file_names is
# used as its document ID, so sort the names to keep IDs stable across runs and
# platforms, and skip sub-directories, which cannot be opened as text files.
file_names = sorted(
    fname for fname in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, fname))
)
file_paths = [os.path.join(data_dir, fname) for fname in file_names]

In [None]:
# English stop words from NLTK, kept as a set so preprocess_file can drop them with a set difference.
stop_words = set(stopwords.words('english'))

def read_files(fpaths):
    """Read every file in ``fpaths`` and return its preprocessed tokens.

    Args:
        fpaths: Iterable of file paths. Each file is decoded as UTF-8;
            undecodable bytes are replaced rather than raising.

    Returns:
        A list with one entry per path, in the same order as ``fpaths``.
        Each entry is whatever ``preprocess_file`` returns for that file's text.
    """
    file_tokens = []
    for fpath in fpaths:
        # The context manager closes each handle as soon as it has been read,
        # so a large corpus does not accumulate open file descriptors.
        with open(fpath, 'r', encoding='utf-8', errors='replace') as f:
            ftxt_unprocessed = f.read()
        file_tokens.append(preprocess_file(ftxt_unprocessed))
    return file_tokens

def isValidTok(tok):
    """Decide whether a token should be kept as an index term.

    A token is rejected when it is a substring of ``string.punctuation``
    (this also covers the empty string), when ``str.isnumeric`` reports it
    as numeric, or when every one of its characters is punctuation.

    Args:
        tok: The token string to test.

    Returns:
        True if the token passes all three checks, otherwise False.
    """
    # Substring test: catches single punctuation marks and contiguous runs such as "()".
    if tok in string.punctuation:
        return False
    if tok.isnumeric():
        return False
    # At least one non-punctuation character must remain (rejects e.g. "..." or "''").
    return any(ch not in string.punctuation for ch in tok)

def preprocess_file(file_text):
    """Turn raw file text into the list of distinct terms to index.

    The text is lower-cased and tokenized with ``word_tokenize``. Duplicate
    tokens and stop words are dropped, and only tokens accepted by
    ``isValidTok`` are kept.

    Args:
        file_text: The full text of one document.

    Returns:
        A list of unique, valid, non-stop-word tokens. The order follows
        set iteration and is therefore not meaningful.
    """
    lowered = file_text.lower()
    # The set difference de-duplicates and removes stop words in a single step.
    candidates = list(set(word_tokenize(lowered)) - stop_words)
    return [tok for tok in candidates if isValidTok(tok) == True]
    

In [None]:
# Tokenize every corpus file once; file_toks[i] holds the terms of the file at file_paths[i].
file_toks = read_files(file_paths)

In [None]:
def create_inverted_index(file_toks):
    """Build an inverted index mapping each term to its sorted posting list.

    Args:
        file_toks: Sequence of token collections, one per document. The
            position of a document in this sequence is its document ID.

    Returns:
        A dict whose keys are the terms in sorted order and whose values are
        ascending lists of the IDs of the documents containing the term.
        Each document ID appears at most once per term, even if the token is
        repeated within that document's token collection.
    """
    inv_index = {}
    for doc_id, toks in enumerate(file_toks):
        for tok in toks:
            postings = inv_index.setdefault(tok, [])
            # Documents are visited in increasing ID order, so postings stay sorted
            # and a repeat of the current document can only be the last entry.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    # Rebuild the dict so that iteration yields the terms alphabetically.
    return dict(sorted(inv_index.items()))

In [None]:
def getDocsFromID(file_names, doc_IDs):
    """Translate document IDs into document names.

    Args:
        file_names: Sequence of names, indexed by document ID.
        doc_IDs: Iterable of document IDs to look up.

    Returns:
        A list of the names at the given IDs, in the order of ``doc_IDs``.
    """
    return [file_names[doc_ID] for doc_ID in doc_IDs]

In [None]:
def check_equal_arrays(arr1, arr2):
    """Check whether two sequences hold the same elements, ignoring order.

    Args:
        arr1: First sequence of mutually comparable elements.
        arr2: Second sequence of mutually comparable elements.

    Returns:
        True when both sequences have the same length and their sorted
        contents match element by element, otherwise False.
    """
    # Different lengths can never match; bail out before paying for the sorts.
    if len(arr1) != len(arr2):
        return False

    # Sorting both sides removes the influence of element order.
    return not any(a != b for a, b in zip(sorted(arr1), sorted(arr2)))

In [None]:
def query_AND(inv_index, term1, term2, verbose=False):
    """Answer the boolean query ``term1 AND term2``.

    Intersects the two posting lists with a two-pointer merge, which relies
    on both lists being sorted in ascending order.

    Args:
        inv_index: Mapping from term to its sorted list of document IDs.
        term1: First query term.
        term2: Second query term.
        verbose: When equal to True, print a summary of the query result.

    Returns:
        A tuple ``(num_docs, num_comparisons, doc_names)``: the number of
        matching documents, the number of posting comparisons the merge made,
        and the matching document names looked up in the global ``file_names``.
        If either term is missing from the index the result is ``(0, 0, [])``
        and nothing is printed.
    """
    # A conjunction with an unknown term cannot match any document.
    if not (term1 in inv_index and term2 in inv_index):
        return 0, 0, []

    left, right = inv_index[term1], inv_index[term2]
    n_left, n_right = len(left), len(right)
    i = j = 0
    matches = []
    num_comparisons = 0

    while i < n_left and j < n_right:
        num_comparisons += 1
        a, b = left[i], right[j]
        if a == b:
            # Present in both lists: keep it and advance both sides.
            matches.append(a)
            i += 1
            j += 1
        elif a < b:
            i += 1
        else:
            j += 1

    doc_names_retreived = getDocsFromID(file_names, matches)
    num_docs_retreived = len(matches)

    if verbose == True:
        print(f"Query: {term1} AND {term2}\nNo. of documents retreived: {num_docs_retreived}\nMinimum number of comparisons: {num_comparisons}\nNames of retreived documents: {doc_names_retreived}")

    return num_docs_retreived, num_comparisons, doc_names_retreived

def query_OR(inv_index, term1, term2, verbose=False):
    """Answer the boolean query ``term1 OR term2``.

    Forms the union of the two posting lists with a two-pointer merge, which
    relies on both lists being sorted in ascending order. A term that is not
    in the index is treated as having an empty posting list, so every path
    returns document names (never raw document IDs) and honours ``verbose``.

    Args:
        inv_index: Mapping from term to its sorted list of document IDs.
        term1: First query term.
        term2: Second query term.
        verbose: When equal to True, print a summary of the query result.

    Returns:
        A tuple ``(num_docs, num_comparisons, doc_names)``: the number of
        matching documents, the number of posting comparisons the merge made
        (0 when at least one term is missing), and the matching document
        names looked up in the global ``file_names``.
    """
    # Missing terms contribute nothing to the union; the merge loop is then
    # skipped and the other list is copied over without any comparison.
    posting1 = inv_index.get(term1, [])
    posting2 = inv_index.get(term2, [])
    ptr1 = 0
    ptr2 = 0
    answer_docID = []
    num_comparisons = 0

    while ptr1 < len(posting1) and ptr2 < len(posting2):
        num_comparisons += 1

        if posting1[ptr1] == posting2[ptr2]:
            # Shared document: emit once and advance both lists.
            answer_docID.append(posting1[ptr1])
            ptr1 += 1
            ptr2 += 1
        elif posting1[ptr1] < posting2[ptr2]:
            answer_docID.append(posting1[ptr1])
            ptr1 += 1
        else:
            answer_docID.append(posting2[ptr2])
            ptr2 += 1

    # At most one of the lists still has entries; they all belong to the union.
    answer_docID.extend(posting1[ptr1:])
    answer_docID.extend(posting2[ptr2:])

    num_docs_retreived = len(answer_docID)
    doc_names_retreived = getDocsFromID(file_names, answer_docID)

    if verbose == True:
        print(f"Query: {term1} OR {term2}\nNo. of documents retreived: {num_docs_retreived}\nMinimum number of comparisons: {num_comparisons}\nNames of retreived documents: {doc_names_retreived}")

    return num_docs_retreived, num_comparisons, doc_names_retreived

def perform_NOT(inv_index, term):
    """Return the IDs of all documents that do NOT contain ``term``.

    The universe of documents is ``range(len(file_names))``, read from the
    global ``file_names``.

    Args:
        inv_index: Mapping from term to its list of document IDs.
        term: The term to negate. If it is not in the index, every document
            ID is returned.

    Returns:
        An ascending list of document IDs absent from the term's posting list.
    """
    # Set membership keeps the filter linear in the number of documents;
    # calling list.remove() once per posting would rescan the list each time
    # and raise ValueError on a repeated ID.
    excluded = set(inv_index.get(term, ()))
    return [docID for docID in range(len(file_names)) if docID not in excluded]

def query_AND_NOT(inv_index, term1, term2, verbose=False):

    terms_list = inv_index.keys()

    if((term1 not in terms_list)):
        return 0, 0, []
    
    posting1 = inv_index[term1]
    posting2 = perform_NOT(inv_index, term2)
    ptr1 = 0
    ptr2 = 0
    answer_docID = []

    num_comparisons = 0

    while(ptr1 < len(posting1) and ptr2 < len(posting2)):
        num_comparisons += 1
        # print(f"1 : {posting1[ptr1]} , 2: {posting2[ptr2]}")
        if(posting1[ptr1] == posting2[ptr2]):
            answer_docID.append(posting1[ptr1])
            ptr1 += 1
            ptr2 += 1
        elif(posting1[ptr1] < posting2[ptr2]):
            ptr1 += 1
        else:
            ptr2 += 1

    num_docs_retreived = len(answer_docID)
    doc_names_retreived = getDocsFromID(file_names, answer_docID)

    if(verbose==True):
        print(f"Query: {term1} AND NOT {term2}\nNo. of documents retreived: {num_docs_retreived}\nMinimum number of comparisons: {num_comparisons}\nNames of retreived documents: {doc_names_retreived}")
    
    # temp_verification = list(set(posting1) | set(posting2))
    # print(f"Verification: {check_equal_arrays(temp_verification, answer_docID)}")
    return num_docs_retreived, num_comparisons, doc_names_retreived

def query_OR_NOT(inv_index, term1, term2, verbose=False):
    """Answer the boolean query ``term1 OR NOT term2``.

    The complement of ``term2``'s posting list is obtained from
    ``perform_NOT`` and united with ``term1``'s posting list using a
    two-pointer merge over the two ascending lists. If ``term1`` is not in
    the index it is treated as having an empty posting list, so every path
    returns document names (never raw document IDs) and honours ``verbose``.

    Args:
        inv_index: Mapping from term to its sorted list of document IDs.
        term1: Term whose documents are always included.
        term2: Term whose documents are excluded unless they contain ``term1``.
        verbose: When equal to True, print a summary of the query result.

    Returns:
        A tuple ``(num_docs, num_comparisons, doc_names)``: the number of
        matching documents, the number of posting comparisons the merge made
        (0 when ``term1`` is missing), and the matching document names looked
        up in the global ``file_names``.
    """
    # An unknown term1 adds nothing, leaving just the complement of term2.
    posting1 = inv_index.get(term1, [])
    posting2 = perform_NOT(inv_index, term2)
    ptr1 = 0
    ptr2 = 0
    answer_docID = []
    num_comparisons = 0

    while ptr1 < len(posting1) and ptr2 < len(posting2):
        num_comparisons += 1

        if posting1[ptr1] == posting2[ptr2]:
            # Shared document: emit once and advance both lists.
            answer_docID.append(posting1[ptr1])
            ptr1 += 1
            ptr2 += 1
        elif posting1[ptr1] < posting2[ptr2]:
            answer_docID.append(posting1[ptr1])
            ptr1 += 1
        else:
            answer_docID.append(posting2[ptr2])
            ptr2 += 1

    # At most one of the lists still has entries; they all belong to the union.
    answer_docID.extend(posting1[ptr1:])
    answer_docID.extend(posting2[ptr2:])

    num_docs_retreived = len(answer_docID)
    doc_names_retreived = getDocsFromID(file_names, answer_docID)

    if verbose == True:
        print(f"Query: {term1} OR NOT {term2}\nNo. of documents retreived: {num_docs_retreived}\nMinimum number of comparisons: {num_comparisons}\nNames of retreived documents: {doc_names_retreived}")

    return num_docs_retreived, num_comparisons, doc_names_retreived


    

In [None]:
# Build the term -> posting-list index once; all query_* calls below read from it.
inverted_index = create_inverted_index(file_toks)

In [None]:
# Example query: documents that contain 'water' but not 'effect'.
# NOTE(review): the result variables carry an `_AND` suffix although this is an AND NOT query.
num_docs_AND, min_cmps_AND, doc_names_AND = query_AND_NOT(inverted_index, 'water', 'effect', verbose=True)

In [None]:
# Example query: documents that contain 'water', 'effect', or both.
num_docs_OR, min_cmps_OR, doc_names_OR = query_OR(inverted_index, 'water', 'effect', verbose=True)