In [1]:
import nltk
import numpy as np
import math
import pickle
import sys
from bs4 import BeautifulSoup as bsoup
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer

In [2]:
# Module-level Porter stemmer instance.
ps = PorterStemmer()

# Punctuation characters, stored as a list with one entry per character.
punctuations = list(".,!'\"()[]{}?\\/~|<>")

# Lift numpy's truncation threshold so whole arrays are printed.
np.set_printoptions(threshold=sys.maxsize)

In [3]:
def spell_correct(query):
    """Return ``query`` with misspelled whitespace-separated words corrected.

    Each word that the spell checker reports as unknown is replaced by the
    checker's best suggestion. Only whole words are replaced, so correcting
    one word never rewrites a substring of another, and the original
    whitespace of ``query`` is preserved.

    Args:
        query: The query text as a single string.

    Returns:
        The query string with every correctable word replaced.
    """
    import re  # local import: only this function needs it

    spell = SpellChecker()
    misspelled = spell.unknown(query.split())
    if not misspelled:
        return query

    corrections = {}
    # dict.fromkeys keeps first-seen order while dropping duplicate words,
    # so each distinct word is looked up (and reported) once.
    for word in dict.fromkeys(query.split()):
        if word not in misspelled:
            continue
        suggestion = spell.correction(word)
        # correction() may yield None when it has no candidate (newer
        # pyspellchecker releases) or hand back the word unchanged; in both
        # cases there is nothing to substitute.
        if not suggestion or suggestion == word:
            continue
        print("Correcting " + word + " to " + suggestion)
        corrections[word] = suggestion

    if not corrections:
        return query

    # Substitute token by token instead of str.replace, which would also
    # rewrite occurrences embedded inside longer words.
    return re.sub(
        r'\S+',
        lambda match: corrections.get(match.group(0), match.group(0)),
        query,
    )

In [4]:
def get_stemmed_token(token):
    """Return the Porter stem of ``token``."""
    return nltk.PorterStemmer().stem(token)

In [5]:
def process_query_vector(query, vocabulary_keys, inverse_vocab_word_dict, term_document_frequency, N):
    """Turn a raw query into a weighted, length-normalised term vector.

    The query tokens are joined, lower-cased, spell-checked and re-tokenised.
    A token that is missing from the vocabulary is stemmed and looked up in
    ``stemmed_vocab.pkl``; if a replacement exists the user is asked to
    confirm it interactively. Term counts are then log-scaled, multiplied by
    ``log(N / df)`` and divided by the vector's Euclidean length.

    Args:
        query: Iterable of query tokens (each is converted with ``str``).
        vocabulary_keys: Sequence of vocabulary words; its length fixes the
            size of the returned vector.
        inverse_vocab_word_dict: Mapping from vocabulary word to its index
            in the vector.
        term_document_frequency: Indexable of per-term document frequencies,
            aligned with the vector indices.
        N: Total number of documents.

    Returns:
        A ``(tokens, query_vector)`` tuple: the final token list (with any
        accepted substitutions applied) and the weighted numpy vector. The
        vector is all zeros when no query term could be matched.

    Raises:
        SystemExit: If the query is a single token that is not in the
            vocabulary and either no replacement exists or the user
            declines the suggested one.
    """
    query = ' '.join(str(token) for token in query).lower()

    # bonus heuristic
    print("Running Spell Check")
    query = spell_correct(query)

    query_vector = np.zeros(len(vocabulary_keys))
    query = nltk.word_tokenize(query)

    # A lone unknown token leaves nothing to search for, so that case aborts
    # instead of silently skipping the token.
    single_token = len(query) == 1
    stemmed_vocab = None  # loaded lazily, at most once per query

    for position, token in enumerate(query):
        if token in inverse_vocab_word_dict:
            query_vector[inverse_vocab_word_dict[token]] += 1
            continue

        if single_token:
            print(
                token + " is not found in vocabulary. Using most appropriate substitution using root word analysis! ")
        else:
            print(token + " not found in vocabulary.")

        # Tokens that do not exist in the vocabulary are replaced by a
        # vocabulary word sharing the same stemmed root, if there is one.
        stemmed_token = get_stemmed_token(token)
        if stemmed_vocab is None:
            # NOTE: pickle.load can execute arbitrary code; only load a
            # trusted stemmed_vocab.pkl.
            with open("stemmed_vocab.pkl", "rb") as vocab_file:
                stemmed_vocab = pickle.load(vocab_file)

        if stemmed_token not in stemmed_vocab:
            if single_token:
                print("Could not replace, no search results found")
                # sys.exit rather than the interactive exit() helper, which
                # is not guaranteed to exist or to stop execution.
                sys.exit(0)
            continue

        # The first entry for the stem is taken as the replacement word
        # (presumably the most common one; depends on how the pickle was built).
        fixed_token = stemmed_vocab[stemmed_token][0]
        print("Did you mean " + fixed_token + "? Press y for yes: ")
        choice = input()
        if choice != 'y':
            if single_token:
                print("No search results found")
                sys.exit(0)
            print("Skipping " + token)
            continue

        # Replace by position so repeated tokens are each handled in place.
        query[position] = fixed_token
        print("Changed query: " + ' '.join(query))
        query_vector[inverse_vocab_word_dict[fixed_token]] += 1

    # Log term frequency (L) followed by inverse document frequency (T).
    for i in range(query_vector.shape[0]):
        if query_vector[i] > 0:
            document_frequency = term_document_frequency[i]
            if document_frequency == 0:
                # A term present in no document cannot match anything;
                # zero it instead of dividing by zero.
                query_vector[i] = 0.0
                continue
            query_vector[i] = (1 + math.log(query_vector[i])) * \
                math.log(N / document_frequency)

    # Cosine normalization (C); skipped for an all-zero vector, which would
    # otherwise become all-NaN.
    vector_length = np.sqrt(np.sum(np.square(query_vector)))
    if vector_length > 0:
        query_vector = np.divide(query_vector, vector_length)

    return query, query_vector

In [6]:
def calculate_score(query_vector, database_lnc):
    """Compute the cosine similarity between the query and every document.

    Args:
        query_vector: 1-D numpy vector for the query.
        database_lnc: Iterable of document vectors with the same length as
            ``query_vector`` (for example the rows of a 2-D array).

    Returns:
        A list of ``[doc_id, score]`` pairs in document order, where
        ``doc_id`` is the document's position in ``database_lnc``. The score
        is 0 when either vector has zero length, since cosine similarity is
        undefined there and NaN would corrupt later sorting.
    """
    # The query norm does not change between documents; compute it once.
    query_norm = np.linalg.norm(query_vector)

    scores = []
    for doc_id, document_vector in enumerate(database_lnc):
        document_norm = np.linalg.norm(document_vector)
        if query_norm == 0 or document_norm == 0:
            score = 0.0
        else:
            score = np.dot(query_vector, document_vector)
            score = score / query_norm
            score = score / document_norm
        scores.append([doc_id, score])
    return scores

In [7]:
def title_weighting(scores, query, doc_titles):
    """Boost document scores according to query words found in the title.

    For each title, every query word that occurs in it and is not one of a
    few trivial words adds 0.1 to the multiplier applied to the score stored
    at the same position in ``scores``.

    Args:
        scores: List of ``[doc_id, score]`` pairs, indexed by document
            position; updated in place.
        query: Iterable of query words.
        doc_titles: Document titles, in the same order as ``scores``.

    Returns:
        The same ``scores`` list, after the in-place update.
    """
    boost_per_match = 0.1
    stop_words = ["of", "and", "a", "the", "an", "is"]
    for index, title in enumerate(doc_titles):
        matches = sum(
            1 for term in query
            if term in title and term not in stop_words
        )
        scores[index][1] = scores[index][1] * (1 + matches * boost_per_match)
    return scores

In [8]:
def scoring(query, database_lnc, vocabulary_keys, inverse_vocab_word_dict, term_document_frequency, N, doc_titles):
    """Rank all documents against ``query`` and print the best matches.

    Builds the query vector, scores every document, sorts the scores in
    descending order and prints up to ten titles with their rank and score.
    Printing stops early at the first document whose score is 0.

    Args:
        query: Iterable of raw query tokens.
        database_lnc: Document vectors, one per document.
        vocabulary_keys: Sequence of vocabulary words.
        inverse_vocab_word_dict: Mapping from vocabulary word to vector index.
        term_document_frequency: Per-term document frequencies.
        N: Total number of documents.
        doc_titles: Titles indexed by document id.
    """
    corrected_query, query_vector = process_query_vector(
        query, vocabulary_keys, inverse_vocab_word_dict, term_document_frequency, N)
    scores = calculate_score(query_vector, database_lnc)
    # Optional title boost, currently disabled:
    #     scores = title_weighting(scores, corrected_query, doc_titles)
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    print("Top 10 Scoring Documents are: ")
    # Slicing keeps this safe when the collection holds fewer than 10 documents.
    for rank, (doc_id, score) in enumerate(scores[:10], start=1):
        if score == 0:
            break
        print(doc_titles[doc_id] + " is at rank " +
              str(rank) + " Score: " + str(score))

In [9]:
# Precomputed document vectors; the first axis is used as the document
# count N and as the document id when scoring.
database_lnc = np.load("database_lnc.npy")
N = database_lnc.shape[0]

# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
# Context managers make sure each file handle is closed after reading.
with open("vocabulary_dict.pkl", "rb") as vocabulary_file:
    vocabulary_dict = pickle.load(vocabulary_file)

vocabulary_keys = list(vocabulary_dict.keys())
# Map each vocabulary word to its position in vocabulary_keys.
inverse_vocab_word_dict = {word: index for index, word in enumerate(vocabulary_keys)}
# Number of documents with a non-zero weight for each term (column-wise count).
term_document_frequency = np.count_nonzero(database_lnc, axis=0)

with open("doc_titles.pkl", "rb") as titles_file:
    doc_titles = pickle.load(titles_file)

In [11]:
# Read one line from the user, split it on whitespace, and rank the documents.
query = input().split()
scoring(
    query,
    database_lnc,
    vocabulary_keys,
    inverse_vocab_word_dict,
    term_document_frequency,
    N,
    doc_titles,
)

lonely widowers
Running Spell Check
widowers not found in vocabulary.
Did you mean widow? Press y for yes: 
y
Changed query: lonely widow
Top 10 Scoring Documents are: 
"Gheorghe Zamfir" is at rank 1 Score: 0.048668874035561034
"Book of Ruth" is at rank 2 Score: 0.03303445145059603
"Berry Berenson" is at rank 3 Score: 0.030607013577806003
"Gaudy Night" is at rank 4 Score: 0.029974901925442714
"Bram Stoker" is at rank 5 Score: 0.025377098888839864
"George Orwell" is at rank 6 Score: 0.022197025218706007
"Bestiary" is at rank 7 Score: 0.021706467308098033
"Book of Lamentations" is at rank 8 Score: 0.02132633262318457
"Bill Haley" is at rank 9 Score: 0.019482329503754642
"Two Tribes" is at rank 10 Score: 0.01914494259904734
