In [10]:
import numpy as np
import pandas as pd 
import pickle
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import ADM_heapfunctions as heap

#Clean the given query (does not filter out duplicate words but it is taken care of in the program)
def clean_query(lst):
    """Tokenise, lowercase, stem and stop-word-filter the pieces of a query.

    Parameters
    ----------
    lst : iterable of str
        Query fragments; each one is tokenised with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        Stemmed, lowercased tokens in input order (duplicates are kept),
        without stop words, punctuation tokens or all-digit tokens.
    """
    # Membership is tested once per token, so keep the stop words in a set.
    stop_words = set(
        stopwords.words('english')
        + list(punctuation)
        + ['no','information','NO','No','Information','INFORMATION']
    )

    # One stemmer for the whole query instead of one per word.
    stemmer = PorterStemmer()
    # Older NLTK releases expose `stem_word`; newer ones only provide `stem`.
    # Prefer the original method when it exists so results stay unchanged
    # there, and fall back to `stem` instead of raising AttributeError.
    stem = getattr(stemmer, 'stem_word', stemmer.stem)

    cleaned = []
    for sentence in lst:
        for token in nltk.word_tokenize(sentence):
            stemmed = stem(token.lower())
            # The filter is applied to the stemmed form, as before.
            if stemmed not in stop_words and not stemmed.isdigit():
                cleaned.append(stemmed)
    return cleaned


#Read all docs from the disk into a dictionary 
def docs_dct(num_docs=11269):
    """Load documents ``0 .. num_docs-1`` from disk into a dictionary.

    Parameters
    ----------
    num_docs : int, optional
        Number of consecutive document ids to read, starting at 0.
        Defaults to 11269, the value that used to be hard-coded here.

    Returns
    -------
    dict
        Maps each document id to the list returned by ``read_doc`` for it.
    """
    return {doc_id: read_doc(doc_id) for doc_id in range(num_docs)}


#Sub function of docs_dct(), read one single document from the disk
def read_doc(i):
    """Read one cleaned document from disk.

    Parameters
    ----------
    i : int
        Document id; the file read is ``./ADM_dataset/cleaned/___<i>.txt``.

    Returns
    -------
    list of str
        The lines of the file in order, each with its trailing newline
        characters removed.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    path_file = './ADM_dataset/cleaned/'+'___'
    # The context manager closes the handle even on error; the file used to
    # be left open, leaking one descriptor per document read.
    with open(path_file+str(i)+'.txt','r',encoding='utf-8') as target:
        return [wrd.rstrip('\n') for wrd in target]


#TF-IDF Functions

#Calculates term frequency in a document. The type of variable document must be string for this function to work
def tf(term, document):
    """Logarithmic term-frequency weight of `term` in `document`.

    Parameters
    ----------
    term : str
        Token to look for.
    document : str
        Whitespace-separated text; it is passed to ``freq``, which splits it.

    Returns
    -------
    float
        ``1 + log10(count)`` when the term occurs at least once, otherwise 0.0.
    """
    count = freq(term, document)
    if count == 0:
        # log(0) is -inf (with a RuntimeWarning) and would poison any vector
        # norm computed from it; an absent term simply carries no weight.
        return 0.0
    return 1+(np.log(count)/np.log(10)) #1+log(tf) logarithmic term frequency

#sub function of tf() function
def freq(term, document):
    """Count the whitespace-separated tokens of `document` equal to `term`."""
    tokens = document.split()
    return sum(1 for token in tokens if token == term)

#calculates the inverse doc freq of a term log(N/df_n)
def idf(term):
    """Inverse document frequency of `term`: log10(N / df).

    N is the fixed corpus size 11269 and df is the length of the term's entry
    in the module-level ``inverted_index``. A term that is not in the index
    raises KeyError.
    """
    total_docs = 11269
    postings = inverted_index[term]
    return np.log(total_docs / len(postings)) / np.log(10)


#converts the given document having list of words into a string (so that it can be used by tf() function)
#and also builds unique words out of the given document list
def build_uni_wrds_cnv_string(doc_id_list):
    """Return the distinct words of a word list and the list as one string.

    Parameters
    ----------
    doc_id_list : list of str
        The words of a document (or query), in order.

    Returns
    -------
    tuple (set of str, str)
        The set of distinct words, and every word prefixed by a single space
        and concatenated in order (the form that ``tf`` expects).
    """
    distinct_words = set(doc_id_list)
    joined = ''.join(' ' + word for word in doc_id_list)
    return distinct_words, joined

#converts an input query into a vector space
def query_vector(query):
    """Turn a cleaned query into a unit-length tf-idf vector.

    Parameters
    ----------
    query : list of str
        Cleaned query terms (duplicates allowed).

    Returns
    -------
    dict
        Maps each distinct query term found in the index to its tf*idf weight
        divided by the Euclidean norm of all weights. Terms that are not in
        the index are left out. If the norm is zero (no usable term, or every
        weight is zero) the weights are returned un-normalised.
    """
    uniq_wrds,ch=build_uni_wrds_cnv_string(query)
    query_vec={}
    query_vec_list=[]
    for term in uniq_wrds:
        try:
            query_vec[term]=tf(term,ch)*idf(term)
        except (KeyError, ZeroDivisionError):
            # KeyError: the term is not in the inverted index.
            # ZeroDivisionError: the term has an empty posting list.
            # Anything else (e.g. the index not being loaded) is a real error
            # and is no longer silently swallowed.
            continue
        query_vec_list.append(query_vec[term])

    # Euclidean norm of the weights, used to normalise the query vector.
    mod_val=np.sqrt(np.sum(np.array(query_vec_list)**2))
    if mod_val == 0:
        # Dividing by a zero norm would turn every weight into NaN.
        return query_vec

    for key in query_vec.keys():
        query_vec[key]=query_vec[key]/mod_val #normalized query vector

    return query_vec


#converts input document list into vector space
def doc_vector(doc_id_list):
    """Turn a document's word list into a unit-length tf vector.

    Only the logarithmic term frequency is used here; the idf factor is
    applied on the query side. Returns a dict mapping each distinct word to
    its tf weight divided by the Euclidean norm of all the weights.
    """
    distinct_words, text = build_uni_wrds_cnv_string(doc_id_list)

    # Keep (word, weight) pairs in one list so the norm and the result are
    # built from the weights in the same order.
    weighted = [(word, tf(word, text)) for word in distinct_words]
    norm = np.sqrt(np.sum(np.array([weight for _, weight in weighted])**2))

    return {word: weight/norm for word, weight in weighted}
        
    
#computes the cosine score between query_vector and document_vector
def compute_cosine(query_vec_dct,doc_vec_dct):
    """Dot product of two sparse vectors given as term -> weight dicts.

    Only terms present in both dicts contribute. Returns 0 when the two
    vectors share no term.
    """
    shared_terms = set(doc_vec_dct.keys()) & set(query_vec_dct.keys())
    return sum(query_vec_dct[term]*doc_vec_dct[term] for term in shared_terms)


#finds posting list for the terms of given query
def find_posting_list_query(query_list):
    """Look up the posting list of every query term that is in the index.

    Parameters
    ----------
    query_list : iterable of str
        Cleaned query terms.

    Returns
    -------
    dict
        Maps each term found in the module-level ``inverted_index`` to its
        entry there. Terms that are not in the index are skipped.
    """
    dct={}
    for term in query_list:
        try:
            dct[term]=inverted_index[term]
        except KeyError:
            # The term is not in the index. Only this case is skipped; other
            # errors (such as the index not being loaded yet) now surface
            # instead of producing an empty result silently.
            continue
    return dct

#computes cosine score for all documents in the postings list, however they are unsorted at this point
def rank_docs(query):
    """Score every document that contains at least one query term.

    Parameters
    ----------
    query : list of str
        Cleaned query terms.

    Returns
    -------
    dict
        Maps document id to the cosine score between the query vector and
        that document's vector. The entries are not sorted by score.
    """
    doc_dct=find_posting_list_query(query)

    #precompute query vector as it will be same for all calculations
    query_vec=query_vector(query)

    score_dct={}
    for postings in doc_dct.values():
        for posting in postings:
            # The first element of each posting is the document id.
            document=posting[0]
            if document in score_dct:
                # A document matching several query terms appears in several
                # posting lists; its score is the same every time, so it is
                # vectorised and scored only once.
                continue
            score_dct[document]=compute_cosine(query_vec,doc_vector(document_dct[document]))

    return score_dct

#asks the user to enter a query, then outputs the ranked list of matching documents with their links
def run_query(top_k=19):
    """Read a query from standard input and print the best-scoring documents.

    Parameters
    ----------
    top_k : int, optional
        Maximum number of results to print. Defaults to 19, the number the
        original fixed-length loop printed.

    For each result, prints its score, its document id and the link stored at
    that id in the list returned by ``get_links``. Prints a short notice and
    returns when no document matches.
    """
    input_query=str(input())
    query=clean_query(input_query.split())
    dct=rank_docs(query)
    if not dct:
        # Nothing to sort or index into; the old loop raised IndexError here.
        print('No matching documents found')
        return

    # Group document ids by score. Inverting the dict to score -> doc kept
    # only one document per distinct score and silently dropped ties.
    docs_by_score={}
    for doc_id,score in dct.items():
        docs_by_score.setdefault(score,[]).append(doc_id)

    score_array=np.array(list(docs_by_score.keys()))
    # The best score is read from the end of the result, so heapsort is taken
    # to return the scores in ascending order.
    sorted_scores=heap.heapsort(score_array)
    links=get_links()

    shown=0
    # Walk from the highest score down and stop at index 0, so fewer than
    # top_k results never wrap around to negative indices.
    for i in range(len(sorted_scores)-1,-1,-1):
        score=sorted_scores[i]
        for doc_id in docs_by_score[score]:
            if shown>=top_k:
                return
            print('Score:',score,'Doc_ID',doc_id)
            print('Link',links[doc_id])
            shown+=1

#collects all links in memory
def get_links():
    """Read every recipe link from ``all_recipe_links.txt``.

    Returns
    -------
    list of str
        The lines of the file in order, each with its trailing newline
        characters removed. ``run_query`` indexes this list by document id.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    path_file = 'all_recipe_links.txt'
    # The context manager closes the handle; it used to stay open after
    # every query.
    with open(path_file,'r',encoding='utf-8') as target:
        return [wrd.rstrip('\n') for wrd in target]

In [2]:
# NOTE: unpickling can execute arbitrary code; only load an index file that
# was produced by a trusted source.
# The context manager closes the file once the index has been loaded.
with open('inverted_index.p','rb') as index_file:
    inverted_index=pickle.load(index_file)
document_dct=docs_dct()


In [12]:
# Sanity check: size of the index entry stored for the term 'oil'.
len(inverted_index['oil'])

6988

In [14]:
# Interactive demo: reads a query from standard input and prints the ranked results.
run_query()

oil
Score: 0.244401015741 Doc_ID 8044
Link http://www.bbc.co.uk/food/recipes/goatscheesepacirctea_65619
Score: 0.241371061921 Doc_ID 1524
Link http://www.bbc.co.uk/food/recipes/halloumicheeselemono_13052
Score: 0.237636903292 Doc_ID 3003
Link http://www.bbc.co.uk/food/recipes/sun-blushed_tomato_and_23591
Score: 0.236249332952 Doc_ID 5843
Link http://www.bbc.co.uk/food/recipes/homemade_chilli_oil_54985
Score: 0.231916140991 Doc_ID 4880
Link http://www.bbc.co.uk/food/recipes/steamedbroccoliwithl_84194
Score: 0.231772888613 Doc_ID 10890
Link http://www.bbc.co.uk/food/recipes/breadedcheese_77127
Score: 0.228559628961 Doc_ID 9680
Link http://www.bbc.co.uk/food/recipes/steamedseabasswithgi_90422
Score: 0.22707566237 Doc_ID 6415
Link http://www.bbc.co.uk/food/recipes/tomato_rubbed_toast_65992
Score: 0.225390709742 Doc_ID 3755
Link http://www.bbc.co.uk/food/recipes/tomatosoup_75886
Score: 0.222202789967 Doc_ID 9158
Link http://www.bbc.co.uk/food/recipes/lambcutletswithgarli_83750
Score: 0.2139