### Imports

In [29]:
import re
import pandas as pd
import nltk
import csv
import pickle
import math

### Defining Index/InvertedIndex Functions

In [2]:
def read_documents(path):
    """Read a collection file and split it into per-document records.

    The file content is split on the ``".I "`` marker.  Every chunk after
    the first is split once more, at the first occurrence of a newline
    followed by ``.S``, which yields a ``[header, body]`` list (or a
    one-element list when that marker is absent).

    Args:
        path: Path of the text file to read.

    Returns:
        list: Element 0 is the raw text preceding the first ``".I "`` marker
        (left as a string); every later element is the list produced by the
        second split.
    """
    # The context manager closes the handle even when reading fails; the
    # previous version left the file open.
    with open(path, 'r') as f:
        chunks = f.read().split(".I ")
    for i in range(1, len(chunks)):
        chunks[i] = chunks[i].split("\n.S", maxsplit=1)
    return chunks

In [3]:
def remove_stopwords(text, stpwrd):
    """Drop every whitespace-separated word of ``text`` that is in ``stpwrd``.

    Matching is exact and case-sensitive: the text is not lower-cased here.

    Args:
        text: Input string.
        stpwrd: Iterable of words to remove.

    Returns:
        str: The remaining words joined by single spaces.
    """
    # Build the lookup set once: hashing makes each membership test O(1)
    # instead of scanning the whole stopword list for every word.
    stop_set = frozenset(stpwrd)
    kept = [word for word in text.split() if word not in stop_set]
    return ' '.join(kept)

In [4]:
def process_text(text):
    """Lower-case ``text`` and split it into alphanumeric tokens.

    Any run of non-word characters or underscores acts as a separator.
    Tokens made up only of decimal digits are discarded; tokens that mix
    letters and digits (e.g. ``b12``) are kept.

    Args:
        text: Input string.

    Returns:
        list[str]: The tokens in their original order.
    """
    # Raw string: the previous non-raw literals relied on invalid escape
    # sequences, which newer Python versions warn about.
    tokens = re.split(r'[\W_]+', text.lower())
    # Filtering token by token removes *every* purely numeric token.  The
    # previous regex consumed the whitespace around a match, so in a run such
    # as "10 20 30" every second number survived, and a lone "42" was kept.
    # str.isdecimal() covers the same characters as the regex class \d.
    return [token for token in tokens if token and not token.isdecimal()]

In [5]:
def clean_documents(documents, stpwrd):
    """Turn raw document records into ``{document id: token list}``.

    Args:
        documents: List whose element 0 is ignored and whose later elements
            are ``[header, text]`` lists; the header is expected to hold the
            document id after a ``.U`` marker.
        stpwrd: Stopwords passed to ``remove_stopwords``.

    Returns:
        dict: Integer document id -> tokens produced by ``process_text``.
        Records without a text part or without a parseable id are skipped.
    """
    cleaned_documents = {}
    # Iterate up to and including the last record; the previous
    # ``len(documents) - 1`` bound silently dropped the final document.
    for i in range(1, len(documents)):
        record = documents[i]
        try:
            header, text = record[0], record[1]
        except IndexError:
            # Record has no text section: nothing to index.
            continue
        # Read the id from the ".U" field rather than from a fixed character
        # offset, which was only right for 5- and 6-digit sequence numbers.
        _, marker, after_marker = header.partition(".U")
        raw_id = after_marker if marker else header[9:]
        try:
            doc_id = int(raw_id)  # int() tolerates surrounding whitespace
        except ValueError:
            # Malformed header: skip the record like other broken records.
            continue
        text = remove_stopwords(text, stpwrd)
        cleaned_documents[doc_id] = process_text(text)
    return cleaned_documents

In [6]:
def index_one_file(termlist):
    """Build a positional index for one document.

    Args:
        termlist: Sequence of terms in document order.

    Returns:
        dict: Term -> list of 0-based positions at which it occurs, with
        keys in order of first occurrence.
    """
    positions_by_term = {}
    for position, term in enumerate(termlist):
        # setdefault creates the list the first time a term is seen.
        positions_by_term.setdefault(term, []).append(position)
    return positions_by_term

In [7]:
def make_indices(termlists):
    """Index every document of a collection.

    Args:
        termlists: Mapping of document key -> list of terms.

    Returns:
        dict: Document key -> result of ``index_one_file`` for its terms.
    """
    return {
        doc_key: index_one_file(terms)
        for doc_key, terms in termlists.items()
    }

In [8]:
def fullIndex(regdex):
    """Invert a per-document index into a per-term index.

    Args:
        regdex: Mapping of document key -> {term: positions}.

    Returns:
        dict: Term -> {document key: positions}.  The position lists are the
        same list objects as in ``regdex``; they are not copied.
    """
    total_index = {}
    for filename, file_index in regdex.items():
        for word, positions in file_index.items():
            # Each (document, term) pair is visited exactly once because both
            # levels are dict keys, so the former "extend an existing entry"
            # branch could never run and has been removed.
            total_index.setdefault(word, {})[filename] = positions
    return total_index

### Creating Index/InvertedIndex, Saving Pickles (Do not run, load from pickles instead)

In [9]:
### MAIN
# Build the stopword list: NLTK's English stopwords plus dotted tags.
# ".I" and ".S" are the markers read_documents() splits on; the other tags
# are presumably the remaining field markers of the collection -- TODO confirm.
# Note that '.M' is listed twice.
nltk.download('stopwords')
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(['.U', '.S','.M','.T','.P','.W','.M','.I'])

# Pipeline: raw records -> token lists per document id -> positional index per
# document (`index`) -> term-to-documents index (`inverted_index`).
# Both globals are read by the query and ranking functions further down.
documents = read_documents('/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/ohsumed.88-91')
cleaned_documents = clean_documents(documents, stpwrd)
index = make_indices(cleaned_documents)
inverted_index = fullIndex(index)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Persist the inverted index.  The context manager closes (and flushes) the
# file even if pickling raises part-way through.
with open("/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/inverted_index.pkl", "wb") as inverted_index_file:
    pickle.dump(inverted_index, inverted_index_file)

In [10]:
# Persist the per-document index.  The context manager closes (and flushes)
# the file even if pickling raises part-way through.
with open("/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/index.pkl", "wb") as index_file:
    pickle.dump(index, index_file)

### Loading Index/InvertedIndex from Pickles

In [9]:
# Load the per-document index written by the saving cell.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles that you created yourself.
with open("/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/index.pkl", "rb") as b_file:
    index = pickle.load(b_file)

In [10]:
# Load the inverted index written by the saving cell.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles that you created yourself.
with open("/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/inverted_index.pkl", "rb") as a_file:
    inverted_index = pickle.load(a_file)

In [11]:
# index[88143308]

In [12]:
# inverted_index['neodymium']

### Query Parsing Functions

In [13]:
def read_queries(filename):
    """Parse a tagged query file into ``{query number: description}``.

    The file is expected to consist of ``<top> ... </top>`` blocks.  Inside a
    block, the ``<num>`` line supplies the query number (the text after the
    first ``:``), the ``<title>`` line is parsed but not returned, and the
    last other line longer than two characters that does not contain
    ``<desc>`` becomes the description.

    Args:
        filename: Path of the query file.

    Returns:
        dict: Query number (str) -> description line (str).  Empty when the
        file holds no complete block.

    Raises:
        KeyError: If a block lacks a ``<num>`` line or a description line.
    """
    queries = []
    current_query = None
    # Read-only mode is enough; 'r+' needlessly required write permission.
    with open(filename, 'r') as f:
        for raw_line in f:
            # Strip only the newline.  Slicing off the last character
            # unconditionally mangled a final "</top>" that had no trailing
            # newline, so the last query was lost.
            line = raw_line.rstrip('\n')
            if '<top>' in line:
                current_query = {}
            elif current_query is None:
                # Text outside a <top> block carries no query data.
                continue
            elif '</top>' in line:
                queries.append(current_query)
                current_query = None
            elif '<num>' in line:
                current_query['num'] = line.split(':')[1].strip()
            elif '<title>' in line:
                current_query['title'] = line.split('>')[1].strip()
            elif '<desc>' not in line and len(line) > 2:
                current_query['description'] = line

    # Build the result once after parsing; it used to be rebuilt on every
    # line, and was left undefined for an empty file.
    return {query['num']: query['description'] for query in queries}

In [14]:
def clean_queries(queries, stpwrd):
    """Replace each query string with its cleaned token list, in place.

    Args:
        queries: Mapping of query id -> query text; it is modified in place.
        stpwrd: Stopwords passed to ``remove_stopwords``.

    Returns:
        dict: The same ``queries`` object, with every value now the result of
        ``process_text`` applied to the stopword-filtered text.
    """
    for query_id in queries:
        # Only existing keys are reassigned, so iterating while updating is safe.
        without_stopwords = remove_stopwords(queries[query_id], stpwrd)
        queries[query_id] = process_text(without_stopwords)
    return queries

In [15]:
def free_text_query(query):
    """Return the ids of all documents that contain at least one query term.

    Reads the module-level ``inverted_index`` (term -> {document id: ...}).

    Args:
        query: Iterable of query terms.

    Returns:
        list: Matching document ids, without duplicates, in no particular
        order.

    Raises:
        NameError: If ``inverted_index`` has not been built or loaded yet.
    """
    docs = set()
    for term in query:
        try:
            postings = inverted_index[term]
        except KeyError:
            # The term does not occur in the collection.  Only this case is
            # tolerated: the former bare ``except`` also hid a missing
            # ``inverted_index`` and silently returned no documents.
            continue
        docs.update(postings)
    return list(docs)

In [16]:
def intersection(lst1, lst2):
    """Return the distinct elements common to both inputs, as a list.

    The order of the returned elements is unspecified.
    """
    first, second = set(lst1), set(lst2)
    shared = first & second
    return list(shared)

### Loading Queries

In [17]:
# Rebuild the stopword list with the same steps as the index-building cell,
# presumably so this section also works when the index was loaded from the
# pickles instead of being rebuilt.
nltk.download('stopwords')
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(['.U', '.S','.M','.T','.P','.W','.M','.I'])
# `queries` ends up as {query number: list of cleaned tokens}; it is the
# global that create_log_file() iterates over.
queries = read_queries('/content/drive/MyDrive/INFORMATION RETRIEVAL/HW1 SEARCH ENGINE/query.ohsu.1-63')
queries = clean_queries(queries, stpwrd)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Ranking Functions

In [18]:
def boolean_ranking(docs, query):
    """Rank documents by how many distinct query terms they contain.

    Reads the module-level ``index`` (document id -> {term: positions}).

    Args:
        docs: Iterable of candidate document ids.
        query: Iterable of query terms.

    Returns:
        list: Up to 50 ``(document id, score)`` tuples, highest score first;
        ties keep the order in which the documents were given.
    """
    query_terms = set(query)
    docs_score = {}
    for doc in docs:
        matched_terms = set(index[doc].keys()) & query_terms
        docs_score[doc] = len(matched_terms)
    ranked = sorted(docs_score.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:50]

In [36]:
def tf_ranking(docs, query):
    """Rank documents by normalised term frequency of the query terms.

    Reads the module-level ``index`` (document id -> {term: positions}).
    A document's score is the total number of occurrences of the query terms
    divided by ``len(index[doc])``.

    NOTE(review): ``len(index[doc])`` is the number of *distinct* terms of
    the document, not its length in tokens -- confirm this normalisation is
    intended.

    Args:
        docs: Iterable of candidate document ids.
        query: Iterable of query terms; a repeated term is counted each time.

    Returns:
        list: Up to 50 ``(document id, score)`` tuples, highest score first;
        ties keep the order in which the documents were given.
    """
    docs_score = {}
    for doc in docs:
        doc_terms = index[doc]
        doc_length = len(doc_terms)
        if doc_length == 0:
            # A document without indexed terms cannot match anything; score
            # it 0 rather than dividing by zero.
            docs_score[doc] = 0.0
            continue
        # A term missing from the document contributes 0 occurrences.  An
        # explicit default replaces the former bare ``except``.
        absolute_freq = sum(len(doc_terms.get(term, ())) for term in query)
        docs_score[doc] = absolute_freq / doc_length
    ranked = sorted(docs_score.items(), key=lambda item: item[1], reverse=True)
    return ranked[:50]


In [41]:
def tf_idf_ranking(docs, query):
    """Rank documents by the summed TF-IDF weight of the query terms.

    Reads the module-level ``index`` (document id -> {term: positions}) and
    ``inverted_index`` (term -> {document id: positions}).  The score is::

        sum over query terms t of  tf(t, doc) * log(N / df(t))  /  len(index[doc])

    where ``N = len(index)`` is the number of documents and ``df(t)`` is the
    number of documents containing ``t``.

    The previous version multiplied the total term frequency by the *sum* of
    all IDF values.  That sum is the same for every document of a query, so
    the ranking was identical to plain TF.  It also used the vocabulary size
    (``len(inverted_index)``) as ``N``.

    NOTE(review): ``len(index[doc])`` is the number of *distinct* terms of
    the document, not its length in tokens -- confirm this normalisation is
    intended.

    Args:
        docs: Iterable of candidate document ids.
        query: Iterable of query terms; a repeated term is counted each time.

    Returns:
        list: Up to 50 ``(document id, score)`` tuples, highest score first;
        ties keep the order in which the documents were given.
    """
    # IDF depends only on the term, so compute it once per query rather than
    # once per document.
    total_docs = len(index)
    term_idf = {}
    for term in set(query):
        postings = inverted_index.get(term)
        if postings:
            term_idf[term] = math.log(total_docs / len(postings))

    docs_score = {}
    for doc in docs:
        doc_terms = index[doc]
        doc_length = len(doc_terms)
        if doc_length == 0:
            # No indexed terms: nothing can match, and avoid dividing by zero.
            docs_score[doc] = 0.0
            continue
        weighted_freq = 0.0
        for term in query:
            idf = term_idf.get(term)
            if idf is None:
                # Term unknown to the collection: it carries no weight.
                continue
            weighted_freq += len(doc_terms.get(term, ())) * idf
        docs_score[doc] = weighted_freq / doc_length

    ranked = sorted(docs_score.items(), key=lambda item: item[1], reverse=True)
    return ranked[:50]

In [56]:
def custom_ranking(docs, query):
    """Shortlist documents with ``tf_idf_ranking``, then re-rank them.

    The documents returned by ``tf_idf_ranking`` are passed, in that order
    and without their scores, to ``boolean_ranking``, whose result is
    returned unchanged.

    Args:
        docs: Iterable of candidate document ids.
        query: Iterable of query terms.

    Returns:
        list: The ``(document id, score)`` tuples from ``boolean_ranking``.
    """
    shortlisted = [doc for doc, _ in tf_idf_ranking(docs, query)]
    return boolean_ranking(shortlisted, query)

### Creating Log Files

In [54]:
def create_log_file(algorithm_name):
    """Rank every query with one algorithm and write the results to a file.

    Reads the module-level ``queries`` (query code -> list of terms).  The
    output file is named after the algorithm and is created in the current
    working directory.  Each line holds tab-separated fields: query code,
    the literal ``Q0``, document id, rank (starting at 1), score and the
    algorithm name.

    Args:
        algorithm_name: One of ``'BOOL'``, ``'TF'``, ``'TF-IDF'`` or
            ``'CUSTOM'``.

    Raises:
        ValueError: If ``algorithm_name`` is not one of the names above.
    """
    # Validate before opening the file, so that a typo neither truncates an
    # existing log nor fails later with an unrelated UnboundLocalError.
    if algorithm_name not in ('BOOL', 'TF', 'TF-IDF', 'CUSTOM'):
        raise ValueError("unknown ranking algorithm: " + repr(algorithm_name))

    # The context manager closes the file even if ranking a query fails.
    with open(algorithm_name, 'w') as f:
        for query_code, query in queries.items():
            docs = free_text_query(query)
            if algorithm_name == 'BOOL':
                docs_score = boolean_ranking(docs, query)
            elif algorithm_name == 'TF':
                docs_score = tf_ranking(docs, query)
            elif algorithm_name == 'TF-IDF':
                docs_score = tf_idf_ranking(docs, query)
            else:  # 'CUSTOM'
                docs_score = custom_ranking(docs, query)
            for rank, (doc_id, score) in enumerate(docs_score, start=1):
                fields = [query_code, "Q0", str(doc_id), str(rank),
                          str(score), algorithm_name]
                f.write("\t".join(fields) + "\n")

In [234]:
# Rank every query with boolean_ranking and write the run to a file named
# 'BOOL' in the current working directory.
create_log_file('BOOL')

In [39]:
# Rank every query with tf_ranking and write the run to a file named 'TF'
# in the current working directory.
create_log_file('TF')

In [42]:
# Rank every query with tf_idf_ranking and write the run to a file named
# 'TF-IDF' in the current working directory.
create_log_file('TF-IDF')

In [57]:
# Rank every query with custom_ranking and write the run to a file named
# 'CUSTOM' in the current working directory.
create_log_file('CUSTOM')