# 495 Search engine applied to nf_corpus.txt

In [None]:
# CST 495 Data Science For Search, Spring 2016
# @uthors: Richard Isom, Joshua Kim
#
# This program explores tf-idf and BM25 ranking techniques for building a search engine,
# and is used to test its functionality and gather user feedback on queries from volunteer testers.
#
# Medical text obtained from http://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/
# Search engine template designed after https://github.com/mattwg/data-science-for-search
# Map-Reduce design by CSUMB Big Data capstone project: http://www.csumbbigdata.github.io


import csv
import math
import pandas as pd
from collections import Counter

# Load the corpus: a tab-separated file whose second column holds the text.
dataFile = "data/nfcorpus_medical.txt"
with open(dataFile) as f:
    r = csv.reader(f, delimiter='\t')
    # Rows 0 and 1 are skipped; every later row contributes its second column.
    docs = [row[1] for row in list(r)[2:]]
    
# retrieve results from document data, with k1 and b BM-25 constants
def get_results(qry, corpus, k1=1.5, b=0.75):
    """Score the documents of ``corpus`` against ``qry`` using BM25.

    Parameters
    ----------
    qry : str
        Query text; it is split on whitespace and each token is looked up
        verbatim in the inverted index (no case folding or stemming).
    corpus : list of str
        Documents to search; a document's id is its position in this list.
    k1 : float
        BM25 term-frequency saturation constant.
    b : float
        BM25 document-length normalisation constant.

    Returns
    -------
    list of [score, doc_id]
        One entry per document whose accumulated score is strictly positive.
        The list is not sorted; it follows the iteration order of the
        underlying ``Counter``.
    """
    terms = qry.split()
    # Nothing to score. An empty corpus would otherwise divide by zero when
    # the average document length is computed, and an empty query would build
    # the whole inverted index only to throw it away.
    if not terms or not corpus:
        return []

    idx = create_inverted_index(corpus)
    n = len(corpus)
    doc_lengths = [len(text.split()) for text in corpus]
    # avg_length can only be 0 when every document is empty, in which case the
    # index is empty too and the division below is never reached.
    avg_length = float(sum(doc_lengths)) / n

    score = Counter()
    for term in terms:
        if term not in idx:
            continue
        term_idf = idf(term, idx, n)
        for doc, freq in idx[term].items():
            f = float(freq)
            length_norm = 1 - b + (b * (float(doc_lengths[doc]) / avg_length))
            score[doc] += term_idf * ((f * (k1 + 1)) / (f + k1 * length_norm))

    # idf() goes to zero or below for very common terms, so documents matched
    # only by such terms are dropped here.
    # output [0] score, [1] doc_id
    return [[value, doc] for doc, value in score.items() if value > 0]

# inverted index of document data
def create_inverted_index(corpus):
    """Build a term -> {doc_id: term_frequency} mapping for ``corpus``.

    Each document is split on whitespace; ``doc_id`` is the document's
    position in ``corpus``. Returns a plain dict of plain dicts.
    """
    index = {}
    for doc_id, text in enumerate(corpus):
        for token in text.split():
            # First sighting of a token creates its postings dict; first
            # sighting inside a document starts that document's count at 1.
            postings = index.setdefault(token, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1
    return index

# inverse document frequency
def idf(term, idx, n):
    """Return log(n / (1 + df)) where df is the number of documents containing ``term``.

    ``idx`` is the inverted index (term -> {doc_id: frequency}) and ``n`` the
    corpus size. ``term`` must be present in ``idx``; a missing term raises
    ``KeyError``.
    """
    doc_freq = len(idx[term])
    ratio = float(n) / (1 + doc_freq)
    return math.log(ratio)

# bokeh tools for displaying data
from bokeh.charts import output_notebook, Scatter, show
from bokeh.io import push_notebook
from bokeh.plotting import ColumnDataSource, figure
from bokeh.models import HoverTool, ColorMapper
from bokeh.palettes import YlOrRd9

# NOTE(review): this only dumps the palette; YlOrRd9 is not used anywhere
# else in the visible code, so it looks like leftover debugging output.
print(YlOrRd9)

output_notebook()

# An empty query matches nothing, so the figure starts out blank and is
# filled in later when update() pushes new data into `source`.
results = get_results('', docs, k1=1.5, b=0.75)

# x: BM25 score, y: document length in words, desc: document text (hover).
x_vals = [float(x[0]) for x in results]
y_vals = [len(docs[x[1]].split()) for x in results]
d_vals = [docs[x[1]] for x in results]

hover = HoverTool(
        tooltips=[
            ("desc", "@desc"),
        ]
    )

source = ColumnDataSource(data=dict(x=x_vals,y=y_vals,desc=d_vals))
p = figure()
p.add_tools(hover)
# Refer to the source's columns by name instead of passing the raw lists
# alongside `source`. This ties the glyph to the 'x'/'y' columns that
# update() rewrites, and avoids mixing literal sequences with a data source,
# which Bokeh does not accept.
p.circle('x', 'y', size=10, color="red", source=source)
show(p)

# Refresh the plotted data whenever the query or the BM25 constants change
def update(qry, k1, b):
    """Re-run the search and push the new points to the notebook plot.

    For each hit the plot receives its score (x), the word count of the
    matching document (y) and the document text (hover description).
    """
    scores = []
    lengths = []
    texts = []
    for hit in get_results(qry, docs, k1, b):
        # hit is [score, doc_id]
        text = docs[hit[1]]
        scores.append(float(hit[0]))
        lengths.append(len(text.split()))
        texts.append(text)
    source.data['x'] = scores
    source.data['y'] = lengths
    source.data['desc'] = texts
    push_notebook()


In [None]:
# Interactive controls for the search: `qry` starts as the text 'cancer',
# while k1 and b are given as (min, max, step) ranges. interact() calls
# update(qry, k1, b) with the current widget values, which re-scores the
# corpus and pushes the new points to the plot.
from ipywidgets import interact
interact(update, qry='cancer', k1=(0.0,2.0,0.05), b=(0.0,1.0,0.05))


Map Reduce Functions
These were tested on our test text document to see how they would work across Hadoop clusters.

In [None]:
# Mapper
def mapper():
    """Map step of a word count: print ``word<TAB>1`` for every token read from stdin.

    Tokens are separated by any run of whitespace, so blank lines and
    repeated spaces produce no output. Emitting an empty word would yield a
    line the reducer cannot split into a (word, count) pair.
    """
    # Imported here because the notebook has no file-level `import sys`.
    import sys

    for line in sys.stdin:
        for word in line.split():
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{0}\t{1}".format(word, "1"))


# Still runs when executed as a notebook cell or a streaming script (both run
# as __main__), but no longer consumes stdin when the code is merely imported.
if __name__ == "__main__":
    mapper()

# Reducer
# Module-level accumulator: totals persist across calls to reducer().
wordCount = {}
def reducer():
    """Reduce step of a word count: sum ``word<TAB>count`` lines from stdin.

    Lines without a tab separator, or whose count is not an integer, are
    skipped. Totals are accumulated in the module-level ``wordCount`` dict
    and then printed as ``word<TAB>total``, one line per word.
    """
    # Imported here because the notebook has no file-level `import sys`.
    import sys

    for line in sys.stdin:
        word, separator, count = line.strip().partition("\t")
        if not separator:
            # Blank or malformed line: no (word, count) pair to read.
            continue
        try:
            count = int(count)
        except ValueError:
            continue
        wordCount[word] = wordCount.get(word, 0) + count
    for word in wordCount:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("{0}\t{1}".format(word, wordCount[word]))


# Still runs when executed as a notebook cell or a streaming script (both run
# as __main__), but no longer consumes stdin when the code is merely imported.
if __name__ == "__main__":
    reducer()