# Search Engine

In this file, we walk through the process of creating a search engine with an inverted index.

In [2]:
# Imports
import pandas as pd
from path import Path
import os
from collections import defaultdict
from math import log
import string
from collections import Counter
import re

In [None]:
# Pull the complete script dataset straight from the hosted CSV
complete = pd.read_csv('https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv')
# Give every column a short, consistent name
complete.columns = [
    'index', 'character', 'quote', 'scene', 'location',
    'view', 'episode', 'date', 'series', 'file',
]
# Collapse runs of whitespace in the speaker name (cast to str first) ...
complete['character'] = complete['character'].apply(
    lambda name: " ".join(str(name).split())
)
# ... and in the quote text (no str cast here, so every value must already be a string)
complete['quote'] = complete['quote'].apply(
    lambda line: " ".join(line.split())
)
# Peek at five random rows
complete.sample(5)

Unnamed: 0,index,character,quote,scene,location,view,episode,date,series,file
32197,417,O'BRIEN,"Then, do it, dammit!",70 INT. OPS,OPS,INT.,If Wishes Were Horses,1993-02-24,Deep Space Nine,416.txt
136636,307,GEORDI,If there's even one chance in a million I'm ri...,47 INT. OBS LOUNGE,OBS LOUNGE,INT.,Interface,1993-07-14,The Next Generation,255.txt
77282,387,RIKER,"Data, what if we forced an EPS discharge throu...",54 INT. BRIDGE,BRIDGE,INT.,Force of Nature,1993-09-17,The Next Generation,261.txt
86707,51,TASHA,Aikido. One.,11 INT. HOLODECK - FULL SHOT - OPTICAL (SPLI...,HOLODECK - FULL SHOT - OPTICAL,INT.,Code of Honor,1987-07-02,The Next Generation,104.txt
103786,227,FALSE PICARD,I have been preoccupied. I've been thinking ab...,31 INT. CAPTAIN'S QUARTERS (OPTICAL),CAPTAIN'S QUARTERS,INT.,Allegiance,1990-01-15,The Next Generation,166.txt


In [None]:
# TODO: potentially clean up the character names too

Now that the data is loaded, we define a few helper functions that the search engine will use.

In [10]:
def normalize_string(input_string: str) -> str:
    '''Return a cleaned-up, lowercase version of the given text.

    Every character in ``string.punctuation`` is replaced by a space, runs of
    whitespace are collapsed into single spaces (leading and trailing
    whitespace is dropped), and the result is lowercased.

    For example:
        "Hello,  HI!!! How are     you?"
    becomes
        "hello hi how are you"
    '''
    # map each punctuation character (by code point) to a single space
    punctuation_to_space = {ord(mark): ' ' for mark in string.punctuation}
    spaced_out = input_string.translate(punctuation_to_space)
    # split() with no argument discards all surplus whitespace
    tokens = spaced_out.split()
    return ' '.join(tokens).lower()

In [None]:
# TODO: may also want to lemmatize and drop stop words

In [11]:
def update_url_scores(old: dict[str, float], new: dict[str, float]):
    '''Merge the scores in `new` into `old`, summing values for shared keys.

    `old` is modified in place and is also the object that is returned;
    `new` is left untouched.
    '''
    for key, value in new.items():
        if key not in old:
            # first time we see this key: just copy the score over
            old[key] = value
            continue
        old[key] += value
    return old

Now we will create a Search Engine object with an inverted index.

In [21]:
class search_engine:
    '''Search engine built on an inverted index and ranked with BM25.

    Attributes:
        index: maps a normalized word to a dict of {doc id: term frequency}
        docs: maps a doc id to the normalized text of that document
        original_docs: maps a doc id to the text exactly as it was loaded
        k1, b: the BM25 constants
        name: readable name of the instance (returned by str())
        full_data: the DataFrame handed to the constructor (stored, not used
            by any method of this class)
    '''
    def __init__(self, index:dict[str, dict[str, int]]=None, docs: dict[str, str]=None,
        original_docs: dict[str, str]=None,k1:float=1.5,b:float=0.75,
        name:str='Default Search Engine',full_data: pd.DataFrame=None):
        '''
        Instantiate an instance of the search engine class.

        Input:
            index: dict, the inverted index ({word: {doc id: frequency}});
                a fresh empty index is created when None
            docs: dict, key is the id of the quote and value is the
                normalized quote text; empty when None
            original_docs: dict, key is the id of the quote and value is the
                unmodified quote text; empty when None
            k1: float, k1 constant to use for bm25
            b: float, b constant to use for bm25
            name: string, name used for the search engine instance
            full_data:  pd.DataFrame, the original dataset used

        Output:
            search engine!
        '''
        # None defaults avoid sharing one mutable container between instances
        self.index = defaultdict(lambda: defaultdict(int)) if index is None else index
        self.docs = {} if docs is None else docs
        self.original_docs = {} if original_docs is None else original_docs
        self.k1 = k1
        self.b = b
        self.name = name
        self.full_data = full_data

    def __str__(self)->str:
        '''
        Prints a readable name of the search engine

        Output:
            str: name of the instance
        '''
        return self.name

    def _unindex_document(self, doc_id)->None:
        '''Remove the postings of an already-loaded document from the index.'''
        for word in set(self.docs[doc_id].split()):
            postings = self.index.get(word)
            if postings is None:
                continue
            postings.pop(doc_id, None)
            if not postings:
                # drop words that no longer occur anywhere
                del self.index[word]

    def _index_document(self, doc_id, content)->None:
        '''Store one document under doc_id and add its words to the index.'''
        # Re-loading an id (e.g. re-running a notebook cell) must replace the
        # old postings instead of adding the term counts a second time.
        if doc_id in self.docs:
            self._unindex_document(doc_id)
        self.original_docs[doc_id] = content
        normalized = normalize_string(str(content))
        self.docs[doc_id] = normalized
        # split() without an argument yields no '' token for empty documents
        for word, count in Counter(normalized.split()).items():
            postings = self.index.get(word)
            if postings is None:
                # works for a plain-dict index as well as the default defaultdict
                postings = self.index[word] = defaultdict(int)
            postings[doc_id] = postings.get(doc_id, 0) + count

    def _average_doc_length(self)->float:
        '''Mean length, in characters, of the normalized docs (0.0 if none).'''
        if not self.docs:
            return 0.0
        return sum(len(text) for text in self.docs.values()) / len(self.docs)

    def bulk_load(self,data:dict,full_data:pd.DataFrame=None)->None:
        '''
        Bulk loads new documents to the search engine.

        Loading an id that already exists replaces that document.

        Input:
            data: dict, maps a doc id to the document text
            full_data: pd.DataFrame, the data with the full info
                (accepted for compatibility; currently not used here)
        '''
        size_before = len(self.docs)
        for doc_id, content in data.items():
            self._index_document(doc_id, content)
        size_after = len(self.docs)
        print(f'We added {size_after-size_before} documents. The engine now has {size_after} documents.')

    def individual_load(self, document:str)-> None:
        '''
        Load a single document into the search engine. Ideally this should not be used.

        The document gets the first unused integer id at or above the
        current number of documents.

        Input:
            document: str, the new text document to add to the search engine.
        '''
        # len(docs) alone may collide with an existing id when ids are not 0..n-1
        new_id = len(self.docs)
        while new_id in self.docs:
            new_id += 1
        self._index_document(new_id, document)
        print(f'Added document "{document}" to search engine.')

    def num_docs(self)->int:
        '''
        Returns the number of docs

        Output:
            int: length of docs
        '''
        return len(self.docs)

    def find_ids(self, keyword:str)->dict:
        '''
        Find the doc ids that contain a keyword.

        Input:
            keyword: str, the word to search
        Returns:
            dict: keys are the indices and the values are
                the frequency of the word in the document
                (an empty dict when the word is not in the index)
        '''
        key = normalize_string(keyword)
        # .get() keeps a lookup from inserting unknown words into the index
        return self.index.get(key, {})

    def bw_idf(self,keyword:str)-> float:
        '''
        Find the inverse document frequency for a term

        Input:
            keyword: str, word to search

        Output:
            float: the idf score
        '''
        num_docs = self.num_docs()
        keyword = normalize_string(keyword)
        n_kw = len(self.find_ids(keyword))
        idf = log((num_docs-n_kw+0.5)/(n_kw+0.5)+1)
        return idf

    def bm25(self,keyword:str,avg_len:float=None)-> dict[str, float]:
        '''
        Calculate the bm25 score of a keyword for every document containing it

        Document length is measured in characters of the normalized text.

        Input:
            keyword: str, word to search
            avg_len: float, average document length to normalize with;
                computed from the loaded docs when None

        Output:
            dict[str, float]: dict of doc ids & the bm25 score
                (empty when no document contains the keyword)
        '''
        keyword = normalize_string(keyword)
        postings = self.find_ids(keyword)
        if not postings:
            # also covers an empty engine, where no average length exists
            return {}
        idf = self.bw_idf(keyword)
        if avg_len is None:
            avg_len = self._average_doc_length()
        result = {}
        for doc_id, freq in postings.items():
            # fall back to a neutral ratio if every document is empty
            length_ratio = len(self.docs[doc_id]) / avg_len if avg_len else 1.0
            numerator = freq*(self.k1+1)
            denominator = freq+self.k1*(1 - self.b + self.b * length_ratio)
            result[doc_id] = idf*numerator / denominator
        return result

    def bw_search(self,query:str,limit:int=20)->list[tuple[str,float]]:
        '''
        Completes the bm25 search of the documents using the query and returns
        the best-scoring documents.

        Input:
            query: str, the query to search through the documents
            limit: int, limits the number of documents

        Output:
            list[tuple[str,float]]: (doc id, summed bm25 score) pairs,
                highest score first, at most `limit` of them
        '''
        # split() drops empty tokens, so a blank query matches nothing
        terms = normalize_string(query).split()
        # the average length is the same for every term: compute it once
        avg_len = self._average_doc_length()
        scores = {}
        for term in terms:
            scores = update_url_scores(scores, self.bm25(term, avg_len))
        # rank by score, breaking ties on the doc id
        ranked = sorted(scores.items(), key=lambda kv: (kv[1], kv[0]),reverse=True)
        return ranked[:limit]

In [22]:
# Build an (initially empty) engine that keeps a reference to the full dataset
bm25_engine = search_engine(
    full_data=complete,
    name='BM25 Engine',
)