In [4]:
import pandas as pd
from path import Path
import os
from collections import defaultdict
from math import log
import string
from collections import Counter



# Following along with [80 Line Search Engine](https://www.alexmolas.com/2024/02/05/a-search-engine-in-80-lines.html)

In [5]:
# open complete csv
complete = pd.read_csv('../StarTrekNextGenScriptData/complete_data.csv')
complete.columns = ['index', 'character', 'quote', 'scene', 'location', 'view',
       'episode', 'date', 'series', 'file']

### Create Search Engine Class

In [None]:
# functions to use  --- may wanna update this to lemmatize
def normalize_string(input_string: str) -> str:
    translation_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    string_without_punc = input_string.translate(translation_table)
    string_without_double_spaces = ' '.join(string_without_punc.split())
    return string_without_double_spaces.lower()

In [None]:
class search_engine:
    '''Search engine class'''
    def __init__(self, index:dict[str, dict[str, int]]=None, docs: dict[str, str]=None,k1:float=1.5,b:float=0.75,name:str='Default Search Engine'):
        '''Instantiate the search engine class.
        Input:
            index: dict[str, dict[int,int]], the inverted index
            docs: dict[int, str], key is the id of the quote and value is the quote text
            k1: float, k1 constant to use for bm25
            b: float, b constant to use for bm25
        Output:
            None, class instantiation
        '''
        if index is None:
            self.index = defaultdict(lambda: defaultdict(int))
        else:
            self.index = index
        if docs is None:
            self.docs = {}
        else:
            self.docs = docs
        self.k1 =k1
        self.b = b
        self.name = name

    def __str__(self)->str:
        '''Print readable name of the search engine
        Output:
            str: name of the instance
        '''
        return(self.name)

    def bulk_load(self,data:dict)->None:
        '''Bulk loads new documents to add to the search engine.

        Input:
            data: dict[int,str], where int is the id and str is the content
        '''
        original_len = len(self.docs.keys())
        for ind in data.keys():
            # get the content & the id
            content = data[ind]
            id = ind
            # add to doc list
            self.docs[id]=normalize_string(content)
            # normalize content
            words = normalize_string(content).split(" ")
            # update the index
            for word in words:
                self.index[word][id] += 1
        new_len =original_len = len(self.docs.keys())
        print(f'We have added {new_len-original_len} documents. The engine now has {new_len} documents.')

    def load(self,document:str)->None:
        '''Load a single document into the search engine. Ideally this should not be used.

        Input:
            document: str, the new text document to add to the search engine.
        '''
        new_id = len(self.docs.keys())
        self.docs[new_id]=normalize_string(document)
        words = normalize_string(document).split(" ")
        for word in words:
            self.index[word][new_id]

    def num_docs(self)->int:
        '''Returns the number of docs

        Output:
            int: length of docs
        '''
        return len(self.docs.keys())

    def find_ids(self,keyword:string)->dict:
        keyword =normalize_string(keyword)
        return self.index[keyword]

In [29]:
test_engine = search_engine()
test_engine.bulk_load(complete[['quote']].to_dict()['quote'])
test_engine.num_docs()

We have added 0 documents. The engine now has 144211 documents.


144211

In [38]:
test_engine.docs[2]

'what do you mean what s this it s puree of beetle'

In [34]:
# set up default index & docs
index = defaultdict(lambda: defaultdict(int)) # dict[str, dict[str, int]]
docs = {} # dict[str, str]
# for BM 25 
k1 = 1.5
b = .75

Here we are created an inverted index: 

```
{
    term: {
        document_id: frequency
    }
}
```

Where the doc id is the index of the quote!

In [19]:
#complete[['quote']].to_dict()

In [35]:
for ind, row in complete.iterrows():
    # get id & content
    id = str(ind)
    content = row['quote']
    # update documents
    docs[id]=normalize_string(content)
    # normalize content
    words = normalize_string(content).split(" ")
    # update the index
    for word in words:
        index[word][id] += 1

In [None]:
num_docs=len(docs)
doc_ids = docs.keys()
avg_ql = sum(len(d) for d in docs.values()) / len(docs)
# length of quote divided by the number of quotes to get avg quote len

In [100]:
def find_ids(keyword:str)->dict[str,int]:
    global index
    # gets the ids of the quotes containing a word
    keyword =normalize_string(keyword)
    return index[keyword]

def bw_idf(keyword:str)->float:
    # for each term, get the idf
    global num_docs
    n_kw = len(find_ids(keyword))
    return log((num_docs-n_kw+0.5)/(n_kw+0.5)+1)

def bm25(keyword:str)-> dict[str, float]:
    result = {}
    idf = bw_idf(keyword)
    for id, freq in find_ids(keyword).items():
        num = freq * (k1+1)
        denom = freq+k1*(1 - b + b * len(docs[id]) / avg_ql)
        result[id]=idf * num /denom
    return result


In [101]:
def update_url_scores(old: dict[str, float], new: dict[str, float]):
    for url, score in new.items():
        if url in old:
            old[url] += score
        else:
            old[url] = score
    return old

In [102]:
def search(query:str)->dict[str,float]:
    kws = normalize_string(query).split(" ")
    scores ={} # dict[str, float] 
    for k in kws:
        kw_url_score = bm25(k)
        scores = update_url_scores(scores,kw_url_score)
    return scores

In [36]:
index['quark']

defaultdict(int,
            {'391': 1,
             '482': 1,
             '487': 1,
             '558': 1,
             '564': 1,
             '767': 1,
             '779': 1,
             '785': 1,
             '1329': 1,
             '1471': 1,
             '1651': 1,
             '1693': 1,
             '1702': 1,
             '1705': 1,
             '1711': 1,
             '1717': 1,
             '1728': 1,
             '1732': 1,
             '1735': 1,
             '1741': 2,
             '1748': 1,
             '1750': 1,
             '1753': 1,
             '1801': 1,
             '1974': 1,
             '1983': 1,
             '2810': 1,
             '2879': 1,
             '3076': 1,
             '3098': 1,
             '3129': 1,
             '3456': 1,
             '3463': 1,
             '3632': 1,
             '3644': 1,
             '3647': 1,
             '3650': 1,
             '3676': 1,
             '3680': 1,
             '3762': 1,
             '4376': 1,
       

In [103]:
search('Quark')

{'391': 6.221498886973986,
 '482': 6.4626359388959385,
 '487': 8.63909515492241,
 '558': 4.896830111908078,
 '564': 7.5616900019127975,
 '767': 8.63909515492241,
 '779': 5.8401550915036875,
 '785': 6.4626359388959385,
 '1329': 7.394030059321743,
 '1471': 4.382060792739961,
 '1651': 5.59518769792469,
 '1693': 6.72321899740191,
 '1702': 3.6012514210049797,
 '1705': 7.737129744291454,
 '1711': 1.8113976130255554,
 '1717': 3.3993491789429977,
 '1728': 8.63909515492241,
 '1732': 3.701173657502406,
 '1735': 5.944255169954484,
 '1741': 9.828944371920581,
 '1748': 3.113693188758807,
 '1750': 3.543846537378289,
 '1753': 3.4700942647543904,
 '1801': 2.33855251130732,
 '1974': 5.8401550915036875,
 '1983': 6.339775294528471,
 '2810': 4.242684665308142,
 '2879': 1.7542260800887377,
 '3076': 6.052133742966021,
 '3098': 4.690216734982387,
 '3129': 2.56323815873627,
 '3456': 4.269846068947536,
 '3463': 4.297357484579212,
 '3632': 3.0992245359696597,
 '3644': 4.657464443040578,
 '3647': 6.9328769006273