In [58]:
import pickle
import time
import pandas as pd
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [60]:
class SearchEngine:

    def create_positional_indexes(self,dataset: pd.DataFrame):
    
        p_index = {}
        """
        to return a positional index:
        {
            'word'[
                freq:int,
                'doc1':[
                    freq:int,
                    [pos:int]
                ]
            ] 
            ...
        }
        """
        display(dataset)
        for _index, row in dataset.iterrows():
            doc = ast.literal_eval(row['tokens'])
            for i,token in enumerate(doc):
                if token not in p_index:
                    p_index[token] = [
                        1,
                        {row['Title']:[1,[i]]}
                    ]
                    
                elif row['Title'] not in p_index[token][1]:
                    p_index[token][0] += 1
                    p_index[token][1][row['Title']] = [1,[i]]

                else:
                    p_index[token][1][row['Title']][0] += 1
                    p_index[token][1][row['Title']][1].append(i)

        return p_index

    def store_index(self):
        with open('index.pkl', 'wb') as f:
            pickle.dump(self.index,f)

    def construct_index(self, df):
        self.index = self.create_positional_indexes(df)
        self.store_index()
    
    def load_index(self,index):
        self.index = index
        print('Index loaded.')
    
    def parse_query(self,query):
        tokens = word_tokenize(query)
        
        stop_words = set(stopwords.words('english'))
        
        filtered_tokens = [w for w in tokens if not w.lower() in stop_words]
        
        ps = PorterStemmer()
        stemmed_tokens = [ps.stem(w) for w in filtered_tokens]
        
        return stemmed_tokens
    
    def search(self,query:str):
        print('querying...')
        q_tokens = self.parse_query(query)
        print('tokens',q_tokens)
        final_results = {}
        
        for index, token in enumerate(q_tokens):
            if token in self.index:
                if index == 0:
                    final_results = self.index[token][1] 
                else:
                    filtered_results = {}
                    for doc in final_results.keys():
                        if doc not in self.index[token][1]:
                            continue
                        for position in final_results[doc][1]:
                            if (int(position)+index) in self.index[token][1][doc][1]:
                                filtered_results[doc] = final_results[doc]
                                break
                    final_results = filtered_results

        return list(final_results.keys())
        
search_engine = SearchEngine()

In [35]:
df=pd.read_csv('parsed_data.csv', sep=',',header=0)
search_engine.construct_index(df)


Unnamed: 0.1,Unnamed: 0,Title,tokens
0,0,Kansas Saloon Smashers,"['bartend', 'work', 'saloon', ',', 'serv', 'dr..."
1,1,Love by the Light of the Moon,"['moon', ',', 'paint', 'smile', 'face', 'hang'..."
2,2,The Martyred Presidents,"['film', ',', 'minut', 'long', ',', 'compos', ..."
3,3,"Terrible Teddy, the Grizzly King","['last', '61', 'second', 'consist', 'two', 'sh..."
4,4,Jack and the Beanstalk,"['earliest', 'known', 'adapt', 'classic', 'fai..."
...,...,...,...
34881,34881,The Water Diviner,"['film', 'begin', '1919', ',', 'world', 'war',..."
34882,34882,Çalgı Çengi İkimiz,"['two', 'musician', ',', 'salih', 'gürkan', ',..."
34883,34883,Olanlar Oldu,"['zafer', ',', 'sailor', 'live', 'mother', 'dö..."
34884,34884,Non-Transferable,"['film', 'centr', 'around', 'young', 'woman', ..."


In [61]:
with open('index.pkl', 'rb') as f:
    index = pickle.load(f)

search_engine.load_index(index)

start_time = time.time()
results = search_engine.search('A bartender is working at a saloon')

print("results -",results)
print("time taken -",(time.time() - start_time), 'seconds')

Index loaded.
querying...
tokens ['bartend', 'work', 'saloon']
results - ['Kansas Saloon Smashers']
time taken - 0.002501249313354492 seconds
