## Search Engine Development => BM25

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import sent_tokenize, word_tokenize 
from rank_bm25 import BM25Okapi

import pickle

### Import Data

In [2]:
# Load the book-title table from the processed-datasets folder.
# The path is relative to the notebook's working directory.
items = pd.read_parquet('../../../Datasets/Processed/books_SE_v3.parquet')

In [4]:
# Sanity check: (rows, columns) of the loaded table.
items.shape

(2113033, 6)

In [5]:
# Preview the first rows; `mod_title` is the column tokenized for BM25 below.
items.head()

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,,wc fields a life on film,wc fields a life on film
1,1333909,Good Harbor,Good Harbor,,good harbor,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,the unschooled wizard sun wolf and starhawk 12,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,Best Friends Forever,eng,best friends forever,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,,runic astrology starcraft and timekeeping in t...,runic astrology starcraft and timekeeping in t...


In [14]:
# Stop code here: the division below deliberately raises ZeroDivisionError so
# that "Run All" halts at this cell and the cells after it are not executed
# automatically.
# NOTE(review): presumably this exists to skip the tokenize/fit cells when the
# pickled artifacts are loaded instead — confirm, and consider replacing the
# intentional error with an explicit guard flag.

10/0

### Tokenize the `mod_title` Column to Feed into BM25

**Tokenizing documents**

In [6]:
# Tokenize every cleaned title; the result is a list of token lists,
# one entry per row of `items`, in row order.
tokenized_documents = list(map(word_tokenize, items["mod_title"].values))
tokenized_documents[:5]

[['wc', 'fields', 'a', 'life', 'on', 'film'],
 ['good', 'harbor'],
 ['the', 'unschooled', 'wizard', 'sun', 'wolf', 'and', 'starhawk', '12'],
 ['best', 'friends', 'forever'],
 ['runic',
  'astrology',
  'starcraft',
  'and',
  'timekeeping',
  'in',
  'the',
  'northern',
  'tradition']]

In [7]:
# Confirm the corpus is a plain Python list (of token lists) before fitting.
type(tokenized_documents)

list

**Fitting BM25 on the tokenized documents**

In [9]:
# Build the BM25 index over the tokenized titles. Document i in the model
# corresponds to tokenized_documents[i].
bm25 = BM25Okapi(tokenized_documents)

**Custom function that takes a query string and returns the indices of the top-n matches**

In [26]:
def bm25_top_hits(query, n=50):
    """Return the corpus positions of the best BM25 matches for ``query``.

    The query is lower-cased, every character that is neither an ASCII
    letter/digit nor whitespace is removed, runs of whitespace are collapsed
    to a single space, and the result is tokenized with ``word_tokenize``
    before being scored by the module-level ``bm25`` model.

    Parameters
    ----------
    query : str
        Free-text search string.
    n : int, default 50
        Maximum number of positions to return.

    Returns
    -------
    numpy.ndarray
        Integer positions into the corpus the model was fitted on, ordered
        from highest to lowest BM25 score. Empty when the query contains no
        usable characters.
    """
    # Raw-string patterns: "\s" inside a normal literal is an invalid escape.
    # Whitespace is kept by the first substitution so that tabs/newlines
    # separate words instead of being deleted and gluing neighbours together.
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned).strip()

    # Nothing searchable left: every document would score 0 and the ranking
    # would be arbitrary, so return no hits instead.
    if not processed:
        return np.empty(0, dtype=np.intp)

    # tokenizing the query string
    tokenized_query = word_tokenize(processed)

    # getting document indices based on BM25 top scores in descending order
    scores = bm25.get_scores(tokenized_query)
    top_n = np.argsort(scores, axis=0)[::-1]

    return top_n[:n]

### Checking BM25 Search Results and Exporting the Model

In [17]:
# Smoke-test the ranker with a two-word query (default `n`).
indices = bm25_top_hits("goblet fire")
indices

array([1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
       1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
        398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
         31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
       1825842,  416477], dtype=int64)

In [18]:
# Positional lookup (`iloc`, not `loc`): the indices are positions in the
# tokenized corpus, which was built from `items["mod_title"].values` in row order.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
557862,28754622,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1110909,13612286,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,,harry potter and the goblet of fire,harry potter and the goblet of fire
34126,20330423,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire


**Exporting the `tokenized_documents` object**

In [20]:
# Persist the token lists so the corpus need not be re-tokenized later.
with open('./MemoryArray/tokenized_documents.pkl', mode='wb') as pkl_out:
    pickle.dump(tokenized_documents, pkl_out)

**Exporting the BM25 Model**

In [21]:
# Persist the fitted BM25 object so it can be reloaded without refitting.
with open('./MemoryArray/bm25Model.pkl', mode='wb') as pkl_out:
    pickle.dump(bm25, pkl_out)

## Testing The Model

**Loading the `tokenized_documents` object**

In [3]:
# Restore the token lists from disk.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open('./MemoryArray/tokenized_documents.pkl', mode='rb') as pkl_in:
    tokenized_documents = pickle.load(pkl_in)

**Loading the trained BM25 model**

In [4]:
# Restore the BM25 object from disk.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open('./MemoryArray/bm25Model.pkl', mode='rb') as pkl_in:
    bm25 = pickle.load(pkl_in)

In [27]:
# Query the reloaded model (default `n`).
indices = bm25_top_hits("girl with tattoo")
indices

array([ 410676, 1962908,  491718, 2081798,   21799,   70036, 1552047,
       1815274, 1471032, 1622985, 1572400, 1338776, 1389140, 1488884,
       1252795, 1146632,  699894,  840355,  470767, 1451904, 1078530,
        534134, 1282056, 1681701,  590520, 2032562,  916543, 1933207,
        602871,  976867,  615624, 1508286,  916542, 1825180, 1342402,
       1043351, 2089004,  205984,  508398, 1150375, 2103862, 1754622,
       1380902,  509364, 1633333, 1248602,  242816,  869856,  729896,
       1483406], dtype=int64)

In [28]:
# Positional lookup (`iloc`, not `loc`): the indices are corpus positions.
# NOTE(review): assumes the pickled model was fitted on `items` in its current row order — confirm.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
458289,448343,Tattoo Girl,Tattoo Girl,,tattoo girl,tattoo girl
2192643,761697,Tattoo Girl,Tattoo Girl,,tattoo girl,tattoo girl
548937,19179374,Girl with the Cat Tattoo,Girl with the Cat Tattoo,eng,girl with the cat tattoo,girl with the cat tattoo
2325724,17826986,The Girl with the Werewolf Tattoo (The Girl wi...,The Girl with the Werewolf Tattoo (The Girl wi...,eng,the girl with the werewolf tattoo the girl wit...,the girl with the werewolf tattoo the girl wit...
24299,14459357,The Dragon with the Girl Tattoo,The Dragon with the Girl Tattoo,,the dragon with the girl tattoo,the dragon with the girl tattoo
77969,28440032,The Girl with the Dragon Tattoo,The Girl with the Dragon Tattoo,,the girl with the dragon tattoo,the girl with the dragon tattoo
1733362,22855495,The Girl with the Dragon Tattoo,The Girl with the Dragon Tattoo,eng,the girl with the dragon tattoo,the girl with the dragon tattoo
2027426,20320357,The Girl with the Thistle Tattoo,The Girl with the Thistle Tattoo,en-US,the girl with the thistle tattoo,the girl with the thistle tattoo
1642866,22718649,The Girl with the Dragon Tattoo,The Girl with the Dragon Tattoo,,the girl with the dragon tattoo,the girl with the dragon tattoo
1812657,14980637,The Dragon with the Girl Tattoo,The Dragon with the Girl Tattoo,,the dragon with the girl tattoo,the dragon with the girl tattoo
