## Search Engine Development => BM25 gensim

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity

import pickle

### Import Data

In [2]:
# Load the preprocessed books table (raw and cleaned title columns per book).
books_path = '../../../Datasets/Processed/books_SE_v3.parquet'
items = pd.read_parquet(books_path)

In [3]:
# (rows, columns) of the loaded books table.
items.shape

(2113033, 6)

In [4]:
# Preview the first rows to check the columns and the cleaned `mod_title` text.
items.head()

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,,wc fields a life on film,wc fields a life on film
1,1333909,Good Harbor,Good Harbor,,good harbor,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,the unschooled wizard sun wolf and starhawk 12,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,Best Friends Forever,eng,best friends forever,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,,runic astrology starcraft and timekeeping in t...,runic astrology starcraft and timekeeping in t...


In [None]:
# Intentional stop: raising here halts "Run All" so the cells below (which
# build the BM25 index over every title) are only executed deliberately.
# An explicit exception with a message states the intent, unlike a bare
# division by zero.
raise RuntimeError(
    "Intentional stop: run the cells below manually to build the BM25 index."
)

### Tokenize the `mod_title` Data to Feed into BM25

**The tokenized documents were already computed and saved, so they are loaded from a pickle file instead of being re-tokenized**

In [5]:
# Load the cached, pre-tokenized titles rather than tokenizing again.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
with open('./Resources/tokenized_documents.pkl', 'rb') as pkl_file:
    tokenized_documents = pickle.load(pkl_file)

In [6]:
# Peek at the first five token lists; each entry is one title split into tokens.
tokenized_documents[:5]

[['wc', 'fields', 'a', 'life', 'on', 'film'],
 ['good', 'harbor'],
 ['the', 'unschooled', 'wizard', 'sun', 'wolf', 'and', 'starhawk', '12'],
 ['best', 'friends', 'forever'],
 ['runic',
  'astrology',
  'starcraft',
  'and',
  'timekeeping',
  'in',
  'the',
  'northern',
  'tradition']]

### `dictionary = Dictionary` -> `bm25_model = OkapiBM25Model` -> `bm25_corpus = bm25_model` -> `bm25_index = SparseMatrixSimilarity`

In [7]:
# Build the gensim vocabulary (token <-> integer id) from the tokenized titles.
dictionary = Dictionary(tokenized_documents)

In [16]:
# Vocabulary size; also passed as `num_terms` when the similarity index is built.
len(dictionary)

313890

In [8]:
# Fit the BM25 weighting model from the dictionary's statistics; no tuning
# parameters are passed, so the library defaults apply.
bm25_model = OkapiBM25Model(dictionary=dictionary)

In [10]:
# Convert every tokenized title to bag-of-words form, then wrap the result in
# the BM25 model so each document is expressed with BM25 term weights.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_documents]
bm25_corpus = bm25_model[bow_corpus]

In [17]:
# Sanity check: one BM25 document per row of the books table.
len(bm25_corpus)

2113033

In [11]:
# Build the sparse similarity index over the BM25-weighted corpus.
# Normalization is switched off for both queries and documents, so scores are
# not rescaled to unit length (presumably to keep raw BM25 scores rather than
# cosine similarities — confirm against the gensim documentation).
bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(tokenized_documents), num_terms=len(dictionary),
                                   normalize_queries=False, normalize_documents=False)

In [15]:
# Sanity check: the index holds as many documents as the corpus.
len(bm25_index)

2113033

### Testing with Query

In [30]:
# Example free-text query used to exercise the search function.
query_str = "goblet fire"

**Function to get the indices of the top-n matching documents**

In [29]:
def bm25_top_hits(query, dictionary, n=50, index=None):
    """Return the row positions of the ``n`` best BM25 matches for ``query``.

    The query is lower-cased, stripped of everything except ASCII letters,
    digits and whitespace, tokenized, and turned into a binary bag-of-words
    vector that is scored against the BM25 similarity index.

    Parameters
    ----------
    query : str
        Free-text search string.
    dictionary : gensim.corpora.Dictionary
        Vocabulary used to build the BM25 corpus/index.
    n : int, default 50
        Maximum number of hits to return.
    index : similarity index, optional
        Object supporting ``index[query_vector] -> scores``. Defaults to the
        notebook-level ``bm25_index`` when omitted.

    Returns
    -------
    numpy.ndarray
        Positional document indices ordered by descending score (at most
        ``n`` entries). Empty when ``n <= 0`` or when no query token is in
        the dictionary, because every score would be zero and any "top hits"
        would be arbitrary.
    """
    if index is None:
        index = bm25_index  # fall back to the index built earlier in the notebook

    # Raw-string patterns avoid invalid-escape warnings. Whitespace is kept in
    # the first pass so tabs/newlines still separate words, then collapsed.
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned).strip()
    tokenized_query = word_tokenize(processed)

    query_bow = dictionary.doc2bow(tokenized_query)
    if n <= 0 or not query_bow:
        return np.empty(0, dtype=np.intp)

    # 'bnn' = binary term weight, no idf, no normalization: each known query
    # token contributes its BM25 document weight exactly once.
    tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn')
    tfidf_query = tfidf_model[query_bow]
    similarities = index[tfidf_query]

    # Document indices ordered by BM25 score, best first.
    top_n = np.argsort(similarities, axis=0)[::-1]

    return top_n[:n]


**Top matches**

In [32]:
# Run the search for the example query and keep the 30 best-scoring positions.
indices = bm25_top_hits(query=query_str, dictionary=dictionary, n=30)
indices

array([1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
       1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
        398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
         31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
       1825842,  416477], dtype=int64)

In [33]:
# `indices` are positional offsets into the corpus, so use `iloc` (position)
# rather than `loc` (label) to fetch the matching rows of the books table.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
557862,28754622,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1110909,13612286,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,,harry potter and the goblet of fire,harry potter and the goblet of fire
34126,20330423,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire


**BM25 gensim vs BM25 okapi**

In [34]:
# Reference top-30 indices for the same query (presumably pasted from the
# BM25 okapi run named in the heading — confirm the provenance).
okapi_indices = [
    1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
    1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
     398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
      31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
    1825842,  416477,
]

# Symmetric difference reports indices missing from either side, not only
# gensim hits absent from the reference. Ranking order is not compared here.
set(indices) ^ set(okapi_indices)

set()

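- The result is an empty set: for this query, every top-30 index returned by the gensim BM25 model also appears in the hard-coded reference list (presumably the BM25 okapi result). Note that only set membership is compared, not ranking order, and only for a single query.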