## Search Engine Development => BM25 gensim

### Importing Modules

In [2]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity

import pickle
import joblib

### Import Data

In [3]:
# Load the processed book-title table. The `mod_title*` columns hold the
# lower-cased, punctuation-free titles that are tokenized and indexed below.
items = pd.read_parquet('../../../Datasets/Processed/books_SE_v3.parquet')

In [4]:
items.shape

(2113033, 6)

In [5]:
items.head()

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,,wc fields a life on film,wc fields a life on film
1,1333909,Good Harbor,Good Harbor,,good harbor,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,the unschooled wizard sun wolf and starhawk 12,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,Best Friends Forever,eng,best friends forever,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,,runic astrology starcraft and timekeeping in t...,runic astrology starcraft and timekeeping in t...


In [None]:
# Deliberate circuit breaker: the division below raises ZeroDivisionError so
# that "Run All" halts here, before the tokenization / index-building cells.
# NOTE(review): presumably those cells are expensive on the full dataset and
# are meant to be run by hand when a rebuild is wanted - confirm.

10/0

### Tokenize the `mod_title` Data to Feed into BM25

**Tokenizing the `mod_title_without_series` and `mod_title` columns**

In [6]:
# Tokenize every cleaned title, one token list per row of `items`,
# separately for the series-free titles and the full titles.
title_without_series_tokenized = [
    word_tokenize(text) for text in items["mod_title_without_series"].values
]
title_tokenized = [
    word_tokenize(text) for text in items["mod_title"].values
]

In [7]:
title_without_series_tokenized[:3]

[['wc', 'fields', 'a', 'life', 'on', 'film'],
 ['good', 'harbor'],
 ['the', 'unschooled', 'wizard', 'sun', 'wolf', 'and', 'starhawk', '12']]

In [8]:
title_tokenized[:3]

[['wc', 'fields', 'a', 'life', 'on', 'film'],
 ['good', 'harbor'],
 ['the', 'unschooled', 'wizard', 'sun', 'wolf', 'and', 'starhawk', '12']]

**Exporting the tokenized documents**

In [9]:
# Persist both tokenized corpora to the current working directory.
# joblib.dump returns the list of file names it wrote, so the cell displays
# the result of the last call.
joblib.dump(title_without_series_tokenized, 'title_without_series_tokenized.joblib')
joblib.dump(title_tokenized, 'title_tokenized.joblib')

['title_tokenized.joblib']

### `dictionary = Dictionary` -> `bm25_model = OkapiBM25Model` -> `bm25_corpus = bm25_model` -> `bm25_index = SparseMatrixSimilarity`

**Creating dictionary**

In [10]:
# Build one gensim Dictionary (token <-> integer id mapping plus document
# frequencies) per tokenized title column.
#   dictionary1 -> titles without series information
#   dictionary2 -> full titles
dictionary1 = Dictionary(title_without_series_tokenized)
dictionary2 = Dictionary(title_tokenized)

In [11]:
len(dictionary1)

313890

In [12]:
len(dictionary2)

313890

**Exporting dictionary**

In [23]:
# Persist both dictionaries so queries can be converted to bag-of-words
# vectors later without rebuilding the vocabulary.
joblib.dump(dictionary1, 'title_without_series_dictionary.joblib')
joblib.dump(dictionary2, 'title_dictionary.joblib')

['title_dictionary.joblib']

**Training bm25 model**

In [13]:
# Create one BM25 weighting model per title variant from that variant's own
# dictionary statistics.
# Each model must be built from the same dictionary that later produces its
# bag-of-words vectors via `doc2bow` (dictionary1 for model 1, dictionary2 for
# model 2); otherwise term ids and document frequencies would not line up.
bm25_model1 = OkapiBM25Model(dictionary=dictionary1)
bm25_model2 = OkapiBM25Model(dictionary=dictionary2)

**Exporting bm25 model**

In [24]:
# Persist both BM25 weighting models next to the notebook.
joblib.dump(bm25_model1, 'title_without_series_bm25.joblib')
joblib.dump(bm25_model2, 'title_bm25.joblib')

['title_bm25.joblib']

In [14]:
# Turn every tokenized title into a bag-of-words vector with the matching
# dictionary, then wrap the resulting corpus with the matching BM25 model.
bm25_corpus1 = bm25_model1[
    [dictionary1.doc2bow(tokens) for tokens in title_without_series_tokenized]
]
bm25_corpus2 = bm25_model2[
    [dictionary2.doc2bow(tokens) for tokens in title_tokenized]
]

In [15]:
len(bm25_corpus1)

2113033

In [16]:
len(bm25_corpus2)

2113033

**Creating bm25 matrix**

In [17]:
# Build one sparse similarity index per BM25-weighted corpus.
# Query and document normalization are both switched off, so the BM25
# document weights are used as-is when a query is scored against the index.
bm25_index1 = SparseMatrixSimilarity(
    bm25_corpus1,
    num_docs=len(title_without_series_tokenized),
    num_terms=len(dictionary1),
    normalize_queries=False,
    normalize_documents=False,
)

bm25_index2 = SparseMatrixSimilarity(
    bm25_corpus2,
    num_docs=len(title_tokenized),
    num_terms=len(dictionary2),
    normalize_queries=False,
    normalize_documents=False,
)

In [18]:
len(bm25_index1)

2113033

In [19]:
len(bm25_index2)

2113033

**Exporting bm25 matrix**

In [25]:
# Persist both similarity indexes so searches can run without re-indexing.
joblib.dump(bm25_index1, 'title_without_series_matrix.joblib')
joblib.dump(bm25_index2, 'title_matrix.joblib')

['title_matrix.joblib']

**Creating tfidf model**

In [20]:
# Query-side weighting models; two variants per dictionary are compared in the
# "Testing with Query" section:
#   *_bnn    - SMART scheme 'bnn' (binary term frequency, no document-frequency
#              weighting, no normalization)
#   no suffix - TfidfModel with its default weighting
tfidf_model1_bnn = TfidfModel(dictionary=dictionary1, smartirs='bnn')
tfidf_model1 = TfidfModel(dictionary=dictionary1)

tfidf_model2_bnn = TfidfModel(dictionary=dictionary2, smartirs='bnn')
tfidf_model2 = TfidfModel(dictionary=dictionary2)

**Exporting tfidf model**

In [26]:
# Persist all four query-weighting models (bnn and default, per dictionary).
joblib.dump(tfidf_model1_bnn, 'title_without_series_tfidf_bnn.joblib')
joblib.dump(tfidf_model1, 'title_without_series_tfidf.joblib')

joblib.dump(tfidf_model2_bnn, 'title_tfidf_bnn.joblib')
joblib.dump(tfidf_model2, 'title_tfidf.joblib')

['title_tfidf.joblib']

### Testing with Query

**Function to get the top_n matching indices**

In [22]:
def bm25_top_hits(query, tfidf_model, bm25_index, dictionary, n=50):
    """Return the positions of the ``n`` best-scoring documents for ``query``.

    The query is lower-cased, stripped of every character that is not an
    ASCII letter, digit or space, has runs of whitespace collapsed, and is
    tokenized with ``word_tokenize``. The tokens are converted to a
    bag-of-words vector with ``dictionary.doc2bow``, weighted by
    ``tfidf_model`` and scored against ``bm25_index``.

    Parameters
    ----------
    query : str
        Free-text search string.
    tfidf_model : object
        Query weighting model; indexed as ``tfidf_model[bow]``.
    bm25_index : object
        Similarity index; indexed as ``bm25_index[vector]`` and expected to
        return one score per indexed document.
    dictionary : object
        Vocabulary providing ``doc2bow(tokens)``.
    n : int, default 50
        Maximum number of document positions to return.

    Returns
    -------
    numpy.ndarray
        Document positions ordered by descending score, at most ``n`` long.
    """
    # Raw strings: "\s" in a normal string literal is an invalid string escape
    # (warns on current Python versions); r"..." hands the regex through as-is.
    cleaned = re.sub(r"[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned)
    tokenized_query = word_tokenize(processed)

    tfidf_query = tfidf_model[dictionary.doc2bow(tokenized_query)]
    similarities = bm25_index[tfidf_query]

    # argsort is ascending, so reverse it to rank the highest scores first
    top_n = np.argsort(similarities, axis=0)[::-1]

    return top_n[:n]

In [27]:
query_str = "goblet fire"

#### `title_without_series`

**Top matches - bnn**

In [28]:
# Top-30 hits on the `title_without_series` index, with the query weighted by
# the 'bnn' model built from the same dictionary.
indices = bm25_top_hits(query=query_str, tfidf_model=tfidf_model1_bnn, bm25_index=bm25_index1, dictionary=dictionary1, n=30)
indices

array([1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
       1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
        398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
         31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
       1825842,  416477], dtype=int64)

In [29]:
# `indices` are row positions (the index was built in `items` row order), not
# the frame's index labels, so the lookup has to be positional (.iloc).
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
557862,28754622,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1110909,13612286,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,,harry potter and the goblet of fire,harry potter and the goblet of fire
34126,20330423,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire


**BM25 gensim vs BM25 okapi**

In [34]:
# Compare the current `indices` with a hard-coded reference list of 30
# document positions (presumably the top hits from the other BM25
# implementation named in the heading - TODO confirm where it came from).
# This is a one-directional set difference: it shows positions present in
# `indices` but missing from the reference list, and it ignores ranking order.
set(
    indices
    ) - set(
        [1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
       1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
        398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
         31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
       1825842,  416477]
    )

set()

- We can observe that there is no difference: every index returned here also appears in the reference list (ranking order is not compared)

**Top matches - no bnn**

In [30]:
# Same query and index as the 'bnn' run, but the query is weighted by the
# default TfidfModel instead; note that `indices` is overwritten.
indices = bm25_top_hits(query=query_str, tfidf_model=tfidf_model1, bm25_index=bm25_index1, dictionary=dictionary1, n=30)
indices

array([1670906, 1665462, 1660454,  398865, 1184607, 1898515, 2057184,
       1351653, 1594931, 2087727,   30632,  499730, 1096035,   75469,
       1137525, 1395430,  994462, 1621211,  802741, 1022937,    3501,
       1219303,  638491,  662040,   31694, 1331431, 2076674,  556778,
       1809223, 1924919], dtype=int64)

In [31]:
# Positional lookup of the hits from the default-TfidfModel query above.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1859980,305216,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
1854434,3406842,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
445120,6033848,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
1323307,9351802,The Golden Goblet,The Golden Goblet,eng,the golden goblet,the golden goblet
2120714,714807,The Golden Goblet,The Golden Goblet,eng,the golden goblet,the golden goblet
2298151,6929046,The Goblet Club,The Goblet Club,,the goblet club,the goblet club
1509695,1071182,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire


#### `title`

**Top matches - bnn**

In [32]:
# Top-30 hits on the full-`title` index, with the query weighted by the 'bnn'
# model built from dictionary2.
indices = bm25_top_hits(query=query_str, tfidf_model=tfidf_model2_bnn, bm25_index=bm25_index2, dictionary=dictionary2, n=30)
indices

array([1670906, 1594931,  499730, 1137525, 1395430,  994462, 2087727,
       1621211,   30632, 1096035,   75469, 1351653,  802741, 1898515,
        398865, 1184607, 1660454, 1665462, 2057184,  638491,  662040,
         31694, 1834823, 2065707,  624086, 2015401,  179139, 1617513,
       1825842,  416477], dtype=int64)

In [33]:
# Positional lookup of the hits from the 'bnn' query on the `title` index.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
557862,28754622,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1110909,13612286,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,,harry potter and the goblet of fire,harry potter and the goblet of fire
34126,20330423,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire


**Top matches - no bnn**

In [34]:
# Full-`title` index again, this time with the default TfidfModel weighting
# the query.
indices = bm25_top_hits(query=query_str, tfidf_model=tfidf_model2, bm25_index=bm25_index2, dictionary=dictionary2, n=30)
indices

array([1670906, 1665462, 1660454,  398865, 1184607, 1898515, 2057184,
       1351653, 1594931, 2087727,   30632,  499730, 1096035,   75469,
       1137525, 1395430,  994462, 1621211,  802741, 1022937,    3501,
       1219303,  638491,  662040,   31694, 1331431, 2076674,  556778,
       1809223, 1924919], dtype=int64)

In [35]:
# Positional lookup of the hits from the default-TfidfModel query on the
# `title` index.
results = items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
1859980,305216,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
1854434,3406842,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
445120,6033848,The Golden Goblet,The Golden Goblet,,the golden goblet,the golden goblet
1323307,9351802,The Golden Goblet,The Golden Goblet,eng,the golden goblet,the golden goblet
2120714,714807,The Golden Goblet,The Golden Goblet,eng,the golden goblet,the golden goblet
2298151,6929046,The Goblet Club,The Goblet Club,,the goblet club,the goblet club
1509695,1071182,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
