## Installing `rank_bm25`

In [None]:
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
# All the necessary imports

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd
import ast

  


In [None]:
# Retrieval hyperparameters

# Intended number of results reported per query
results_to_show = 10
# Number of top-scoring BM25 candidates kept before results are assembled
pre_prune_results = 100

In [None]:
# Mount Google Drive and read the three zip-compressed CSV files

from google.colab import drive
drive.mount("/content/gdrive")

plots = pd.read_csv(
    "/content/gdrive/MyDrive/wiki_with_revenue.csv",
    compression="zip",
    # 'to_embed' cells are parsed from their literal text form into Python objects
    converters={'to_embed': ast.literal_eval},
)
test_queries = pd.read_csv(
    "/content/gdrive/MyDrive/summaries_test.csv",
    compression="zip",
)
id_and_summary = pd.read_csv(
    "/content/gdrive/MyDrive/id_and_summary.csv",
    compression="zip",
)

Mounted at /content/gdrive


In [None]:
# Tokenizer for BM25: lower-case, split on whitespace, trim punctuation, drop stop-words
def bm25_tokenizer(text):
    """Turn `text` into the list of tokens used for BM25 indexing and querying.

    The text is lower-cased and split on whitespace; each piece has leading and
    trailing punctuation stripped, and pieces that end up empty or that are
    English stop-words are discarded.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        token
        for token in stripped
        # The emptiness check comes first so empty pieces never reach the stop-word lookup
        if token and token not in _stop_words.ENGLISH_STOP_WORDS
    ]

In [None]:
# Tokenize every `to_embed` passage, then build the BM25 index over the tokenized corpus
tokenized_corpus = [bm25_tokenizer(passage) for passage in id_and_summary['to_embed']]

bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Function to query and return top `results_to_show` with associated score

def lexical_query(query_string, bm25_corpus, id_and_summary, wiki_dataset):
  """Return the best BM25 matches for `query_string`, one entry per distinct title.

  Args:
    query_string: Free-text query; tokenized with `bm25_tokenizer`.
    bm25_corpus: Index object (e.g. a `BM25Okapi`) whose `get_scores(tokens)`
      yields one score per corpus entry.
    id_and_summary: Table whose 'MovieId' column maps a corpus position to a
      movie id.
    wiki_dataset: Table whose 'Title' and 'Release Year' columns are looked up
      by movie id.

  Returns:
    A list of at most `results_to_show` items of the form
    `((title, release_year), score)`, ordered by descending score. Titles are
    compared after stripping surrounding whitespace, and only the
    highest-scoring entry for each title is kept.
  """
  # Score against the index that was passed in, not a module-level global.
  bm25_scores = np.asarray(bm25_corpus.get_scores(bm25_tokenizer(query_string)))

  # argpartition rejects a kth beyond the array size, so clamp the candidate
  # count for corpora smaller than `pre_prune_results`.
  n_candidates = min(pre_prune_results, len(bm25_scores))
  if n_candidates <= 0:
    return []

  top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
  ranked_ids = sorted(top_n, key=lambda idx: bm25_scores[idx], reverse=True)

  results = []
  seen_titles = set()  # stripped titles already emitted
  for corpus_id in ranked_ids:
    if len(results) >= results_to_show:
      break

    movie_id = id_and_summary['MovieId'][corpus_id]
    movie_title = wiki_dataset['Title'][movie_id]
    title_key = movie_title.strip()
    if title_key in seen_titles:
      continue

    seen_titles.add(title_key)
    movie_year = wiki_dataset['Release Year'][movie_id]
    results.append(((movie_title, movie_year), bm25_scores[corpus_id]))
  return results

def measure_accuracy(query_dataset, bm25_corpus, id_and_summary, wiki_dataset):
  """Fraction of queries whose target movie title appears in the returned hits.

  Args:
    query_dataset: Table with a 'SummaryFragment' column (the query text) and a
      'MovieId' column (the id of the movie the query should retrieve).
    bm25_corpus: Passed through to `lexical_query`.
    id_and_summary: Passed through to `lexical_query`.
    wiki_dataset: Table whose 'Title' column is looked up by movie id; also
      passed through to `lexical_query`.

  Returns:
    `correct / total` as a float, where a query counts as correct when the
    stripped title of its target movie equals the stripped title of any hit.
    Returns 0.0 for an empty `query_dataset` instead of dividing by zero.
  """
  total = len(query_dataset)
  if total == 0:
    return 0.0

  correct = 0
  # Iterate the two needed columns directly rather than building a row object per query.
  for query_string, movie_id in zip(query_dataset['SummaryFragment'],
                                    query_dataset['MovieId']):
    hits = lexical_query(query_string, bm25_corpus, id_and_summary, wiki_dataset)
    expected_title = wiki_dataset['Title'][movie_id].strip()
    if any(hit[0][0].strip() == expected_title for hit in hits):
      correct += 1

  return correct / total

In [None]:
# Example free-text query against the BM25 index
query = "couple walks through paris all night"

lexical_query(
    query_string=query,
    bm25_corpus=bm25,
    id_and_summary=id_and_summary,
    wiki_dataset=plots,
)

[(('Careers', 1929), 12.915008327355313),
 (('Mischief', 1931), 12.661145357302047),
 (("Devil's Due", 2014), 12.571518863828018),
 (('Istanbul', 1957), 12.179341408524975),
 (('Witness', 1985), 12.086338509752773),
 (('The Strange One', 1957), 11.75522355853468),
 (('The Man Who Reclaimed His Head', 1934), 11.710258811839108),
 (('Target', 1985), 11.107241169779607),
 (('Four Girls in Town', 1957), 11.06112319114832),
 (('Le Week-End', 2013), 10.982310168457605)]

## Testing performance on artificial query set

In [None]:
# Preview the loaded test queries (the notebook renders a truncated table)
test_queries

Unnamed: 0,MovieId,PlotFragments,SummaryFragment,summary_length
0,25782,"It is also known that Prince Vijay, nephew of ...","the film now moves to Kiran's hotel, where ama...",18
1,17655,"One Valentine's evening a group of single, dat...",Brett (Guy Pearce) is a science journalist for...,12
2,19695,Charlie's friends won't tell him where Maggie ...,Charlie's friends won't tell him where Maggie ...,18
3,20660,"Thomas Smithers (Postlethwaite), who has made ...",Thomas Smithers (Postlethwaite) hires the famo...,21
4,22022,The plot revolves around the life of aspiring ...,plot revolves around the life of aspiring writ...,15
...,...,...,...,...
24743,11406,Bill's wishes end up causing more trouble due ...,bill's wishes end up causing more trouble due ...,12
24744,6930,Jesse (Robert Wagner) and Frank James (Jeffrey...,Jesse (Robert Wagner) and Frank James (Jeffrey...,16
24745,30788,"Aadhi feels that Dhana has changed a lot, so A...",aadhi attempts to send Dhana in jail for a mur...,16
24746,25561,Amateur boxer Ajay Mehra (Sunny Deol) is livin...,boxer is living with his brother and sister-in...,11


In [None]:
# Take the first 1000 test queries as a smaller evaluation subset
test_queries_small = test_queries.head(1000)

In [None]:
# Preview the 1000-query subset
test_queries_small

Unnamed: 0,MovieId,PlotFragments,SummaryFragment,summary_length
0,25782,"It is also known that Prince Vijay, nephew of ...","the film now moves to Kiran's hotel, where ama...",18
1,17655,"One Valentine's evening a group of single, dat...",Brett (Guy Pearce) is a science journalist for...,12
2,19695,Charlie's friends won't tell him where Maggie ...,Charlie's friends won't tell him where Maggie ...,18
3,20660,"Thomas Smithers (Postlethwaite), who has made ...",Thomas Smithers (Postlethwaite) hires the famo...,21
4,22022,The plot revolves around the life of aspiring ...,plot revolves around the life of aspiring writ...,15
...,...,...,...,...
995,22211,"After their first-born baby, Pierre (Patrick G...",Pierre (Patrick Goyette) and Élisabeth (Suzie ...,17
996,14638,He hits the road looking for refuge in his pas...,"he visits his mother, who he hasn't seen in 30...",12
997,29,Hoax rushes to scene of the crime where he dis...,the tramp runs away and Hoax gives chase .,9
998,14120,Barry Egan is a single man who owns a company ...,"he calls a phone-sex line, but the operator at...",13


In [None]:
# Time the evaluation on the 1000-query subset
%time measure_accuracy(test_queries_small, bm25, id_and_summary, plots)

CPU times: user 7min 38s, sys: 3.05 s, total: 7min 42s
Wall time: 7min 43s


0.979

In [None]:
# Evaluate on the complete test set
measure_accuracy(test_queries, bm25, id_and_summary, plots)

0.9762809115888152