# Rank BM 25 Search Engine

## Configuration

In [1]:
%load_ext autotime

In [2]:
from rank_bm25 import *

import pandas as pd
pd.options.display.max_rows = 999
pd.set_option("max_colwidth", 100)
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
path = os.getcwd()+'\\clean_data\\'
print('Path: ', path)

Path:  C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\clean_data\


## Analysis

### About the Data

In [4]:
df = pd.read_csv(path + 'mcu_data_clean_all.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(df))
df.head()

Entries:  19205


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk. Is that it? Are you not allowed to talk?",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I couldn’t have called that. I would apologize, but isn’t...",Iron Man,2008


#### Get Important Characters Only

In [5]:
char_line = np.unique(df['character'].values, return_counts=True)
char_line = pd.DataFrame(zip(char_line[0], char_line[1]), columns=['character', 'line_count'])
imp_chars = char_line.loc[char_line['line_count']>30]['character'].unique()

In [6]:
df = df.loc[df['character'].isin(imp_chars)].reset_index(drop=True)

In [7]:
df.to_csv(path + 'mcu_data_important_chars.csv')

#### Google Universal Sentence Encoder

In [8]:
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.metrics.pairwise import linear_kernel

In [9]:
#Model load through URL path:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

#Create function for using model training
def embed(input):
    return model(input)

In [10]:
import shutil

## Training
Model_USE= embed(df.line)

## Save model
exported = tf.train.Checkpoint(v=tf.Variable(Model_USE))
exported.f = tf.function(
    lambda  x: exported.v * x,
    input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])

dirpath = os.getcwd()+'\\pretrained_model\\'
if os.path.exists(dirpath) and os.path.isdir(dirpath):
    shutil.rmtree(dirpath)
    
tf.saved_model.save(exported,dirpath)

INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\pretrained_model\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\pretrained_model\assets


In [11]:
## Function for Document Search
def SearchDocument(query, topn=10):
    
    ## Create Query
    q =[query]
    
    # embed the query for calcluating the similarity
    Q_Train = embed(q)
    
    ## Load Model
    imported_m = tf.saved_model.load(os.getcwd()+'\\pretrained_model\\')
    loadedmodel = imported_m.v.numpy()
    
    # Calculate the Similarity
    linear_similarities = linear_kernel(Q_Train, loadedmodel).flatten()

    #Sort top 10 index with similarity score
    Top_index_doc = linear_similarities.argsort()[:-(topn + 1):-1]
     
    # sort by similarity score
    linear_similarities.sort()
    a = df.loc[df.index.isin(Top_index_doc)].reset_index(drop=True)
    a['Score'] = linear_similarities[:-(topn+1):-1]
    
    return a.sort_values(['Score'], ascending=False)[['character', 'line', 'movie', 'year', 'Score']]

In [12]:
SearchDocument('character dies')

Unnamed: 0,character,line,movie,year,Score
0,JAMES RHODES,Vanko’s alive?,Iron Man 2,2010,0.414177
1,STEVE ROGERS,You lose someone?,Captain America: The Winter Soldier,2014,0.382143
2,GAMORA,Everything will die.,Guardians of the Galaxy,2014,0.381484
3,STEVE ROGERS,Fatalities?,Avengers: Age of Ultron,2015,0.372609
4,DOCTOR STRANGE,Bye-bye.,Thor: Ragnarok,2017,0.356258
5,THOR,Both dead.,Avengers: Infinity War,2018,0.355951
6,DRAX,"Die, blanket of death!",Avengers: Infinity War,2018,0.35497
7,T'CHALLA,Murderer!,Black Panther,2018,0.350028
8,ROCKET,Something died in here.,Avengers: Endgame,2019,0.347618
9,THOR,Noobmaster again?,Avengers: Endgame,2019,0.347357


In [13]:
SearchDocument('thanos', topn=30)

Unnamed: 0,character,line,movie,year,Score
0,PETER QUILL,Thanos?,Guardians of the Galaxy,2014,1.0
1,PETER QUILL,Drax!,Guardians of the Galaxy,2014,1.0
2,PETER QUILL,Yondu.,Guardians of the Galaxy,2014,1.0
3,DRAX,"Yes. Of course, Ronan was only a puppet. It's really Thanos I need to kill.",Guardians of the Galaxy,2014,0.827228
4,PETER QUILL,YONDU?,Guardians of the Galaxy Vol. 2,2017,0.813733
5,PETER QUILL,Gamora!!,Guardians of the Galaxy Vol. 2,2017,0.779539
6,NEBULA,I’ll help them by killing Thanos.,Guardians of the Galaxy Vol. 2,2017,0.748195
7,PETER QUILL,Yondu.,Guardians of the Galaxy Vol. 2,2017,0.743052
8,BRUCE BANNER,Thanos is coming. He's coming...,Avengers: Infinity War,2018,0.740423
9,THOR,You seem to know a great deal about Thanos.,Avengers: Infinity War,2018,0.735027


In [14]:
SearchDocument('forever', topn=20)

Unnamed: 0,character,line,movie,year,Score
0,NATASHA ROMANOFF,Right away.,Iron Man 2,2010,1.0
1,HAPPY HOGAN,Hang on.,Iron Man 2,2010,0.734504
2,PEGGY CARTER,Up.,Captain America: The First Avenger,2011,0.545618
3,FRIGGA,Always.,Thor,2011,0.544093
4,VOLSTAGG,Hang on!,Thor,2011,0.544093
5,TONY STARK,How long?,The Avengers,2012,0.541111
6,STEVE ROGERS,Hang on!,Captain America: The Winter Soldier,2014,0.541111
7,HANK PYM,Hang on.,Ant-Man,2015,0.538593
8,CLINT BARTON,Hard right... Now.,Avengers: Age of Ultron,2015,0.534134
9,NATASHA ROMANOFF,Nothing lasts forever.,Avengers: Age of Ultron,2015,0.534134
