# Rank BM 25 Search Engine

## Configuration

In [1]:
%load_ext autotime

In [2]:
from rank_bm25 import *

import pandas as pd
pd.options.display.max_rows = 999
pd.set_option("max_colwidth", 100)
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
path = os.getcwd()+'\\clean_data\\'
print('Path: ', path)

Path:  C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\clean_data\


## Analysis

### About the Data

In [4]:
df = pd.read_csv(path + 'mcu_data_clean_all.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(df))
df.head()

Entries:  18387


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk. Is that it? Are you not allowed to talk?",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I couldn’t have called that. I would apologize, but isn’t...",Iron Man,2008


#### Get Lines for Important Characters

In [8]:
char_line = np.unique(df['character'].values, return_counts=True)
char_line = pd.DataFrame(zip(char_line[0], char_line[1]), columns=['character', 'line_count'])
imp_chars = char_line.loc[char_line['line_count']>30]['character'].unique()

## Get Datasets for Training

### Character Lines

In [9]:
lines = df.loc[df['character'].isin(imp_chars)].reset_index(drop=True)
lines.to_csv(path + 'mcu_data_lines.csv')

### Characters

In [57]:
chars = df.loc[df['character'].isin(imp_chars)].groupby(['character'])['line'].apply(lambda x: ' \n'.join(x)).reset_index()
print(len(chars))
chars['line'] = ['I am ' + char.title() + ' /n' + line for char, line in chars.values]
chars.to_csv(path + 'mcu_data_chars.csv')
chars.head()

103


Unnamed: 0,character,line
0,ADRIAN TOOMES,"I am Adrian Toomes /nThings are never gonna be the same now. I mean, look at this. You got alien..."
1,ALDRICH KILLIAN,"I am Aldrich Killian /nMr. Stark! \nOh, wow! Hey, Tony! Aldrich Killian. I'm a big fan of your w..."
2,ALEXANDER PIERCE,"I am Alexander Pierce /nFor the record, councilman, he's Algerian. I can draw a map if it'll hel..."
3,AUNT MAY,I am Aunt May /nMmm. Hey. How was school today? \nYou didn't tell me about the grant. \nYou didn...
4,BETTY BRANT,"I am Betty Brant /nRise and shine, Midtown Science and Technology. \nThanks, Jason, but I alread..."


### Movies

In [59]:
movies = pd.read_csv('raw_data/mcu_scipts.csv', index_col=0)
print('Entries: ', len(movies))
movies.head(23)
movies.to_csv(path + 'mcu_data_movies.csv')

Entries:  23


## Google Universal Sentence Encoder

In [19]:
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.metrics.pairwise import linear_kernel

import shutil

In [20]:
#Model load through URL path:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

#Create function for using model training
def embed(input):
    return model(input)

In [44]:
def train_model(data, model_name):
    
    ## Training
    Model_USE= embed(data)

    ## Save model
    exported = tf.train.Checkpoint(v=tf.Variable(Model_USE))
    exported.f = tf.function(
        lambda  x: exported.v * x,
        input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])

    dirpath = os.getcwd()+'\\trained_models\\{model_name}'.format(model_name=model_name)
    if os.path.exists(dirpath) and os.path.isdir(dirpath):
        shutil.rmtree(dirpath)

    tf.saved_model.save(exported,dirpath)

In [54]:
train_model(data=lines['line'], model_name='relevant_lines')

train_model(data=chars['line'], model_name='relevant_chars')

train_model(data=movies['script'], model_name='relevant_movies')

INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_lines\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_lines\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_chars\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_chars\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_movies\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_movies\assets


## Create Search Function

In [52]:
## Function for Document Search
def SearchDocument(query, topn=10):
    
    ## Create Query
    q =[query]
    
    # embed the query for calcluating the similarity
    Q_Train = embed(q)
    
    ## Load Model
    imported_m = tf.saved_model.load(os.getcwd()+'\\trained_models\\relevant_lines')
    loadedmodel = imported_m.v.numpy()
    
    # Calculate the Similarity
    linear_similarities = linear_kernel(Q_Train, loadedmodel).flatten()

    #Sort top 10 index with similarity score
    Top_index_doc = linear_similarities.argsort()[:-(topn + 1):-1]
     
    # sort by similarity score
    linear_similarities.sort()
    a = df.loc[df.index.isin(Top_index_doc)].reset_index(drop=True)
    a['Score'] = linear_similarities[:-(topn+1):-1]
    
    return a.sort_values(['Score'], ascending=False)[['character', 'line', 'movie', 'year', 'Score']]

In [53]:
SearchDocument('character dies')

Unnamed: 0,character,line,movie,year,Score
0,JUSTIN HAMMER,"Well, you’re talking to the right guy. Claridge Hi-Tec, semi-automatic, 9mm pistol. Too downtow...",Iron Man 2,2010,0.414177
1,PEPPER POTTS,"You know, I think I understand why you don't want to give up the suits. What am I going to compl...",Iron Man 3,2013,0.382143
2,STEVE ROGERS,Don't hold your breath.,Captain America: The Winter Soldier,2014,0.381484
3,NOVA PRIME,"All Nova pilots, interlock and form a blockade. The Dark Aster must not reach the ground.",Guardians of the Galaxy,2014,0.372609
4,MANTIS,The bodies are his children.,Guardians of the Galaxy Vol. 2,2017,0.356258
5,STEVE ROGERS,"So your body's changing. Believe me, I know how that feels.",Spider-Man: Homecoming,2017,0.355951
6,AARON DAVIS,Can I give you some advice?,Spider-Man: Homecoming,2017,0.35497
7,ODIN,The skull of Surtur? That’s a formidable weapon.,Thor: Ragnarok,2017,0.350028
8,HANK,No shit.,Ant-Man and the Wasp,2018,0.347618
9,SONNY,"You can go.But, I'm afraid your money's gonna have to remain.Let's call it compensation for my i...",Ant-Man and the Wasp,2018,0.347357
