# GOOGLE Universal Senetence Encoder Search Engine

## Configuration

In [1]:
%load_ext autotime

In [2]:
from rank_bm25 import *

import pandas as pd
pd.options.display.max_rows = 999
pd.set_option("max_colwidth", 100)
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
path = os.getcwd()+'\\clean_data\\'
print('Path: ', path)

Path:  C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\clean_data\


## Analysis

### About the Data

In [4]:
df = pd.read_csv(path + 'mcu_data_clean_all.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(df))
df.head()

Entries:  18387


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk. Is that it? Are you not allowed to talk?",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I couldn’t have called that. I would apologize, but isn’t...",Iron Man,2008


#### Get Lines for Important Characters

In [5]:
char_line = np.unique(df['character'].values, return_counts=True)
char_line = pd.DataFrame(zip(char_line[0], char_line[1]), columns=['character', 'line_count'])
imp_chars = char_line.loc[char_line['line_count']>30]['character'].unique()

## Get Datasets for Training

### Character Lines

In [6]:
lines = df.loc[df['character'].isin(imp_chars)].reset_index(drop=True)
lines.to_csv(path + 'mcu_data_lines.csv')
lines['line'] = ['{} said: {}'.format(char.title(), line) for char, line in lines[['character', 'line']].values]

### Characters

In [7]:
chars = lines.loc[lines['character'].isin(imp_chars)].groupby(['character'])['line'].apply(lambda x: ' \n'.join(x)).reset_index()
print(len(chars))
chars.to_csv(path + 'mcu_data_chars.csv')
chars.head()

103


Unnamed: 0,character,line
0,ADRIAN TOOMES,"Adrian Toomes said: Things are never gonna be the same now. I mean, look at this. You got aliens..."
1,ALDRICH KILLIAN,"Aldrich Killian said: Mr. Stark! \nAldrich Killian said: Oh, wow! Hey, Tony! Aldrich Killian. I'..."
2,ALEXANDER PIERCE,"Alexander Pierce said: For the record, councilman, he's Algerian. I can draw a map if it'll help..."
3,AUNT MAY,Aunt May said: Mmm. Hey. How was school today? \nAunt May said: You didn't tell me about the gra...
4,BETTY BRANT,"Betty Brant said: Rise and shine, Midtown Science and Technology. \nBetty Brant said: Thanks, Ja..."


### Movies

In [8]:
movies = pd.read_csv('raw_data/mcu_scipts.csv', index_col=0)
print('Entries: ', len(movies))
movies.to_csv(path + 'mcu_data_movies.csv')
movies['script'] = ['The title of this movie is: {movie} \n\n{script}'.format(movie=title, script=script) for title, script in movies[['title', 'script']].values]
movies.head(23)

Entries:  23


Unnamed: 0,title,script
0,Ant-Man,The title of this movie is: Ant-Man \n\nPrevious transcript:\n Next transcript:\n\n\n Avengers: ...
1,Ant-Man and the Wasp,The title of this movie is: Ant-Man and the Wasp \n\nThis transcript is not finished!This page d...
2,The Avengers,The title of this movie is: The Avengers \n\nThis transcript isn't tidy!This page's transcript i...
3,Avengers: Age of Ultron,The title of this movie is: Avengers: Age of Ultron \n\nPrevious transcript:\n Next transcript:\...
4,Avengers: Endgame,The title of this movie is: Avengers: Endgame \n\nPrevious transcript:\n Next transcript:\n\n\n ...
5,Avengers: Infinity War,The title of this movie is: Avengers: Infinity War \n\nPrevious transcript:\n Next transcript:\n...
6,Black Panther,The title of this movie is: Black Panther \n\nThis transcript isn't tidy!This page's transcript ...
7,Captain America: Civil War,The title of this movie is: Captain America: Civil War \n\nPrevious transcript:\n Next transcrip...
8,Captain America: The First Avenger,The title of this movie is: Captain America: The First Avenger \n\nPrevious transcript:\n Next t...
9,Captain America: The Winter Soldier,The title of this movie is: Captain America: The Winter Soldier \n\nPrevious transcript:\n Next ...


## Google Universal Sentence Encoder

In [9]:
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.metrics.pairwise import linear_kernel

import shutil

In [10]:
#Model load through URL path:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

#Create function for using model training
def embed(input):
    return model(input)

In [11]:
def train_model(data, model_name):
    
    ## Training
    Model_USE= embed(data)

    ## Save model
    exported = tf.train.Checkpoint(v=tf.Variable(Model_USE))
    exported.f = tf.function(
        lambda  x: exported.v * x,
        input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])

    dirpath = os.getcwd()+'\\trained_models\\{model_name}'.format(model_name=model_name)
    if os.path.exists(dirpath) and os.path.isdir(dirpath):
        shutil.rmtree(dirpath)

    tf.saved_model.save(exported,dirpath)

In [12]:
train_model(data=lines['line'], model_name='relevant_lines')

train_model(data=chars['line'], model_name='relevant_chars')

train_model(data=movies['script'], model_name='relevant_movies')

INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_lines\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_lines\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_chars\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_chars\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_movies\assets


INFO:tensorflow:Assets written to: C:\Users\ramos\OneDrive\Documents\SyracuseUniversity\9th_Quarter\IST736\mcu_marvel_search_engine\trained_models\relevant_movies\assets


## Create Search Function

In [13]:
## Function for Document Search
def SearchDocument(query, topn=10):
    
    ## Create Query
    q =[query]
    
    # embed the query for calcluating the similarity
    Q_Train = embed(q)
    
    ## Load Model
    imported_m = tf.saved_model.load(os.getcwd()+'\\trained_models\\relevant_lines')
    loadedmodel = imported_m.v.numpy()
    
    # Calculate the Similarity
    linear_similarities = linear_kernel(Q_Train, loadedmodel).flatten()

    #Sort top 10 index with similarity score
    Top_index_doc = linear_similarities.argsort()[:-(topn + 1):-1]
     
    # sort by similarity score
    linear_similarities.sort()
    a = df.loc[df.index.isin(Top_index_doc)].reset_index(drop=True)
    a['Score'] = linear_similarities[:-(topn+1):-1]
    
    return a.sort_values(['Score'], ascending=False)[['character', 'line', 'movie', 'year', 'Score']]

In [14]:
SearchDocument('character dies')

Unnamed: 0,character,line,movie,year,Score
0,ELON MUSK,Yeah.,Iron Man 2,2010,0.296965
1,HEIMDALL,None do. All is ready. You may pass.,Thor,2011,0.258153
2,FRIGGA,"I would not have exiled him to a world of mortals, stripped of his powers, to suffer alone. I wo...",Thor,2011,0.257656
3,STEVE ROGERS,Don't hold your breath.,Captain America: The Winter Soldier,2014,0.245847
4,DOCTOR,Let me take her.,Captain America: The Winter Soldier,2014,0.238613
5,DAVE,Did you see that?,Ant-Man,2015,0.237854
6,ULTRON,"It can, you can. You lack the materials. You're a brilliant woman, Helen. But we all have room t...",Avengers: Age of Ultron,2015,0.236187
7,KAREN,With pleasure.,Spider-Man: Homecoming,2017,0.234615
8,AARON DAVIS,Can I give you some advice?,Spider-Man: Homecoming,2017,0.227712
9,ULYSSES KLAUE,"He's right outside, why don't you ask him yourself?",Black Panther,2018,0.224722
