# A short and sweet modification of the SBERT tutorials 

SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. The initial work is described in the paper [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084).




In [None]:
# Install SentenceBert Library
!pip install -U sentence-transformers

In [None]:
# Import kaggle dataset
import pandas as pd
# Widen the column display so long description text is not cut off
pd.options.display.max_colwidth = 500

# Read the Netflix titles CSV from the Kaggle input directory and preview it
df = pd.read_csv(filepath_or_buffer="../input/netflix-shows/netflix_titles.csv")
df.head(n=5)

In [None]:
# Keep only the columns used for searching and for displaying results
df = df.loc[:, ['title', 'rating', 'description']]

In [None]:
# Quick check of description lens (for BERT token len)
import matplotlib.pyplot as plt

# Whitespace-delimited word count of every description
lens = df['description'].map(lambda text: len(text.split())).tolist()

# Distribution of description lengths across the corpus
plt.hist(x=lens, bins=30)
plt.show()

In [None]:
# Import library, utilities 
from sentence_transformers import SentenceTransformer, util
import torch

# Load the embedding model and move it to the GPU when one is available,
# falling back to the CPU so the notebook still runs without CUDA
embedder = SentenceTransformer('bert-base-uncased')
embedder.to('cuda' if torch.cuda.is_available() else 'cpu')
# Going a little longer for user-written synopses.
# The attribute SentenceTransformer reads is `max_seq_length`; assigning to
# `max_seq_len` only creates an unused attribute and leaves the limit unchanged.
embedder.max_seq_length = 128

In [None]:
# Feature lists, position-aligned with the corpus embeddings, used to attach
# title / rating / description to the semantic search results
titles = df['title'].tolist()
ratings = df['rating'].tolist()
stories = df['description'].tolist()

# Encode every description of the corpus into a single embedding tensor
story_embeddings = embedder.encode(stories, convert_to_tensor=True)
# Keep the corpus on the GPU when one is present; otherwise leave it where it
# is instead of failing on a hard-coded 'cuda' device
if torch.cuda.is_available():
    story_embeddings = story_embeddings.to('cuda')

# Just run the cell below, then enter a title or a short synopsis for which you would like to find similar results.

In [None]:
# Define Semantic Search Function
def _print_search_header(story_text):
    """Print the query story followed by the banner that precedes the results."""
    print("\n\n======================")
    print("\tSTORY")
    print("======================\n")
    # Break after every period so long text does not run off the cell
    print(story_text.replace('.', '.\n'))
    print("\n\n======================")
    print("    TOP RESULTS")
    print("======================\n")


def _rank_stories(query_text, top_k):
    """Return a DataFrame of the top_k corpus stories closest to query_text."""
    query_embedding = embedder.encode(query_text, convert_to_tensor=True)
    # Compare on whatever device the corpus lives on rather than assuming CUDA
    query_embedding = query_embedding.to(story_embeddings.device)

    # Cosine similarity of the query against every story, then the best top_k
    cos_scores = util.pytorch_cos_sim(query_embedding, story_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    title_list, rating_list, story_list, score_list = [], [], [], []
    # top_results holds (scores, indices); the indices locate the entries in
    # the position-aligned feature lists
    for score, idx in zip(top_results[0], top_results[1]):
        title_list.append(titles[idx])
        rating_list.append(ratings[idx])
        story_list.append(stories[idx])
        # Push the score to the CPU and convert it to a 1D array
        score_list.append(score.cpu().numpy().flatten())

    results = pd.DataFrame()
    results['Title'] = title_list
    results['Rating'] = rating_list
    results['Story'] = story_list
    results['Score'] = score_list
    return results


def semantic_search(input_data):
    """Find the stories in the corpus most similar to a title or a synopsis.

    Args:
        input_data: Either the exact title of an entry in ``titles`` or a
            free-text synopsis of at least 20 characters.

    Returns:
        A DataFrame with the columns ``Title``, ``Rating``, ``Story`` and
        ``Score`` holding up to 10 matches ordered by cosine similarity.
        For a title query the first (best) hit is dropped, since it is
        expected to be the queried title itself. Returns ``None`` after
        printing 'Title Not Found' when the input is neither a known title
        nor long enough to be treated as a synopsis.
    """
    # Never ask for more results than there are stories in the corpus
    top_k = min(10, len(story_embeddings))

    # Single scan of the title list: position of the first exact match, if any
    try:
        title_idx = titles.index(input_data)
    except ValueError:
        title_idx = None

    if title_idx is not None:
        # Read the description from the aligned list rather than slicing the
        # printed repr of a pandas Series, which depends on the index width
        description = stories[title_idx]
        results = _rank_stories(description, top_k)
        _print_search_header(description)
        # Skip the best hit: it is expected to be the queried title itself
        return results.iloc[1:, :]

    # Inputs of exactly 20 characters count as a synopsis too, so that no
    # input falls through every branch without any feedback
    if len(input_data) >= 20:
        results = _rank_stories(input_data, top_k)
        _print_search_header(input_data)
        return results

    # Too short to be a story and not a title of the dataset
    print('Title Not Found')
    return None
    

In [None]:
# Query the semantic search with a user-written synopsis (not a dataset title)
semantic_search(
    "When CIA analyst Jack Ryan stumbles upon a suspicious series of bank "
    "transfers his search for answers pulls him from the safety of his desk "
    "job and catapults him into a deadly game of cat and mouse throughout "
    "Europe and the Middle East, with a rising terrorist figurehead "
    "preparing for a massive attack against the US"
)

In [None]:
# Query with a title that is present in the dataset
semantic_search(input_data="Chappie")

In [None]:
# Query with an entry that is neither an exact title nor long enough to be a synopsis
semantic_search(input_data='chappie')