In [None]:
!pip install sentence-transformers

In [None]:
!nvidia-smi 

In [None]:
import pandas as pd
import time
from tqdm import tqdm
import seaborn as sns
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
# Bi-encoder used to embed both the movie plots and the search queries.
# NOTE(review): the "dot-prod" checkpoint name suggests it is meant to be scored
# with inner product, which matches the IndexFlatIP index built later — confirm.
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [None]:
def fetch_movie_info(dataframe_idx):
    """Look up one movie by its positional row number in the global ``df``.

    Returns a dict holding the movie's ``'Title'`` and the first 500
    characters of its ``'Plot'``.
    """
    row = df.iloc[dataframe_idx]
    return {'Title': row['Title'], 'Plot': row['Plot'][:500]}
    
def search(query, top_k, index, model):
    """Retrieve the best-matching movies for ``query`` from a vector index.

    Args:
        query: Free-text query string.
        top_k: Number of neighbours to request from the index.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(scores, ids)`` pair, as FAISS indexes do.
        model: Encoder exposing ``encode(list_of_str)``.

    Returns:
        A list of ``fetch_movie_info`` results ordered from best to worst
        match, without duplicates.
    """
    started = time.time()
    query_vector = model.encode([query])
    hits = index.search(query_vector, top_k)
    print('>>>> Results in Total Time: {}'.format(time.time()-started))
    # The order returned by the index IS the ranking, so keep it: np.unique
    # would re-sort the hits by row id. dict.fromkeys de-duplicates while
    # preserving first-seen order.
    ranked_ids = list(dict.fromkeys(hits[1].tolist()[0]))
    # FAISS pads the id list with -1 when it finds fewer than top_k hits;
    # df.iloc[-1] would silently return the last movie, so drop those.
    return [fetch_movie_info(idx) for idx in ranked_ids if idx >= 0]

In [None]:
# Load the Wikipedia movie-plots CSV; the path is relative to the notebook's
# working directory. memory_map=True asks pandas to map the file into memory.
data = pd.read_csv('../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv',memory_map=True)
# Print column names, dtypes and non-null counts.
data.info()

In [None]:
data.head()

In [None]:
import gc
# Work on an independent copy of the two columns we need, so that cleaning
# `df` later never touches (or warns about touching) the original `data` frame.
df = data[['Title','Plot']].copy()


In [None]:
# Drop rows with a missing title/plot, then rows whose plot text repeats.
# Rebinding (instead of inplace=True) avoids pandas' SettingWithCopyWarning on a
# frame derived from `data`, and makes the cell safe to re-run.
df = df.dropna().drop_duplicates(subset=['Plot'])

In [None]:
!pip install faiss-gpu

In [None]:
import faiss
# Embed every plot once; FAISS works on float32 matrices.
encoded_data = np.asarray(model.encode(df.Plot.tolist()), dtype='float32')
# Size the inner-product index from the embeddings themselves rather than
# hard-coding 768, so a different encoder does not silently break the cell.
index = faiss.IndexIDMap(faiss.IndexFlatIP(encoded_data.shape[1]))
# Ids are the positional row numbers of `df`, so df.iloc[id] recovers the movie.
# FAISS expects int64 ids; make the dtype explicit instead of platform-dependent.
index.add_with_ids(encoded_data, np.arange(len(df), dtype='int64'))
faiss.write_index(index, 'movie_plot.index')

In [None]:
from pprint import pprint

query="Artificial Intelligence based action movie"
results=search(query, top_k=5, index=index, model=model)

print("\n")
for result in results:
    # pprint() writes the dict itself and returns None, so wrapping it in
    # print() emitted an extra "None" line after every result.
    pprint(result)

# Re-Ranker: Cross-Encoder

The retriever has to be efficient for large document collections with millions of entries. However, it might return irrelevant candidates.

A re-ranker based on a Cross-Encoder can substantially improve the final results for the user. The query and a candidate document are passed simultaneously to a transformer network, which then outputs a single score between 0 and 1 indicating how relevant the document is for the given query.

The advantage of Cross-Encoders is their higher accuracy, as they perform attention across the query and the document.

Scoring thousands or millions of (query, document)-pairs would be rather slow. Hence, we use the retriever to create a set of e.g. 100 possible candidates which are then re-ranked by the Cross-Encoder.


In [None]:
## Load the cross-encoder used to re-rank the retrieved candidates.
## NOTE(review): an earlier comment mentioned using a fast tokenizer, but no
## tokenizer option is passed here; only max_length=512 is set.
from sentence_transformers import CrossEncoder
cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

In [None]:

def cross_score(model_inputs):
    """Score ``[query, document]`` pairs with the global ``cross_model``.

    Returns whatever ``cross_model.predict`` produces for ``model_inputs``.
    """
    return cross_model.predict(model_inputs)

# Pair the query with the plot of every retrieved candidate and score the pairs.
model_inputs = [[query, hit['Plot']] for hit in results]
scores = cross_score(model_inputs)
# Attach each score to its title and order from most to least relevant.
ranked_results = sorted(
    [{'Title': hit['Title'], 'Score': relevance} for hit, relevance in zip(results, scores)],
    key=lambda entry: entry['Score'],
    reverse=True,
)


In [None]:
print("\n")
for result in ranked_results:
    # pprint() prints the dict and returns None; wrapping it in print() added a
    # spurious "None" line after each entry.
    pprint(result)

In [None]:
!pip install bert-score

In [None]:

# check your installation
import bert_score
bert_score.__version__

In [None]:
from bert_score import score

In [None]:
# Reference text passed to BERTScore; it repeats the wording of the search query.
ref=["Artificial Intelligence based action movie"]

In [None]:
ranked_results_bert = []

if results:
    # Score every candidate in ONE call: calling score() per candidate reloads
    # the BERTScore model each time. Each candidate plot is paired with the same
    # single reference, so the reference list is repeated to match in length.
    P, R, F1 = score([cand['Plot'] for cand in results], ref * len(results), lang='en')
    ranked_results_bert = [
        {'Title': cand['Title'], 'Score': f1}
        for cand, f1 in zip(results, F1.numpy())
    ]
    

In [None]:
# Sort by BERTScore F1, highest first.
ranked_results_bert = sorted(ranked_results_bert, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results_bert:
    # pprint() prints the dict and returns None; wrapping it in print() added a
    # spurious "None" line after each entry.
    pprint(result)

In [None]:
# One column per ranking method; row i holds the title each method placed i-th.
final_results = pd.DataFrame({
    'faiss_ranking': [hit['Title'] for hit in results],
    'cross_encoder': [hit['Title'] for hit in ranked_results],
    'bert_score': [hit['Title'] for hit in ranked_results_bert],
})


In [None]:
final_results.head()

In [None]:

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
# Embed each genre label separately (one encode call per label) as float32:
# item_i -> 'action', item_j -> 'drama', item_k -> 'romance'.
item_i, item_j, item_k = (
    np.asarray(model.encode([genre]).astype('float32'))
    for genre in ('action', 'drama', 'romance')
)

In [None]:
# Stack the three genre embeddings into a single array.
# NOTE(review): this rebinds `data`, which earlier in the notebook held the
# movie DataFrame loaded from CSV.
data = np.asarray([item_i, item_j, item_k])

In [None]:
# Flatten to (n_items, embedding_dim) without hard-coding the item count or the
# 768-dimensional encoder size.
data = data.reshape((data.shape[0], -1))

In [None]:
# t-SNE's perplexity must be smaller than the number of samples; the default
# (30) is not, with only three genre vectors, and recent scikit-learn raises a
# ValueError in that case. random_state makes the projection reproducible.
tsne = TSNE(n_components=2, perplexity=min(30, len(data) - 1), random_state=1)
tsne_results = tsne.fit_transform(data)

In [None]:
tsne_results.shape

In [None]:
# 2-D coordinates for plotting. Despite the name, these come from the t-SNE
# projection in `tsne_results`.
pca_df = pd.DataFrame({
    'x': tsne_results[:, 0],
    'y': tsne_results[:, 1],
})

In [None]:
# Open an 8x8 figure; the seaborn call below draws onto pyplot's current figure.
plt.figure(figsize=(8,8))
# Scatter the projected genre vectors, one colour per genre label. The hue list
# is given in the same order the rows of pca_df were built (action, drama, romance).
sns.scatterplot(
    x="x", y="y",
    hue=['action','drama','romance'],
    palette=sns.color_palette("hls", 3),
    data=pca_df,
    legend="full",
    alpha=0.8
)

In [None]:
# Synthetic watch history: random integer scores per genre.
from random import seed
from random import randint
# Fix the seed so the generated history is the same on every run.
seed(1)

action_movie_scores=[]
drama_movie_scores=[]
romance_movie_scores=[]
# Ten rounds; within a round the draws happen in action -> drama -> romance
# order, which matters for reproducing the same sequence from the seed.
for _ in range(10):
    action_movie_score, drama_movie_score, romance_movie_score = (
        randint(0, 10),
        randint(0, 10),
        randint(0, 10),
    )
    action_movie_scores.append(action_movie_score)
    drama_movie_scores.append(drama_movie_score)
    romance_movie_scores.append(romance_movie_score)

In [None]:
# One row per generated round, one column per genre.
user_watch_hist = pd.DataFrame({
    'action_movie': action_movie_scores,
    'drama_movie': drama_movie_scores,
    'romance_movie': romance_movie_scores,
})

In [None]:

user_watch_hist.plot(kind='bar',figsize=(16,10))

In [None]:
user_watch_hist

In [None]:
import scipy.stats as stats

In [None]:
# For each genre column, standardise the whole column and keep the z-score of
# its last entry.
action_movie_zscore, drama_movie_zscore, romance_movie_zscore = (
    stats.zscore(user_watch_hist[genre_column])[-1]
    for genre_column in ('action_movie', 'drama_movie', 'romance_movie')
)

In [None]:
def weight(i, alpha=0.5, horizon=10):
    """Exponential-decay weight ``alpha * (1 - alpha) ** (horizon - i)``.

    Args:
        i: Position being weighted; ``i == horizon`` gets the largest weight
            (``alpha``) and each step further back multiplies by ``1 - alpha``.
        alpha: Smoothing factor. Defaults to 0.5, the value that was
            previously hard-coded.
        horizon: Position treated as "most recent". Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        The weight as a float.
    """
    return alpha * pow(1 - alpha, horizon - i)

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow; this does not change the result.
    """
    shifted = x - np.max(x)
    exp_shifted = np.exp(shifted)
    return exp_shifted / exp_shifted.sum()

In [None]:
# Convert the three latest z-scores (action, drama, romance) into weights that
# sum to 1.
x = np.asarray((action_movie_zscore, drama_movie_zscore, romance_movie_zscore))
genre_weights = softmax(x)

In [None]:
# User profile = genre embeddings blended by the softmax genre weights.
user_encoded_vector = (
    item_i * genre_weights[0]
    + item_j * genre_weights[1]
    + item_k * genre_weights[2]
)
# Multiplying float32 embeddings by the weights can upcast the result to
# float64; cast back to float32 so it matches the plot embeddings and can be
# used directly as a FAISS query. reshape((1, -1)) avoids hard-coding 768.
user_encoded_vector = np.asarray(user_encoded_vector, dtype='float32').reshape((1, -1))

In [None]:
from pprint import pprint

query="Artificial Intelligence based action movie"
results=search(query, top_k=5, index=index, model=model)


print("\n")
for result in results:
    # pprint() writes the dict itself and returns None, so wrapping it in
    # print() emitted an extra "None" line after every result.
    pprint(result)

In [None]:
# Plot snippets of the retrieved candidates, in retrieval order.
candidate_plots = [hit['Plot'] for hit in results]

In [None]:
from sentence_transformers import SentenceTransformer, util

# Embed the retrieved plots with the same bi-encoder used for retrieval.
embeddings = model.encode(candidate_plots)

# Cosine similarity between the user-profile vector and every candidate plot.
# Both operands are cast to float32 first: the profile vector is built from
# float64 weights, and the similarity is a matrix product that needs matching
# dtypes on both sides.
cosine_scores = util.pytorch_cos_sim(
    np.asarray(user_encoded_vector, dtype='float32'),
    np.asarray(embeddings, dtype='float32'),
)

# Pair every title with its similarity to the user profile, most similar first.
titles = [hit['Title'] for hit in results]

ranked_user_behaviour = [
    {'Title': title, 'Score': similarity}
    for title, similarity in zip(titles, cosine_scores.cpu().numpy()[0])
]
ranked_user_behaviour = sorted(ranked_user_behaviour, key=lambda x: x['Score'], reverse=True)

In [None]:
# Side-by-side comparison: row i holds the title each ranking method placed i-th.
final_results = pd.DataFrame({
    'faiss_ranking': [hit['Title'] for hit in results],
    'cross_encoder': [hit['Title'] for hit in ranked_results],
    'bert_score': [hit['Title'] for hit in ranked_results_bert],
    'user_interaction_ranking': [hit['Title'] for hit in ranked_user_behaviour],
})

In [None]:
final_results

In [None]:
def fetch_movie_title(dataframe_idx):
    """Return ``{'Title': ...}`` for the movie at positional row ``dataframe_idx`` of ``df``.

    Deliberately NOT named ``fetch_movie_info``: redefining that helper without
    the ``'Plot'`` key made ``search()`` results unusable by the re-ranking
    cells whenever they were re-run after this cell.
    """
    return {'Title': df.iloc[dataframe_idx]['Title']}


t=time.time()
# FAISS queries are float32; the user vector may have been upcast to float64.
query_vector = np.asarray(user_encoded_vector, dtype='float32')
top_k = index.search(query_vector, 20)
print('>>>> Recommendation Results in Total Time: {}'.format(time.time()-t))
# Keep the order returned by the index (best match first) — np.unique would
# re-sort by row id — and drop the -1 ids FAISS uses to pad missing hits.
top_k_ids = [idx for idx in dict.fromkeys(top_k[1].tolist()[0]) if idx >= 0]
[fetch_movie_title(idx) for idx in top_k_ids]
