In [1]:
!pip install sentence_transformers



In [2]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [5]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
sentences = [
    "The weather is lovely today.",
    "Today class will be over at 3:30",
    "He drove to the stadium.",
    "Tommorow there won't be class",
    "She drove to the gym."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

torch.Size([5, 5])


In [6]:
similarities

tensor([[1.0000, 0.2484, 0.1046, 0.0367, 0.0627],
        [0.2484, 1.0000, 0.0590, 0.5646, 0.1128],
        [0.1046, 0.0590, 1.0000, 0.0454, 0.4679],
        [0.0367, 0.5646, 0.0454, 1.0000, 0.1149],
        [0.0627, 0.1128, 0.4679, 0.1149, 1.0000]])

In [10]:
df = pd.read_csv("/content/titles.csv")
df_indian_movies = df[(df['production_countries'].str.contains('IN')) & (df['type']=="MOVIE" )]
df_indian_movies['description'].isnull().sum()

np.int64(0)

In [11]:
all_desc = list( df_indian_movies['description'] )
out_embs = []
for r in tqdm(all_desc):
    embd = model.encode(r)
    out_embs.append(embd)
out_embs = np.array(out_embs)
out_embs.shape

100%|██████████| 577/577 [00:04<00:00, 141.22it/s]


(577, 384)

In [17]:
query = "mughal emperor falling in love"
qy_emb = model.encode([query])
sims = []
for e in tqdm(out_embs):
    sims.append(model.similarity(qy_emb,e).numpy()[0])
df_ind = df_indian_movies[['id', 'title','description']]
df_ind['sims'] = np.array(sims)
df_ind.sort_values('sims', ascending=False)

100%|██████████| 577/577 [00:00<00:00, 12010.31it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ind['sims'] = np.array(sims)


Unnamed: 0,id,title,description,sims
412,tm29269,Jodhaa Akbar,Jodhaa Akbar is a sixteenth century love story...,0.519925
4023,tm455161,Panipat,"During 18th century India, the Marathas emerge...",0.445302
4341,tm935388,Ginny Weds Sunny,Delhi girl Ginny lives with her matchmaker mot...,0.439301
176,tm12876,Bombay,A Hindu man and a Muslim woman fall in love in...,0.434673
86,tm172482,Sohni Mahiwal,Shahjada Ijjat Beg comes to India with his car...,0.428184
...,...,...,...,...
4472,tm832463,Mundina Nildana,Three different people with three different go...,-0.057429
1444,tm208713,Gabbar Is Back,A vigilante network taking out corrupt officia...,-0.061983
573,tm130301,Gangaajal,"Posted to a small, crime-ridden town, a cop so...",-0.081905
5674,tm886193,Laabam,The president of a farmers' association wants ...,-0.105985


In [22]:
query = "husband finds his wife as a serial killer"
qy_emb = model.encode([query])
sims = []
for e in tqdm(out_embs):
    sims.append(model.similarity(qy_emb,e).numpy()[0])
df_ind = df_indian_movies[['id', 'title','description']]
df_ind['sims'] = np.array(sims)
df_ind.sort_values('sims', ascending=False)

100%|██████████| 577/577 [00:00<00:00, 9331.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ind['sims'] = np.array(sims)


Unnamed: 0,id,title,description,sims
4067,tm886214,Mrs. Serial Killer,When her husband is framed and imprisoned for ...,0.641743
5622,tm867681,Haseen Dillruba,Under investigation as a suspect in her husban...,0.561317
3922,tm918043,Raat Akeli Hai,"When a newly married landlord is murdered, a m...",0.485654
2864,tm374395,Vanjagar Ulagam,A woman is murdered and her neighbour is picke...,0.469363
5477,tm1143258,Hey! Sinamika,A wife who feels suffocated by her husband's i...,0.457079
...,...,...,...,...
1009,tm178513,Bombay Talkies,One hundred years of Hindi cinema is celebrate...,-0.045280
4490,tm475815,The Zoya Factor,An advertising agency executive Zoya Singh Sol...,-0.052230
1621,tm340927,Motu Patlu: King Of Kings,Motu and Patlu are your home-grown Laurel and ...,-0.056431
5269,tm833467,Hurdang,"Back in 1990s, an aspiring IAS starts a rebell...",-0.061509


`gradio` App

In [24]:
import gradio as gr

In [23]:
def search_similar( query ):
    qy_emb = model.encode([query])
    sims = []
    for e in tqdm(out_embs):
        sims.append(model.similarity(qy_emb,e).numpy()[0])

    df_ind = df_indian_movies[['id', 'title','description']]
    df_ind.loc[:,'sims'] = np.array(sims)
    return df_ind.sort_values('sims', ascending=False)[:5]

In [25]:
demo = gr.Interface(fn = search_similar,
                    inputs=[gr.Textbox(label="Your Review:")],
                    outputs=[gr.Dataframe(label="Most Similar Description Movies:")])
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://83a2f9f7d48e5aae45.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


