In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import pandas as pd
import numpy as np
import os

In [2]:
# Load the labelled (anchor, sample, label) pair tables.
# NOTE(review): `val` and `test` are both read from recom_test.csv, so the dev
# evaluator used during training scores on the test split — confirm whether a
# separate validation file was intended.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/movie_datasets/imdb/recom_train.csv",
        "datasets/movie_datasets/imdb/recom_test.csv",
        "datasets/movie_datasets/imdb/recom_test.csv",
    )
)

In [3]:
# Preview the first five training pairs (columns: anchor, sample, label).
train.head(n=5)

Unnamed: 0,anchor,sample,label
0,The movie Interstellar was released in the yea...,The movie Goodfellas was released in the year ...,0
1,The movie The Christmas Chronicles was release...,The movie Minnal Murali was released in the ye...,0
2,The movie The Babadook was released in the yea...,The movie Me Before You was released in the ye...,0
3,The movie The Host was released in the year 20...,The movie The Witch Files was released in the ...,1
4,The movie Fargo was released in the year 1996 ...,The movie The Nice Guys was released in the ye...,1


In [4]:
# (rows, columns) of the training pair table.
train.shape

(110448, 3)

In [5]:
# (rows, columns) of the validation pair table.
val.shape

(15940, 3)

In [7]:
def _to_input_examples(pairs):
    """Convert a pair table into a list of ``InputExample`` objects.

    Args:
        pairs: DataFrame with ``anchor`` and ``sample`` text columns and a
            ``label`` column.

    Returns:
        One ``InputExample`` per row, with ``texts=[anchor, sample]`` and the
        label cast to ``np.float32``.
    """
    # Zipping the columns avoids building a Series per row, which is what made
    # ``iterrows`` slow on the ~110k-row training table.
    return [
        InputExample(texts=[anchor, sample], label=np.float32(label))
        for anchor, sample, label in zip(pairs["anchor"], pairs["sample"], pairs["label"])
    ]


# Same conversion for both splits, so the two lists cannot drift apart.
train_samples = _to_input_examples(train)
dev_samples = _to_input_examples(val)

In [8]:
# Identifier of the pretrained checkpoint to fine-tune; also used (with "/"
# replaced by "-") to name the output directory passed to model.fit.
model_name="sentence-transformers/all-MiniLM-L6-v2"

In [9]:
# Token-level encoder loaded from the pretrained checkpoint.
word_embedding_model = models.Transformer(model_name)

# Mean pooling over token embeddings (CLS and max pooling disabled) produces
# the fixed-size sentence vector.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

In [10]:
# Assemble the encoder + pooling pipeline.
# device=None lets sentence-transformers pick an available accelerator and fall
# back to CPU; the previous hard-coded integer index (device=0) is not a
# documented device string and fails on machines without CUDA.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=None)

In [11]:
# Contrastive objective over (anchor, sample) pairs, driven by the pair label.
train_loss = losses.ContrastiveLoss(model=model)

# Shuffled mini-batches of 32 training pairs.
train_dataloader = DataLoader(dataset=train_samples, batch_size=32, shuffle=True)

In [12]:
# Evaluator built from the dev examples and reported under the name 'dev';
# it is passed to model.fit and run every `evaluation_steps` steps.
# NOTE(review): the labels here are 0/1 while this evaluator scores embedding
# similarity against the labels — confirm this is the intended metric.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='dev')

In [13]:
# Warm up the learning rate over the first 10% of all optimisation steps.
num_epochs = 2  # must match the `epochs` value passed to model.fit
warmup_fraction = 0.1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warmup_fraction)

In [14]:
# Root directory for the fine-tuned model. The inference section loads the
# model from "reco_output/<model-name>", so training has to save under the
# same root; the previous "./model_output" left the two sections out of sync.
output_path = "./reco_output"

In [15]:
# Save under <output_path>/<model name with "/" replaced by "-">.
model_save_dir = os.path.join(output_path, model_name.replace("/", "-"))

# Fine-tune for two epochs, running the dev evaluator every 1000 steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=2,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_dir,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3452 [00:00<?, ?it/s]

# Inference

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer,  models, util
# Register tqdm's pandas integration; the `progress_apply` call further down
# depends on this having run.
tqdm.pandas()

In [2]:
# Full movie catalogue used as the search corpus.
movie_complete = pd.read_csv(
    filepath_or_buffer="./datasets/movie_datasets/imdb/movie_complete_clean.csv"
)

# Labelled test pairs; their anchors are used as example queries below.
test = pd.read_csv(filepath_or_buffer="datasets/movie_datasets/imdb/recom_test.csv")

# SECURITY: allow_pickle=True unpickles Python objects from the file and can
# execute arbitrary code — only load this array from a trusted source.
recommendation_array = np.load(
    file="./datasets/movie_datasets/imdb/recommendation_array.npy",
    allow_pickle=True,
)

In [3]:
def format_text(details):
    """Render one movie record as the descriptive sentence fed to the encoder.

    Args:
        details: Mapping-like record (dict or DataFrame row) providing
            ``originalTitle``, ``startYear``, ``genres`` and ``runtimeMinutes``.

    Returns:
        The sentence as a string. The wording, including the double space
        before "runtime", is kept exactly as-is so that it matches the
        anchor/sample text shown in the test pairs.
    """
    template = (
        "The movie {title} was released in the year {year} "
        "is a {genres} with a  runtime of {runtime} minutes"
    )
    return template.format(
        title=details["originalTitle"],
        year=details["startYear"],
        genres=details["genres"],
        runtime=details["runtimeMinutes"],
    )


In [4]:
# Build one descriptive sentence per movie with format_text (row-wise, with a
# progress bar) and store it in a new "details" column.
movie_complete["details"]  = movie_complete.progress_apply(format_text, axis=1)

  0%|          | 0/499864 [00:00<?, ?it/s]

100%|██████████| 499864/499864 [00:04<00:00, 100040.23it/s]


In [5]:
# Load the fine-tuned model from a local directory.
# NOTE(review): verify this directory is the one the training run saved to
# (its output_path joined with the model name).
trained_model = SentenceTransformer("reco_output/sentence-transformers-all-MiniLM-L6-v2")

In [6]:
# Preview five random test pairs; the fixed seed keeps the preview identical
# across kernel restarts.
test.sample(5, random_state=42)

Unnamed: 0,anchor,sample,label
3310,The movie Doctor Sleep was released in the yea...,The movie Mother India was released in the yea...,0
9140,The movie Insidious was released in the year 2...,The movie The Long Goodbye was released in the...,0
9877,The movie The Curious Case of Benjamin Button ...,The movie The Strange Case of the Law was rele...,0
9967,The movie Midsommar was released in the year 2...,The movie Call Me by Your Name was released in...,0
9624,The movie Insidious: Chapter 3 was released in...,The movie Annabelle was released in the year 2...,1


In [7]:
# Encode every movie description once; embeddings are L2-normalised and
# returned as a tensor for the similarity search below.
movies_embeddings = trained_model.encode(
    movie_complete["details"].tolist(),
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/15621 [00:00<?, ?it/s]

In [8]:
def generate_embeddings(model, question):
    """Encode a query with the same settings used for the movie corpus.

    Args:
        model: Object exposing an ``encode`` method (here a SentenceTransformer).
        question: Text, or list of texts, to embed.

    Returns:
        Whatever ``model.encode`` returns when asked for normalised tensor
        embeddings with the progress bar enabled.
    """
    encode_options = {
        "convert_to_tensor": True,
        "normalize_embeddings": True,
        "show_progress_bar": True,
    }
    return model.encode(question, **encode_options)

In [9]:
# Pick one test anchor by position as the example query, show it, then embed it.
idx = 9311
anchor_text = test["anchor"].iloc[idx]
print(anchor_text)
question_embeddings = generate_embeddings(trained_model, anchor_text)

The movie Dilwale Dulhania Le Jayenge was released in the year 1995 is a Drama,Romance with a  runtime of 189 minutes


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
# Retrieve the 50 corpus entries closest to the query and print title + tconst.
# semantic_search returns one hit list per query; there is a single query here.
top_hits = util.semantic_search(question_embeddings, movies_embeddings, top_k=50)[0]
for hit in top_hits:
    corpus_pos = hit["corpus_id"]
    print(movie_complete["originalTitle"].iloc[corpus_pos], movie_complete["tconst"].iloc[corpus_pos])

Dilwale Dulhania Le Jayenge tt0112870
Afsana Dilwalon Ka tt0429541
Kabhi Khushi Kabhie Gham... tt0248126
Yeh Hai Jalwa tt0328671
Tum Se Achcha Kaun Hai tt0318956
Dil To Pagal Hai tt0118983
Kabhie Kabhie tt0074730
Meesha Madhavan tt0353725
Dil Chahta Hai tt0292490
Dil Dhadakne Do tt4110568
English Babu Desi Mem tt0136153
Dil... Akhir Dil Hai tt0384915
Dil Hai Tumhaara tt0328998
3 Idiots tt1187043
Raaj tt1874689
Pyar Ki Jeet tt0350005
Dil Hai Ki Manta Nahin tt0101733
Badhaai Ho Badhaai tt0325041
Jab Pyaar Kisise Hota Hai tt0182251
Doli Saja Ke Rakhna tt0207415
Gondya Martay Tangda tt5900134
Barfi! tt2082197
Rocket Singh: Salesman of the Year tt1434447
Money Hai Toh Honey Hai tt1126516
Dil Se.. tt0164538
Ye habe ghand tt1846445
Pelli Choopulu tt5824826
Dil Tera Aashiq tt0106725
Anwar Ka Ajab Kissa tt3164774
Kuch Naa Kaho tt0369637
Dil Ka Rishta tt0330217
Hello Memsaheb tt4773212
Hawayein tt0378025
Dilwale Kabhi Na Hare tt0400365
Dilwale Dulhaniya Le Jayenge tt2714436
Snehithan tt0357192
P

'Dil Dhadakne Do'

In [17]:
# Print the reference recommendations: take the first group in
# recommendation_array that contains the query movie's tconst and print the
# second column of each member's catalogue row.
# NOTE(review): the column is selected by position (index 1); presumably the
# title — confirm against the CSV's column order.
target_tconst = "tt0112870"
matching_group = next(
    (group for group in recommendation_array if target_tconst in group),
    None,
)
if matching_group is not None:
    for tconst in matching_group:
        matched_rows = movie_complete[movie_complete["tconst"] == tconst]
        print(matched_rows.values[0][1])

Baazigar
Dilwale Dulhania Le Jayenge
Kuch Kuch Hota Hai
Kabhi Khushi Kabhie Gham...
Devdas
Taare Zameen Par
Dil Dhadakne Do
