In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import pandas as pd
import numpy as np
import os

In [2]:
# Load the train / validation / test splits (feather files) in that order.
train, val, test = map(pd.read_feather, (
    "datasets/movie_datasets/imdb/train_sbert_ds_v1.feather",
    "datasets/movie_datasets/imdb/val_sbert_ds_v1.feather",
    "datasets/movie_datasets/imdb/test_sbert_ds_v1.feather",
))

In [3]:
# Preview the first rows: each row pairs a question with a movie title and a 0/1 label.
train.head()

Unnamed: 0,question,originalTitle,label
0,What is the production company of Malevich?,Queer Genius,0
1,Is the movie Konpaku inspired by actual events?,The Friendliest Railway in the World,0
2,What genre is Mukherjee Dar Bou?,Mukherjee Dar Bou,1
3,What is the language spoken in the movie `let'...,Kita tylos puse,0
4,What is the main message of The Earthing Movie?,Dansh,0


In [4]:
def build_examples(frame):
    """Convert a question/title DataFrame into a list of ``InputExample`` pairs.

    Args:
        frame: DataFrame with ``question``, ``originalTitle`` and ``label`` columns.

    Returns:
        One ``InputExample`` per row, with ``texts=[question, originalTitle]``
        and the row's label cast to ``np.float32``.
    """
    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what makes ``DataFrame.iterrows`` slow on large frames.
    return [
        InputExample(texts=[question, title], label=np.float32(label))
        for question, title, label in zip(
            frame["question"], frame["originalTitle"], frame["label"]
        )
    ]


# Same conversion for both splits; the validation split feeds the dev evaluator.
train_samples = build_examples(train)
dev_samples = build_examples(val)

In [5]:
# Identifier of the pretrained checkpoint to fine-tune; also reused to name the output folder.
model_name="sentence-transformers/all-MiniLM-L6-v2"

In [6]:
# Load the pretrained transformer identified by model_name.
word_embedding_model = models.Transformer(model_name)

# Pooling layer sized to the transformer's embedding dimension.
# Only mean pooling over tokens is enabled; CLS-token and max pooling are off.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [7]:
# Chain the transformer and pooling modules into a single sentence encoder.
# NOTE(review): device=0 presumably selects the first CUDA GPU — confirm behavior on machines without one.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=0)

In [8]:
# Shuffled mini-batches of 32 training examples.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
# Contrastive objective over the (question, title) pairs, using the 0/1 labels built above.
train_loss = losses.ContrastiveLoss(model=model)

In [9]:
# Dev-set evaluator built from the validation examples; passed to model.fit below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='dev')

In [10]:
# Warm-up length = 10% of all training steps (batches per epoch * number of epochs).
# The literal 2 is the epoch count and must stay in sync with `epochs=2` in the model.fit call.
warmup_steps = math.ceil(len(train_dataloader) * 2 * 0.1) # 10% of total training steps for warm-up

In [11]:
# Root directory for training output; the model-specific subfolder is appended in the fit call.
output_path = "./model_output"

In [13]:
# Save under output_path in a folder named after the model, with "/" swapped for "-".
model_save_dir = os.path.join(output_path, model_name.replace("/", "-"))

# Fine-tune for two epochs with the contrastive objective and the dev evaluator.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=2,
    evaluation_steps=1_000,
    warmup_steps=warmup_steps,
    output_path=model_save_dir,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5097 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5097 [00:00<?, ?it/s]

# Inference

In [1]:
import pandas as pd
from utils import augment_string
from sentence_transformers import SentenceTransformer,  models, util

In [2]:
# Reload the train / validation / test splits (feather files) in that order.
train, val, test = map(pd.read_feather, (
    "datasets/movie_datasets/imdb/train_sbert_ds_v1.feather",
    "datasets/movie_datasets/imdb/val_sbert_ds_v1.feather",
    "datasets/movie_datasets/imdb/test_sbert_ds_v1.feather",
))

In [4]:
# Fine-tuned encoder, loaded from the folder the training section writes to
# ("./model_output" joined with the model name, "/" replaced by "-").
trained_model = SentenceTransformer("model_output/sentence-transformers-all-MiniLM-L6-v2/")
# Off-the-shelf checkpoint, loaded alongside the fine-tuned one.
base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
# Retrieval corpus: every distinct movie title across the three splits.
# Concatenate first and de-duplicate once, so a title that appears in more
# than one split is listed (and can be retrieved) only once. Order follows
# first occurrence: train, then val, then test.
movie_list = (
    pd.concat([train["originalTitle"], val["originalTitle"], test["originalTitle"]])
    .unique()
    .tolist()
)

In [30]:
# Show five random test rows; no random_state is set, so the selection changes on every run.
test.sample(5)

Unnamed: 0,question,originalTitle,label
2638,What is the genre of En Kadhali Scene Podura?,En Kadhali Scene Podura,1
2161,What genre is Chhota Bheem Kung Fu Dhamaka?,The World of Esports,0
1959,"What is the number of votes for ""Mantan Manten""?",Vocalités vivantes,0
1208,What genre is the movie *Posts to the Pope*?,Posts to the Pope,1
1590,What do the friends find one night in Piola?,Piola,1


In [39]:
# Take the question at position 2638 of the test split and pass it through augment_string.
# NOTE(review): augment_string lives in the local utils module; judging by the printed
# result it perturbs characters in the text — confirm against its implementation.
# NOTE(review): iloc is positional; it matches the index label shown by test.sample only
# if the frame has a default 0..n-1 index — confirm.
question = augment_string(test.iloc[2638].question)
print(question)

What is the Xefre of En Kadhali Soent Poturf?


In [32]:
def generate_embeddings(model, question, movies):
    """Encode a question and a list of movie titles with the same model.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=..., normalize_embeddings=...)``.
        question: Query text (or list of texts) to embed.
        movies: Movie titles to embed as the search corpus.

    Returns:
        Tuple ``(question_embeddings, movies_embeddings)``, both requested as
        normalized tensors from ``model.encode``.
    """
    question_embeddings = model.encode(question, convert_to_tensor=True, normalize_embeddings=True)
    # Encode the titles passed in by the caller, not the notebook-level ``movie_list``.
    movies_embeddings = model.encode(movies, convert_to_tensor=True, normalize_embeddings=True)
    return question_embeddings, movies_embeddings

In [40]:
# Embed the augmented question and the full title corpus with the fine-tuned model.
question_embeddings, movies_embeddings = generate_embeddings(trained_model, question, movie_list)

In [41]:
# Top-5 hits for the first (and only) query; each hit carries a score and a corpus index.
top_hits = util.semantic_search(question_embeddings, movies_embeddings, top_k=5)[0]
for hit in top_hits:
    print(hit["score"], movie_list[hit["corpus_id"]])

0.7660066485404968 En Kadhali Scene Podura
0.6924241185188293 Kasaai
0.6911365389823914 Kare Kasif
0.679943323135376 Ermitage. Il potere dell'arte
0.6743850111961365 The Shaman Sorceress
