## Semantic Search

In [109]:
from sentence_transformers import SentenceTransformer, InputExample, models, datasets, losses, util, CrossEncoder
import pandas as pd
import random
import os.path
from os import path
import time
import re
import pickle
from tqdm import tqdm

In [5]:
# Input locations, relative to the notebook's working directory.
# movie_data_path: CSV of movies read into `movie_data` below.
# gen_queries_path: CSV of generated (Query, Plot) pairs used to build the training data.
movie_data_path = '../movie_data.csv'
gen_queries_path = '../Queries/generated_queries.csv'

### Load in Movie Data

In [82]:
# Read the movie table; the first CSV row is used as the column header.
movie_data = pd.read_csv(movie_data_path,header=0)
# Display the frame as the cell output.
movie_data

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


### Load in Generated Queries

In [83]:
# Generated (Query, Plot) pairs: each query is paired with the plot in the same row.
queries_plots = pd.read_csv(gen_queries_path, header=0)

# Build one InputExample per (query, plot) pair. Zipping the two columns avoids
# the per-row Series construction that DataFrame.iterrows() performs.
training_data = [
    InputExample(texts=[query, plot])
    for query, plot in zip(queries_plots['Query'], queries_plots['Plot'])
]

# Shuffle with a dedicated seeded RNG so the training order is reproducible
# across kernel restarts without touching the global `random` state.
random.Random(42).shuffle(training_data)

### Implement Search

In [153]:
class SearchUsingBert():
    """Semantic search over movie plots with a SentenceTransformer bi-encoder.

    Plots are embedded once (and cached on disk with pickle); a query is
    embedded at search time and candidates are ranked by cosine similarity.
    The retrieved candidates can optionally be re-ordered by a cross-encoder.
    """

    def __init__(self, data, training_data=None, base_model='sentence-transformers/msmarco-distilbert-base-dot-prod-v3', model_file_path='../Semantic Search/bert_models/search-bert-model', emb_file_path = '../Semantic Search/embeddings/plot_embeddings.pkl', finetune=True, num_of_epochs=3, save_model_path='../Semantic Search/bert_models/search-bert-model'):
        """Load (or build) the model and the plot embeddings.

        Args:
            data: DataFrame with at least 'Title' and 'Plot' columns
                ('Release Year' is also read when results are assembled).
            training_data: examples passed to `finetune_model` when no saved
                model exists at `model_file_path`.
            base_model: model name used to build a fresh SentenceTransformer.
            model_file_path: where the model is loaded from, and where a newly
                built model is saved.
            emb_file_path: pickle file used as the plot-embedding cache.
            finetune: whether a freshly built model is fine-tuned before saving.
            num_of_epochs: number of fine-tuning epochs.
            save_model_path: accepted for backward compatibility; it is not
                read here, new models are saved to `model_file_path` so that
                the next start-up finds them.

        If no saved model exists and `training_data` is None, a message is
        printed and initialisation stops early; `self.model` and
        `self.plots_emb` are then left as None.
        """
        print("Initialising Bert Search Engine...\n")

        self.movie_data = data
        self.titles = list(data['Title'].to_numpy())
        self.plots = self.process_plot(list(data['Plot'].to_numpy()))
        self.reranker = None
        # Name of the cross-encoder currently held in self.reranker.
        self.reranker_name = None
        # Defined up front so an early return leaves a well-defined state.
        self.model = None
        self.plots_emb = None

        # if model file exists, load in the model
        if path.exists(model_file_path):
            print("Model found, loading model from path...")
            self.model = self.load_model(model_file_path)
            print('Model loaded.\n')

        else:
            # `is None` rather than `== None`: equality on array-like training
            # data would be evaluated element-wise.
            if training_data is None:
                print("Please provide training data.")
                return
            self.model = self.finetune_model(training_data, finetune, base_model, num_of_epochs, model_file_path)

        # if embeddings file exist, load in the embeddings
        if path.exists(emb_file_path):
            print("Pre-computed embeddings for movie plots found, loading from file...")

            # NOTE: pickle.load can execute arbitrary code; only point
            # emb_file_path at cache files this notebook created itself.
            with open(emb_file_path, "rb") as file:
                emb_data = pickle.load(file)
                self.plots_emb = emb_data['embeddings']

            # A cache built from a different dataset would silently map scores
            # to the wrong movies, so flag a size mismatch.
            if len(self.plots_emb) != len(self.plots):
                print("Warning: {} cached embeddings found for {} plots; delete the embeddings file to regenerate it.".format(len(self.plots_emb), len(self.plots)))

            print("Embeddings loaded.\n")

        else:
            print("Pre-computed embeddings for movie plots not found, encoding in progress...")
            self.plots_emb = self.model.encode(self.plots)

            # Save pre-computed plot embeddings to file for easier retrieval when testing
            emb_dir = path.dirname(emb_file_path)
            if emb_dir:
                # Create the cache directory on first use instead of failing in open().
                os.makedirs(emb_dir, exist_ok=True)
            with open(emb_file_path, "wb") as file:
                pickle.dump({'embeddings': self.plots_emb}, file)

            print("Embeddings generated.\n")

        print("Initialisation of Bert Search Engine completed.\n")

    def load_model(self, model_file_path=None):
        """Return a SentenceTransformer constructed from `model_file_path`."""
        return SentenceTransformer(model_file_path)

    def process_plot(self, plots):
        """Strip bracketed spans such as '[1]' from every plot.

        Non-string entries (e.g. NaN for a missing plot) become an empty
        string instead of raising inside `re.sub`.

        Returns:
            A new list of cleaned plot strings, in the input order.
        """
        return [
            re.sub(r'\[.*?\]', '', plot) if isinstance(plot, str) else ''
            for plot in plots
        ]

    def finetune_model(self, training_data, finetune, base_model, num_of_epochs, save_model_path):
        """Build a SentenceTransformer from `base_model`, optionally fine-tune it, and save it.

        Args:
            training_data: examples fed to the NoDuplicatesDataLoader.
            finetune: when False, the model is saved without training.
            base_model: name passed to `models.Transformer`.
            num_of_epochs: number of training epochs.
            save_model_path: destination passed to `model.save`.

        Returns:
            The (possibly fine-tuned) SentenceTransformer.
        """
        print("Model not found, default model will undergo finetuning...")

        if base_model != "sentence-transformers/msmarco-distilbert-base-dot-prod-v3":
            print("User-defined model: {}".format(base_model))
        else:
            print("Default model: {}".format(base_model))

        # Create sentence transformer model
        word_embeddings = models.Transformer(base_model)
        pooling = models.Pooling(word_embeddings.get_word_embedding_dimension())
        model = SentenceTransformer(modules=[word_embeddings, pooling])

        if finetune:
            # MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
            # and trains the model so that it is suitable for semantic search
            train_loss = losses.MultipleNegativesRankingLoss(model)

            # For the MultipleNegativesRankingLoss, it is important
            # that the batch does not contain duplicate entries, i.e.
            # no two equal queries and no two equal paragraphs.
            # To ensure this, we use a special data loader
            train_dataloader = datasets.NoDuplicatesDataLoader(training_data, batch_size=8)

            # Tune the model; warm up over the first 10% of all training steps
            warmup_steps = int(len(train_dataloader) * num_of_epochs * 0.1)
            model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_of_epochs, warmup_steps=warmup_steps, show_progress_bar=True)

            print('Model finetuning completed.\n')

        # Save the model
        model.save(save_model_path)
        print('Model saved.\n')

        return model

    def cosine_sim(self, query, plots):
        """Return the first row of `util.cos_sim(query, plots)` as a Python list."""
        scores = util.cos_sim(query, plots).tolist()[0]
        return scores

    def get_k_highest(self, k, scores):
        """Return the `k` best `(index, score)` pairs, highest score first."""
        sorted_scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
        return sorted_scores[0:k]

    def retrieve_movies(self, scores):
        """Map `(index, score)` pairs to `(movie_row, score)` pairs via positional lookup."""
        return [(self.movie_data.iloc[idx], score) for idx, score in scores]

    def re_rank(self, query, scores, rerank_method='cross-encoder', encoder_name='cross-encoder/ms-marco-TinyBERT-L-6'):
        """Re-order retrieved candidates with a cross-encoder.

        Args:
            query: the raw query string.
            scores: `(index, score)` pairs from the first-stage retrieval.
            rerank_method: only 'cross-encoder' is handled; for any other
                value nothing is done and None is returned.
            encoder_name: cross-encoder to use. The encoder is cached on the
                instance and reloaded only when a different name is requested.

        Returns:
            `(index, cross_encoder_score)` pairs sorted by descending score.
        """
        if rerank_method == 'cross-encoder':
            indexes = [i for i, _ in scores]
            if not indexes:
                # Nothing to re-rank; avoid calling the encoder on an empty batch.
                return []
            inputs = [[query, self.plots[i]] for i in indexes]

            # Previously the first encoder loaded was reused for every later
            # call, silently ignoring a different `encoder_name`.
            if self.reranker is None or getattr(self, 'reranker_name', None) != encoder_name:
                self.reranker = CrossEncoder(encoder_name, max_length=512)
                self.reranker_name = encoder_name
            new_values = self.reranker.predict(inputs)

            new_scores = sorted(zip(indexes, new_values), key=lambda x: x[1], reverse=True)
            return new_scores

    def search(self, query, top_k, print_progress=True, print_results=True, re_rank=False, rerank_method=None, cross_encoder=None):
        """Return the `top_k` movies whose plots best match `query`.

        Args:
            query: free-text query.
            top_k: number of candidates to retrieve (and to re-rank, if enabled).
            print_progress: print progress messages.
            print_results: print the formatted result list.
            re_rank: whether to re-rank the retrieved candidates. Inside this
                method the name shadows nothing on `self`; the method is still
                reached through `self.re_rank`.
            rerank_method: required when `re_rank` is True; only
                'cross-encoder' triggers re-ranking, other values leave the
                first-stage order untouched.
            cross_encoder: optional cross-encoder name overriding the default.

        Returns:
            A list of dicts with keys 'title', 'plot', 'score' and 'year', or
            None when `re_rank` is True but no `rerank_method` was given.
        """
        if print_progress:
            print('Searching in progress...')
        start_time = time.time()
        query_embedding = self.model.encode([query])
        sim_scores = self.cosine_sim(query_embedding, self.plots_emb)
        highest_k = self.get_k_highest(top_k, sim_scores)
        # The reported time covers first-stage retrieval only; re-ranking
        # below is not included.
        time_taken = time.time() - start_time
        if print_progress:
            print('Search completed.\n')

        if re_rank:
            if rerank_method is None:
                print('Please specify a re-ranking method.')
                return
            if rerank_method == 'cross-encoder':
                if print_progress:
                    print('Reranking in progress...')
                if cross_encoder is None:
                    highest_k = self.re_rank(query, highest_k, rerank_method)
                else:
                    highest_k = self.re_rank(query, highest_k, rerank_method, cross_encoder)
                if print_progress:
                    print('Reranking completed.')

        movies = self.retrieve_movies(highest_k)

        results = [
            {
                'title': movie['Title'].strip(),
                'plot': movie['Plot'],
                'score': score,
                'year': movie['Release Year']
            }
            for movie, score in movies
        ]

        if print_results:
            self.print_results(query, time_taken, results)

        return results

    def print_results(self, query, time, results):
        """Print the query, the elapsed time and a numbered list of results.

        Note: the `time` parameter shadows the `time` module inside this method.
        """
        print('-----------------Search Results-----------------')
        print('Total Search Time: {}'.format(time))
        print('User Query: {}'.format(query))
        print('\nResults:')
        for rank, r in enumerate(results, start=1):
            print('{}. {} ({}) ----- Score: {}'.format(rank, r['title'], r['year'], r['score']))
        print('\n')

#### Test 1: Using Base T5 Model for Query Generation

In [112]:
# Start up the search engine with the default model and embeddings paths.
# No training data is passed, so this relies on a saved model already existing at the default path.
search_engine = SearchUsingBert(movie_data)

Initialising Bert Search Engine...

Model found, loading model from path...
Model loaded.

Pre-computed embeddings for movie plots found, loading from file...
Embeddings loaded.

Initialisation of Bert Search Engine completed.



In [113]:
# Shared example query and result count used by the search demos below.
test_query = "spider man and his girlfriend"
k = 5

In [114]:
# Top-k retrieval with the first engine, ranked by cosine similarity only (no re-ranking).
results = search_engine.search(test_query, k)

Searching in progress...
Search completed.

-----------------Search Results-----------------
Total Search Time: 0.1935739517211914
User Query: spider man and his girlfriend
Search Results:
1. In the Nick (1960) ----- Score: 0.6806657314300537
2. Spider-Man 2 (2004) ----- Score: 0.6549500226974487
3. Spider (2002) ----- Score: 0.6299688816070557
4. The Amazing Spider-Man 2 (2014) ----- Score: 0.6162744164466858
5. Spider-Man 3 (2007) ----- Score: 0.613756537437439




##### With Re-ranking (Cross-Encoder)

In [115]:
# Same query, but the k retrieved candidates are re-ordered by the default cross-encoder.
reranked_results = search_engine.search(test_query, k, re_rank=True, rerank_method='cross-encoder')

Searching in progress...


Downloading config.json: 100%|██████████| 612/612 [00:00<00:00, 221kB/s]
Downloading pytorch_model.bin: 100%|██████████| 255M/255M [00:15<00:00, 16.8MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 541/541 [00:00<00:00, 213kB/s]
Downloading vocab.txt: 100%|██████████| 226k/226k [00:01<00:00, 210kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 38.2kB/s]


Search completed.

-----------------Search Results-----------------
Total Search Time: 39.99907994270325
User Query: spider man and his girlfriend
Search Results:
1. The Amazing Spider-Man 2 (2014) ----- Score: 0.9068192839622498
2. Spider (2002) ----- Score: 0.18480351567268372
3. Spider-Man 2 (2004) ----- Score: 0.148503378033638
4. In the Nick (1960) ----- Score: 0.11605063825845718
5. Spider-Man 3 (2007) ----- Score: 0.07466521114110947




#### Test 2: Using T5 One Line Summary for Query Generation

In [154]:
# Start up a second search engine that loads the model saved under
# 'search-bert-model-2' and its own embeddings cache ('plot_embeddings_2.pkl').
search_engine_2 = SearchUsingBert(movie_data, model_file_path="../Semantic Search/bert_models/search-bert-model-2", emb_file_path="../Semantic Search/embeddings/plot_embeddings_2.pkl")

Initialising Bert Search Engine...

Model found, loading model from path...
Model loaded.

Pre-computed embeddings for movie plots found, loading from file...
Embeddings loaded.

Initialisation of Bert Search Engine completed.



In [143]:
# Top-k retrieval with the second engine (no re-ranking).
results_2 = search_engine_2.search(test_query, k)

Searching in progress...
Search completed.

-----------------Search Results-----------------
Total Search Time: 0.5797228813171387
User Query: spider man and his girlfriend
Search Results:
1. Spider-Man 2 (2004) ----- Score: 0.6860555410385132
2. Spider (2002) ----- Score: 0.6339931488037109
3. Spider-Man 3 (2007) ----- Score: 0.6079459190368652
4. Spider-Man (2002) ----- Score: 0.5855851173400879
5. Superman (1948) ----- Score: 0.5796255469322205




##### With Re-ranking (Cross-Encoder)

In [145]:
# Second engine, with the k retrieved candidates re-ordered by the default cross-encoder.
reranked_results_2 = search_engine_2.search(test_query, k, re_rank=True, rerank_method='cross-encoder')

Searching in progress...
Search completed.

Reranking in progress...
Reranking completed.
-----------------Search Results-----------------
Total Search Time: 0.34468722343444824
User Query: spider man and his girlfriend
Search Results:
1. Spider (2002) ----- Score: 0.18480351567268372
2. Spider-Man 2 (2004) ----- Score: 0.148503378033638
3. Spider-Man 3 (2007) ----- Score: 0.07466521114110947
4. Spider-Man (2002) ----- Score: 0.0067075504921376705
5. Superman (1948) ----- Score: 0.00042220266186632216




#### Test 3: No Fine-Tuning

In [126]:
# Start up a third search engine with finetune=False: if no model is saved at
# 'search-base-bert-model', the base model is built and saved without training.
search_engine_3 = SearchUsingBert(movie_data, training_data, model_file_path="../Semantic Search/bert_models/search-base-bert-model", emb_file_path="../Semantic Search/embeddings/plot_embeddings_base.pkl", finetune=False)

Initialising Bert Search Engine...

Model found, loading model from path...
Model loaded.

Pre-computed embeddings for movie plots found, loading from file...
Embeddings loaded.

Initialisation of Bert Search Engine completed.



In [127]:
# Top-k retrieval with the third (not fine-tuned) engine, no re-ranking.
results_3 = search_engine_3.search(test_query, k)

Searching in progress...
Search completed.

-----------------Search Results-----------------
Total Search Time: 0.4367396831512451
User Query: spider man and his girlfriend
Search Results:
1. Killing 'em Softly (1982) ----- Score: 0.5811159610748291
2. Kakera: A Piece of Our Life (2010) ----- Score: 0.5429856181144714
3. Log Kya Kahenge (1983) ----- Score: 0.5053036212921143
4. Spider-Man 3 (2007) ----- Score: 0.49053627252578735
5. Breakfast for Two (1937) ----- Score: 0.48401421308517456




##### With Re-ranking (Cross-Encoder)

In [128]:
# Third engine, with the k retrieved candidates re-ordered by the default cross-encoder.
reranked_results_3 = search_engine_3.search(test_query, k, re_rank=True, rerank_method='cross-encoder')

Searching in progress...
Search completed.

-----------------Search Results-----------------
Total Search Time: 25.07381510734558
User Query: spider man and his girlfriend
Search Results:
1. Spider-Man 3 (2007) ----- Score: 0.07466521114110947
2. Killing 'em Softly (1982) ----- Score: 0.03640236333012581
3. Breakfast for Two (1937) ----- Score: 0.000536297622602433
4. Kakera: A Piece of Our Life (2010) ----- Score: 0.00038331502582877874
5. Log Kya Kahenge (1983) ----- Score: 0.0002616419515106827




### Model Accuracy

In [69]:
# Paths of the two test-query CSV files used for the accuracy evaluation.
test_queries = '../Semantic Search/test_queries/test_queries.csv'
test_queries_2 = '../Semantic Search/test_queries/test_queries_2.csv'

# Number of results requested per query; test_accuracy only inspects the first one.
test_k = 1

# Ground-truth titles in movie_data row order; test_accuracy looks them up by row position.
titles = list(movie_data['Title'].to_numpy())

In [146]:
def test_accuracy(queries_plots, engine, rerank=False, top_k=None):
    """Measure how often the engine's first result is the expected movie.

    Row `idx` of `queries_plots` is scored against `titles[idx]`, so the
    query frame must be in the same row order as the global `titles` list.

    Args:
        queries_plots: DataFrame with a 'Query' column, indexed 0..n-1.
        engine: object exposing `search(query, top_k, ...)` that returns a
            list of dicts with a 'title' key.
        rerank: when True, candidates are re-ranked with the cross-encoder.
        top_k: number of candidates retrieved per query; defaults to the
            global `test_k`. With re-ranking, a value above 1 is needed for
            the cross-encoder to be able to change the first result.

    Returns:
        The fraction of queries whose first result has the expected title
        (0.0 when there are no queries).
    """
    if top_k is None:
        top_k = test_k

    total = len(queries_plots)
    correct = 0

    for idx in tqdm(range(total)):
        query = queries_plots["Query"][idx]
        # engine.search() strips the predicted title, so strip the expected
        # one too; otherwise titles with surrounding whitespace never match.
        correct_title = str(titles[idx]).strip()

        if rerank:
            hits = engine.search(query, top_k, re_rank=True, rerank_method='cross-encoder', print_progress=False, print_results=False)
        else:
            hits = engine.search(query, top_k, print_progress=False, print_results=False)

        # An empty result list counts as a miss instead of raising IndexError.
        if hits and hits[0]['title'] == correct_title:
            correct += 1

    accuracy = correct / total if total else 0.0
    print("-----------------Accuracy Results-----------------")
    print("Total: {}".format(total))
    print("Correct: {}".format(correct))
    print("Accuracy: {}".format(accuracy))

    return accuracy


#### Test 1: Using Base T5 Model for Query Generation

In [73]:
# Load both test-query sets; the first CSV row is used as the column header.
test_queries_plots = pd.read_csv(test_queries, header=0)
test_queries_plots_2 = pd.read_csv(test_queries_2, header=0)

In [78]:
# Top-1 accuracy of the first engine on the first test-query set.
acc = test_accuracy(test_queries_plots, search_engine)

# test on the other test queries dataset
acc_other = test_accuracy(test_queries_plots_2, search_engine)

100%|██████████| 34886/34886 [1:10:47<00:00,  8.21it/s]


-----------------Accuracy Results-----------------
Total: 34886
Correct: 18162
Accuracy: 0.5206099868141948


100%|██████████| 34886/34886 [1:10:42<00:00,  8.22it/s]

-----------------Accuracy Results-----------------
Total: 34886
Correct: 19746
Accuracy: 0.5660150203520037





#### Test 2: Using T5 One Line Summary for Query Generation

In [79]:
# Top-1 accuracy of the second engine on the second test-query set.
acc_2 = test_accuracy(test_queries_plots_2, search_engine_2)

# test on the other test queries dataset
acc_2_other = test_accuracy(test_queries_plots, search_engine_2)

100%|██████████| 34886/34886 [1:11:14<00:00,  8.16it/s]


-----------------Accuracy Results-----------------
Total: 34886
Correct: 29762
Accuracy: 0.8531215960557244


100%|██████████| 34886/34886 [1:11:38<00:00,  8.12it/s]

-----------------Accuracy Results-----------------
Total: 34886
Correct: 10731
Accuracy: 0.30760190334231496





#### Test 3: Using Cross-Encoder for Reranking

In [155]:
# Accuracy of the second engine with cross-encoder re-ranking enabled.
# NOTE(review): test_accuracy requests test_k (= 1) candidates, and search() only
# re-ranks the candidates it retrieved, so a single candidate cannot be reordered — confirm intent.
acc_3 = test_accuracy(test_queries_plots_2, search_engine_2, rerank=True)

100%|██████████| 34886/34886 [5:03:16<00:00,  1.92it/s]  

-----------------Accuracy Results-----------------
Total: 34886
Correct: 29483
Accuracy: 0.8451241185575876



