In [1]:
import yaml
import os
import re

import pymupdf

from tqdm.notebook import tqdm
from os.path import join as pjoin

In [2]:
def load_all_dialogues(movies_available, superhero_dialogue_path, dialogues_joiner):
    """Read every movie file of one character and split it into dialogues.

    Args:
        movies_available: Iterable of file names located inside
            ``superhero_dialogue_path``.
        superhero_dialogue_path: Directory that holds the files.
        dialogues_joiner: Separator string on which each file's text is split.

    Returns:
        A pair of parallel lists ``(dialogues, movies_as_meta_data)``: the
        i-th entry of the second list is the file name the i-th dialogue
        was read from.
    """
    all_chunks, chunk_sources = [], []
    for movie_file in movies_available:
        transcript_path = pjoin(superhero_dialogue_path, movie_file)
        with open(transcript_path) as handle:
            chunks = handle.read().split(dialogues_joiner)

        # Record the originating file once per chunk so both lists stay aligned.
        for chunk in chunks:
            all_chunks.append(chunk)
            chunk_sources.append(movie_file)

    return all_chunks, chunk_sources

In [3]:
from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder

def load_embedding_model(embedding_model):
    """Build a FastembedDocumentEmbedder for ``embedding_model`` and warm it up.

    Args:
        embedding_model: Model identifier forwarded as the ``model`` argument.

    Returns:
        The embedder instance, after ``warm_up()`` has been called on it.
    """
    embedder = FastembedDocumentEmbedder(model=embedding_model)
    embedder.warm_up()
    return embedder

from haystack import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

def index_documents(superhero, dialogues, movies_as_meta_data, vector_store_path, document_embedder, embed_dim, max_dialogue_len=2000):
    """Embed one character's dialogues and write them to a Qdrant document store.

    Args:
        superhero: Character name; stored as the ``name`` metadata value and
            used as the ``index`` of the document store.
        dialogues: Dialogue strings to index.
        movies_as_meta_data: File names parallel to ``dialogues``; the name
            without its extension is stored as the ``movie`` metadata value.
        vector_store_path: Forwarded as ``path`` to ``QdrantDocumentStore``.
        document_embedder: Object whose ``run(documents)`` returns a mapping
            with a ``"documents"`` entry holding the embedded documents.
        embed_dim: Forwarded as ``embedding_dim`` to ``QdrantDocumentStore``.
        max_dialogue_len: Dialogues whose length is not strictly below this
            value are skipped. Defaults to 2000, the limit that was
            previously hard-coded.

    Returns:
        The ``QdrantDocumentStore`` the documents were written to.
    """
    documents = [
        Document(
            content=dialogue,
            meta={
                'name': superhero,
                # splitext drops whatever extension the file has; slicing off
                # a fixed four characters mangled names whose extension was
                # not exactly three letters long (or that had none).
                'movie': os.path.splitext(movie)[0],
            },
        )
        for dialogue, movie in zip(dialogues, movies_as_meta_data)
        if len(dialogue) < max_dialogue_len
    ]

    document_store = QdrantDocumentStore(
        path=vector_store_path,
        index=superhero,
        embedding_dim=embed_dim,
    )

    documents_with_embeddings = document_embedder.run(documents)["documents"]
    document_store.write_documents(documents_with_embeddings)

    return document_store

In [4]:
# Paths and settings shared by the cells below.
root = '..'  # base directory that every path below is joined onto
data_folder = 'data'
script_folder = 'scripts'  # not referenced by any cell visible in this notebook
dialogue_folder = 'dialogues'  # dialogues are read from root/data_folder/dialogue_folder/<superhero>
config_file = 'config.yaml'  # read from `root`; supplies DIALOGUES_JOINER and LIST_OF_SUPERHEROES
# Passed to QdrantDocumentStore as embedding_dim.
# NOTE(review): presumably the output size of `embedding_model` - confirm if the model changes.
embed_dim = 384
vector_store_name = 'QDRANT_VECTOR_DATABASE'
vector_store_path = pjoin(root, vector_store_name)  # handed to QdrantDocumentStore as `path`
embedding_model = 'BAAI/bge-small-en-v1.5'  # model name given to FastembedDocumentEmbedder

In [5]:
# Parse the YAML configuration and pull out the two settings used below.
config_path = pjoin(root, config_file)
with open(config_path, 'r') as config_stream:
    config = yaml.safe_load(config_stream)

# Separator used to split a movie file into individual dialogues.
dialogues_joiner = config['DIALOGUES_JOINER']
# Characters to build an index for; each one names a dialogue sub-folder.
list_of_superheroes = config['LIST_OF_SUPERHEROES']

In [6]:
# The embedder does not depend on the superhero, so build and warm it up once
# instead of repeating that work on every iteration.
document_embedder = load_embedding_model(embedding_model)

for superhero in list_of_superheroes:
    print(f'Generation vector-index for {superhero}')
    superhero_dialogue_path = pjoin(root, data_folder, dialogue_folder, superhero)

    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints), which
    # open() cannot read; keep regular files only and sort them so the
    # indexing order is the same on every run.
    movies_available = sorted(
        entry
        for entry in os.listdir(superhero_dialogue_path)
        if os.path.isfile(pjoin(superhero_dialogue_path, entry))
    )

    dialogues, movies_as_meta_data = load_all_dialogues(movies_available, superhero_dialogue_path, dialogues_joiner)
    index_documents(superhero, dialogues, movies_as_meta_data, vector_store_path, document_embedder, embed_dim)

Generation vector-index for Batman


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating embeddings: 100%|██████████| 1009/1009 [00:48<00:00, 20.98it/s]
1100it [00:00, 1903.71it/s]                          


Generation vector-index for Superman


Calculating embeddings: 100%|██████████| 374/374 [00:16<00:00, 22.81it/s]
400it [00:00, 1832.21it/s]                         


Generation vector-index for Wonder Woman


Calculating embeddings: 100%|██████████| 269/269 [00:15<00:00, 17.79it/s]
300it [00:00, 1878.17it/s]                         


Generation vector-index for Spiderman


Calculating embeddings: 100%|██████████| 593/593 [00:24<00:00, 23.75it/s]
600it [00:00, 1727.78it/s]                         


Generation vector-index for Ironman


Calculating embeddings: 100%|██████████| 873/873 [00:35<00:00, 24.34it/s]
900it [00:00, 1757.22it/s]                         


Generation vector-index for Captain America


Calculating embeddings: 100%|██████████| 693/693 [00:26<00:00, 26.57it/s]
700it [00:00, 1823.11it/s]                         


Generation vector-index for Black Widow


Calculating embeddings: 100%|██████████| 208/208 [00:09<00:00, 21.98it/s]
300it [00:00, 2515.09it/s]                         


Generation vector-index for Hulk


Calculating embeddings: 100%|██████████| 256/256 [00:07<00:00, 32.57it/s]
300it [00:00, 2024.07it/s]                         


Generation vector-index for Thor


Calculating embeddings: 100%|██████████| 754/754 [00:41<00:00, 18.33it/s]
800it [00:00, 1856.60it/s]                         


Generation vector-index for Deadpool


Calculating embeddings: 100%|██████████| 400/400 [00:22<00:00, 17.65it/s]
100%|██████████| 400/400 [00:00<00:00, 1773.46it/s]


Generation vector-index for Star Lord


Calculating embeddings: 100%|██████████| 417/417 [00:22<00:00, 18.53it/s]
500it [00:00, 2118.59it/s]                         


Generation vector-index for Thanos


Calculating embeddings: 100%|██████████| 114/114 [00:03<00:00, 29.60it/s]
200it [00:00, 3079.07it/s]             


Generation vector-index for Groot


Calculating embeddings: 100%|██████████| 32/32 [00:02<00:00, 15.98it/s]
100it [00:00, 5960.78it/s]            


Generation vector-index for Rocket


Calculating embeddings: 100%|██████████| 264/264 [00:16<00:00, 15.93it/s]
300it [00:00, 1864.10it/s]                         


Generation vector-index for Doctor Strange


Calculating embeddings: 100%|██████████| 105/105 [00:03<00:00, 28.62it/s]
200it [00:00, 3356.01it/s]             


Generation vector-index for Drax


Calculating embeddings: 100%|██████████| 148/148 [00:09<00:00, 14.85it/s]
200it [00:00, 2358.30it/s]             


Generation vector-index for Vision


Calculating embeddings: 100%|██████████| 3/3 [00:00<00:00, 48.95it/s]
100it [00:00, 51596.80it/s]          


Generation vector-index for Jarvis


Calculating embeddings: 100%|██████████| 60/60 [00:04<00:00, 13.43it/s]
100it [00:00, 2883.24it/s]            
