In [1]:
import yaml
import os
import re

import pymupdf

from tqdm.notebook import tqdm
from os.path import join as pjoin

In [3]:
# Path components used throughout the notebook; they are combined later with pjoin.
root = ".."                  # parent of the current working directory
data_folder = "data"         # joined under `root`
script_folder = "scripts"    # joined under the data folder
config_file = "config.yaml"  # joined under `root`

In [4]:
# Load the project configuration. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(pjoin(root, config_file), 'r', encoding='utf-8') as f:
    # safe_load is used rather than yaml.load, so only plain YAML data is parsed.
    config = yaml.safe_load(f)

In [5]:
# Constants that shape the output written by the extraction loop.
dialogue_folder = 'dialogues'  # sub-folder of the data folder that receives the output files
max_context_length = 100  # characters of script text kept in front of each dialogue
dialogues_joiner = '\n|_/-|_/-|_/-|_/-|_/-|_/-|_/-|_/-|_/-|_/-|\n\n'  # separator between dialogues in a file
data_folder_path = pjoin(root, data_folder)

# Entries read from the loaded configuration.
movies_list_of_superheroes = config['MOVIES_LIST_OF_SUPERHEROES']  # superhero -> script file names
superhero_synonyms = config['SUPERHERO_SYNONYMS']  # superhero -> synonyms (only the first is used)
list_of_superheroes = config['LIST_OF_SUPERHEROES']  # superheroes to process

In [6]:
# Script files present on disk. os.listdir returns entries in arbitrary order, so the
# listing is sorted to make it identical from run to run.
all_movie_scripts = sorted(os.listdir(pjoin(root, data_folder, script_folder)))

In [7]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pymupdf.open``.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined with no
        separator. Empty string for a document without pages.
    """
    # Opening the document as a context manager closes it (and releases the file
    # handle) even when text extraction fails part-way through.
    with pymupdf.open(pdf_path) as pdf:
        # join avoids rebuilding the accumulated string once per page.
        return ''.join(page.get_text() for page in pdf)

def get_all_superhero_names(superhero, superhero_synonyms):
    """Build the upper-case name variants used to spot a superhero in a script.

    Only the first synonym registered for ``superhero`` is considered.

    Args:
        superhero: Key identifying the superhero in ``superhero_synonyms``.
        superhero_synonyms: Mapping from superhero to a sequence of synonym strings.

    Returns:
        list[str]: In order: the superhero key, the first synonym, the first synonym
        with spaces replaced by hyphens, then each whitespace-separated word of the
        first synonym -- all upper-cased. Duplicates are not removed.

    Raises:
        KeyError: If ``superhero`` is missing from ``superhero_synonyms``.
        IndexError: If the superhero has an empty synonym sequence.
    """
    primary_synonym = superhero_synonyms[superhero][0]
    name_variants = [
        superhero.upper(),
        primary_synonym.upper(),
        primary_synonym.replace(' ', '-').upper(),
    ]
    # Each individual word of the synonym is accepted as a name on its own.
    name_variants.extend(word.upper() for word in primary_synonym.split())
    return name_variants

def split_script_by_superhero_dialogue(script_text, superhero_names):
    """Cut a script into consecutive chunks that each start at a superhero name.

    Every occurrence of any of ``superhero_names`` in ``script_text`` is located.
    The first occurrence is skipped; each later occurrence starts a chunk that runs
    up to the next occurrence (or to the end of the script for the last one).

    Args:
        script_text: Full text of the script.
        superhero_names: Names to search for. They are matched literally and
            case-sensitively; empty names are ignored.

    Returns:
        list[str]: The chunks in script order. Empty when there are no usable names
        or fewer than two occurrences.
    """
    # Escape each name so characters such as '.' or '(' are matched literally, and
    # drop empty names: an empty alternative would match at every position.
    alternatives = [re.escape(name) for name in superhero_names if name]
    if not alternatives:
        return []

    occurrence_starts = [
        match.start() for match in re.finditer("|".join(alternatives), script_text)
    ]
    # The first occurrence never starts a chunk (presumably it is a title-page
    # mention rather than dialogue -- TODO confirm).
    split_points = occurrence_starts[1:] + [len(script_text)]
    return [
        script_text[start:end]
        for start, end in zip(split_points, split_points[1:])
    ]

def remove_extra_charachters_dialogue_from_each_split(extrcated_split_script_text, max_extra_dialogues=3):
    """Truncate each chunk at one of its upper-case lines; drop chunks without any.

    An "upper-case line" is a match of a multi-line pattern accepting only ``A-Z``,
    whitespace and the characters ``'().,-`` between a line start and a line end
    (presumably character cues or scene headings -- TODO confirm).

    For each chunk:
      * no match: the chunk is discarded;
      * exactly one match: the chunk is kept whole;
      * several matches: the chunk is cut just before the last of its first
        ``max_extra_dialogues`` matches.

    Args:
        extrcated_split_script_text: Sequence of chunk strings.
        max_extra_dialogues: How many leading matches are considered when choosing
            the cut position.

    Returns:
        list[str]: The kept chunks, truncated as described, in input order.
    """
    cue_line = re.compile(r'^[A-Z\s\'().,-]+$', re.MULTILINE)
    trimmed_chunks = []

    for chunk in extrcated_split_script_text:
        cue_starts = [found.start() for found in cue_line.finditer(chunk)]
        if not cue_starts:
            # Nothing that looks like an upper-case line: skip this chunk.
            continue

        if len(cue_starts) == 1:
            cut_at = len(chunk)
        else:
            cut_at = cue_starts[:max_extra_dialogues][-1]
        trimmed_chunks.append(chunk[:cut_at])

    return trimmed_chunks

def combine_dialogue_with_context(script_text, extrcated_split_script_text_filtered, max_context_length):
    """Prefix each dialogue with the script text that immediately precedes it.

    Args:
        script_text: Full text of the script the dialogues were taken from.
        extrcated_split_script_text_filtered: Sequence of dialogue strings.
        max_context_length: Maximum number of characters of preceding script text
            to prepend to each dialogue.

    Returns:
        list[str]: One string per dialogue, in input order: up to
        ``max_context_length`` characters of context followed by the dialogue.
        A dialogue that cannot be located in ``script_text`` is returned without
        context.
    """
    dialogue_with_context_all = []
    for dialogue in extrcated_split_script_text_filtered:
        # str.find returns the first occurrence, so a dialogue repeated verbatim
        # in the script receives the context of its first appearance.
        dialogue_idx = script_text.find(dialogue)
        if dialogue_idx == -1:
            # Not found: slicing with -1 would pull the tail of the script in as
            # bogus context, so no context is attached.
            context = ''
        else:
            # Clamp at 0: a negative start would wrap around to the end of the
            # script and lose the context of dialogues near the beginning.
            context_start = max(0, dialogue_idx - max_context_length)
            context = script_text[context_start:dialogue_idx]
        dialogue_with_context_all.append(context + dialogue)

    return dialogue_with_context_all

In [8]:
# For every superhero, turn each of their movie scripts (PDF) into a text file that
# holds the extracted dialogues, each preceded by a little script context.
for superhero in tqdm(list_of_superheroes):
    superhero_script = []  # reset per superhero; nothing in this cell appends to it
    # One output folder per superhero; it does not depend on the script processed.
    superhero_dialogue_save_path = pjoin(data_folder_path, dialogue_folder, superhero)
    for script in movies_list_of_superheroes[superhero]:
        # Replace the script's extension with '.txt'. splitext also copes with a
        # file name that has no extension, which would otherwise collapse to '.txt'.
        save_script_name = os.path.splitext(script)[0] + '.txt'
        script_path = pjoin(data_folder_path, script_folder, script)
        os.makedirs(superhero_dialogue_save_path, exist_ok=True)

        # PDF -> raw text -> chunks starting at the superhero's name -> chunks
        # truncated at a later upper-case line -> chunks with leading context.
        script_text = extract_text_from_pdf(script_path)
        superhero_names = get_all_superhero_names(superhero, superhero_synonyms)
        extrcated_split_script_text = split_script_by_superhero_dialogue(script_text, superhero_names)
        extrcated_split_script_text_filtered = remove_extra_charachters_dialogue_from_each_split(extrcated_split_script_text, max_extra_dialogues=3)
        dialogues_with_context = combine_dialogue_with_context(script_text, extrcated_split_script_text_filtered, max_context_length)
        dialogues_with_context_combined = dialogues_joiner.join(dialogues_with_context)
        # Write as UTF-8 explicitly: text extracted from PDFs can contain characters
        # that the platform's default encoding cannot represent.
        with open(pjoin(superhero_dialogue_save_path, save_script_name), 'w', encoding='utf-8') as f:
            f.write(dialogues_with_context_combined)

  0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
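# NOTE: everything below is a disabled experiment (every line is commented out): an
# LLM-based alternative that asks a transformers text-generation pipeline to pull a
# character's dialogue out of each script chunk. If it is re-enabled, be aware that the
# prompt f-strings in create_batch reuse double quotes inside their replacement fields,
# which is only valid syntax on Python 3.12+ (PEP 701).
# import torch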
# import transformers

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# def load_model_pipeline(model_id, batch_size):
#     pipeline = transformers.pipeline(
#         "text-generation",
#         model=model_id,
#         model_kwargs={"torch_dtype": torch.bfloat16},
#         device_map="auto",
#         batch_size=batch_size,
#     )

#     torch.backends.cuda.enable_mem_efficient_sdp(False)
#     torch.backends.cuda.enable_flash_sdp(False)
#     return pipeline

# def extract_dialogue_from_llm(pipeline, messages):

#     pipeline.tokenizer.pad_token_id = pipeline.tokenizer.eos_token_id
#     pipeline.tokenizer.padding_side = 'left'

#     terminators = [
#         pipeline.tokenizer.eos_token_id,
#         pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
#     ]

#     outputs = pipeline(
#         messages,
#         max_new_tokens=256,
#         eos_token_id=terminators,
#         do_sample=True,
#         temperature=1,
#         top_p=1,
#     )

#     return outputs

# def create_batch(extrcated_split_script_text, superhero, superhero_names, batch_size):
    
#     messages_batch = []
#     for extracted_text in tqdm(extrcated_split_script_text[:batch_size]):
#         system_prompt = f"You are a movie dialogue separator. From the context you are given, separate the dialogue and provide the dialogue of a charachter. You are only allowed to give final dialoige without any thing. Don't say anything else, just list the dialogue. Always start with the NAME of the character followed by a colon and then the dialogue. The extracted dialogue should always be in single line. Make sure that you extract all the dialouges of the asked charachters. It can be present in multiple lines. These are the identifier for charachter dialoges for which you need to extrcat the dialouges: {", ".join([f"'{i}'" for i in superhero_names])} The identifier are always in captital leter."
#         user_prompt = f"Extract only the dialogues of {superhero.upper()} - Synonyms of {superhero.upper()} are {", ".join([f"'{i}'" for i in superhero_names])}. Now extract dialogue based on the synonyms given from the following text\n\n\n\n {extracted_text} \n\n\n\n\n Make sure you only extract dialogue of {", ".join([f"'{i}'" for i in superhero_names])}. The dialogues starts only after the name of the charachter is in capital letter."

#         messages = [
#             {"role": "system", "content": system_prompt},
#             {"role": "user", "content": user_prompt},
#         ]
#         messages_batch.append(messages)
#     return messages_batch

# batch_size = 8
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# pipeline = load_model_pipeline(model_id, batch_size)
# messages_batch = create_batch(extrcated_split_script_text, superhero, superhero_names, batch_size)
# extrcated_dialogue = extract_dialogue_from_llm(pipeline, messages_batch)