In [1]:
import yaml
import os
import re
import torch
import pymupdf
import transformers
from tqdm.notebook import tqdm
from os.path import join as pjoin

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    batch_size=4,
)

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
root = '..'
data_folder = 'data'
script_folder = 'scripts'
config_file = 'config.yaml'

In [5]:
with open(pjoin(root, data_folder, config_file), 'r') as f:
    config = yaml.safe_load(f)

In [6]:
all_movie_scripts = os.listdir(pjoin(root, data_folder, script_folder))

In [7]:
def extract_text_from_pdf(pdf_path):
    pdf = pymupdf.open(pdf_path)
    text = ''
    for page in pdf:
        text += page.get_text()
    return text

In [8]:
list_of_superheroes = config['LIST_OF_SUPERHEROES']
superhero_synonyms = config['SUPERHERO_SYNONYMS']
movies_list_of_superheroes = config['MOVIES_LIST_OF_SUPERHEROES']

In [9]:
for superhero in list_of_superheroes:
    superhero_script = []
    for script in movies_list_of_superheroes[superhero]:
        script_path = pjoin(root, data_folder, script_folder, script)
        script_text = extract_text_from_pdf(script_path)
        break
    break

In [10]:
superhero_synonym = superhero_synonyms[superhero][0]
superhero_names = [superhero.upper(), superhero_synonym.upper(), superhero_synonym.replace(' ', '-').upper()]
superhero_names = superhero_names + [i.upper () for i in superhero_synonym.split()]

matches = re.finditer("|".join(superhero_names), script_text)
split_points = [match.start() for match in matches][1:] + [len(script_text)]

In [11]:
extrcated_split_script_text = [script_text[split_points[i]:split_points[i+1]] for i in range(len(split_points) - 1)]

In [None]:
def extract_dialogue_from_llm(superhero, superhero_names, extracted_text):
    system_prompt = f"You are a movie dialogue separator. From the context you are given, separate the dialogue and provide the dialogue of a charachter. You are only allowed to give final dialoige without any thing. Don't say anything else, just list the dialogue. Always start with the NAME of the character followed by a colon and then the dialogue. The extracted dialogue should always be in single line. Make sure that you extract all the dialouges of the asked charachters. It can be present in multiple lines. These are the identifier for charachter dialoges for which you need to extrcat the dialouges: {", ".join([f"'{i}'" for i in superhero_names])} The identifier are always in captital leter."
    user_prompt = f"Extract only the dialogues of {superhero.upper()} - Synonyms of {superhero.upper()} are {", ".join([f"'{i}'" for i in superhero_names])}. Now extract dialogue based on the synonyms given from the following text\n\n\n\n {extracted_text} \n\n\n\n\n Make sure you only extract dialogue of {", ".join([f"'{i}'" for i in superhero_names])}. The dialogues starts only after the name of the charachter is in capital letter."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )

    extracted_dialogue = outputs[0]["generated_text"][-1]['content'].replace('\n', ' ')
    return extracted_dialogue

In [None]:
extrcated_dialogue_full = []

for extracted_text in tqdm(extrcated_split_script_text):
    extrcated_dialogue = extract_dialogue_from_llm(superhero, superhero_names, extracted_text)
    extrcated_dialogue_full.append(extrcated_dialogue)

In [12]:
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

In [13]:
m = []
for extracted_text in tqdm(extrcated_split_script_text[:4]):
    system_prompt = f"You are a movie dialogue separator. From the context you are given, separate the dialogue and provide the dialogue of a charachter. You are only allowed to give final dialoige without any thing. Don't say anything else, just list the dialogue. Always start with the NAME of the character followed by a colon and then the dialogue. The extracted dialogue should always be in single line. Make sure that you extract all the dialouges of the asked charachters. It can be present in multiple lines. These are the identifier for charachter dialoges for which you need to extrcat the dialouges: {", ".join([f"'{i}'" for i in superhero_names])} The identifier are always in captital leter."
    user_prompt = f"Extract only the dialogues of {superhero.upper()} - Synonyms of {superhero.upper()} are {", ".join([f"'{i}'" for i in superhero_names])}. Now extract dialogue based on the synonyms given from the following text\n\n\n\n {extracted_text} \n\n\n\n\n Make sure you only extract dialogue of {", ".join([f"'{i}'" for i in superhero_names])}. The dialogues starts only after the name of the charachter is in capital letter."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    m.append(messages)

  0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
pipeline.tokenizer.pad_token_id = pipeline.tokenizer.eos_token_id

In [21]:
    outputs = pipeline(
        m,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [34]:
outputs[2][0]["generated_text"][-1]['content'].replace('\n', ' ')

"Here is the extracted dialogue of the corresponding characters:  BATMAN: 5 children. 6...  That's enough! He goes back in cuffs. Not a coffin."

In [37]:
pattern = re.compile(r'^[A-Z\s\'().,-]+$', re.MULTILINE)

In [51]:
matches = re.finditer(pattern, extrcated_split_script_text[1])
indices = [match.start() for match in matches]

In [57]:
max_indices = indices[0] if len(indices) == 1 else indices[1]

In [58]:
max_indices

89

In [59]:
print(extrcated_split_script_text[1][:max_indices])

BATMAN(CON'T)
(angry)
4 women.
He PUNCHES Zsasz in the STOMACH sending him to his knees,



In [36]:
print(extrcated_split_script_text[2])

BATMAN(CON'T)
(angrier)
5 children.
He CRACKS him across the JAW.
Zsasz LAUGHS and SMILES through CRACKED, BLOODSTAINED TEETH.
ZSASZ
6... 
(coughs)
6 children.
Batman's eyes fill with HATE.
He SNATCHES Zsasz's WINDPIPE and begins to SQUEEZE.
In a FRENZY he starts to RAIN DOWN BLOWS.
ONE -- TWO -- THREE--
His arm is GRABBED and YANKED back.
MAN(O.S)
(firmly)
That's enough!
Batman SPINS around and finds himself FACE to FACE with a
MAN, mid 20's, MASKED and DRESSED in BLACK with A DARK BLUE
BIRD SYMBOL on his CHEST. This is DICK GRAYSON, FORMER
ROBIN, now NIGHTWING.
NIGHTWING(CON'T)
(serious)
He goes back in cuffs. Not a
coffin.
8FLiX.com SCREENPLAY DATABASE 
FOR EDUCATIONAL USE ONLY
28.
Batman UNCLENCHES his FIST and looks down on Zsasz's BLOODY
and SWOLLEN face.
Batman DROPS him.
The Serial Killer still emits SOFT LAUGHTER through his
BUSTED mouth.
SIRENS can be heard.
Batman PULLS his arm free of Nightwing's GRIP and marches
toward the Batmobile.
Nightwing watches him leave and DEPARTS

In [None]:
!pip install PyPDF2==2.9

In [None]:
import PyPDF2
PyPDF2.__version__

In [None]:

# Open the PDF file
file_path = "/home/quamer23nasim38/Superhero-Character-Based-On-RAG-AI-Using-Haystack-And-Qdrant/data/spider-man-no-way-home-2021.pdf"
pdf_file = open(file_path, "rb")

# Create a PDF reader object
pdf_reader = PyPDF2.PdfFileReader(pdf_file)

# Get the total number of pages
total_pages = pdf_reader.numPages

# Function to extract text from each page and filter Spider-Man's dialogues
def extract_spiderman_dialogues(reader, total_pages):
    dialogues = []
    spider_man_lines = False
    for page_num in range(total_pages):
        page = reader.getPage(page_num)
        text = page.extract_text()
        lines = text.split("\n")
        
        for line in lines:
            if line.strip() == "SPIDER-MAN":
                spider_man_lines = True
                dialogues.append(line.strip())
            elif spider_man_lines and line.strip() and not line.startswith(" "):
                dialogues.append(line.strip())
            elif not line.strip():
                spider_man_lines = False
    
    return dialogues

# Extract Spider-Man's dialogues
spiderman_dialogues = extract_spiderman_dialogues(pdf_reader, total_pages)

# Close the PDF file
pdf_file.close()

spiderman_dialogues


In [None]:
spiderman_dialogues