In [2]:
import re
import fitz
import spacy
import nltk
from nltk.corpus import stopwords
from prefect import task, flow
from tqdm import tqdm

In [3]:
def remove_timestamp(text):
    return re.sub(r'\[\d{2}:\d{2}:\d{2}\]', '', text).strip()

text = '''
No Data
'''

cleaned_text = remove_timestamp(text)
print(cleaned_text)

No Data


In [4]:
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [5]:
@task
def extract_pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in tqdm(range(doc.page_count), desc="Extracting pages"):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

@task
def clean_text(text):
    paragraphs = text.split('\n\n')
    cleaned_paragraphs = []
    for paragraph in paragraphs:
        if re.match(r'^[A-Z][a-z]+:', paragraph):
            cleaned_paragraphs.append(paragraph.strip())
        else:
            paragraph = re.sub(r'\s+', ' ', paragraph)
            paragraph = re.sub(r'([.,?!:;])(\S)', r'\1 \2', paragraph)
            paragraph = re.sub(r'\b(ca) (nt)\b', r"can't", paragraph)
            paragraph = re.sub(r'\b(do) (nt)\b', r"don't", paragraph)
            paragraph = re.sub(r'\b(wo) (nt)\b', r"won't", paragraph)
            paragraph = re.sub(r'[^\w\s.,?!:;\'-]', '', paragraph)
            cleaned_paragraphs.append(paragraph.strip())
    return '\n\n'.join(cleaned_paragraphs)

@task
def process_text(text, nlp, stop_words):
    processed_text = []
    for paragraph in text.split('\n\n'):
        if paragraph.startswith("Podcast Episode") or paragraph.startswith("Synopsis:") or "Transcript:" in paragraph:
            processed_text.append(paragraph)
        else:
            doc = nlp(paragraph)
            processed_words = []
            for token in doc:
                if token.text.lower() not in stop_words and len(token.text) > 1:
                    if token.ent_type_ in ['PERSON', 'ORG', 'GPE']:
                        processed_words.append(token.text)
                    else:
                        processed_words.append(token.lemma_.lower())
            processed_text.append(' '.join(processed_words))
    return '\n\n'.join(processed_text)

@task
def extract_metadata(text):
    episodes = re.split(r'Podcast Episode \d+:', text)[1:]
    all_episodes_data = []
    episode_num = 0
    
    for episode_text in episodes:
        episode_num += 1
        lines = episode_text.strip().split('\n')
        episode_info = ""
        transcript = ""
        episode_info = lines[0].strip()
        transcript_start = -1
        for i, line in enumerate(lines):
            if line.startswith("Podcast Transcript:"):
                transcript_start = i
        if transcript_start != -1:
            transcript = '\n'.join(lines[transcript_start+1:]).strip()
        all_episodes_data.append({
            "episode_num" : episode_num,
            "episode_info" : episode_info,
            "transcript" : transcript
        })
    return all_episodes_data

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback
from openai import OpenAI
import os

  from tqdm.autonotebook import tqdm, trange


In [7]:
os.environ['OPENAI_API_KEY'] = 'OPENAI_API_KEY_HERE'

@task
def load_models():
    api_key = os.environ.get('OPENAI_API_KEY')
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    client = OpenAI(api_key=api_key)
    return sentence_model, client

@task
def create_chunks(all_episodes_data, chunk_size=500, overlap=100):
    chunks = []
    for episode in all_episodes_data:
        episode_text = f"Episode {episode['episode_num']}: \n{episode['transcript']}"
        words = episode_text.split()
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i+chunk_size])
            chunks.append(chunk)
    return chunks

@task
def preprocess_corpus(all_episodes_data, sentence_model):
    chunks = create_chunks(all_episodes_data)
    chunk_embeddings = sentence_model.encode(chunks)
    return chunks, chunk_embeddings

@task
def retrieve_relevant_chunks(query, chunks, chunk_embeddings, sentence_model, top_k=4):
    query_embedding = sentence_model.encode([query])
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    relevant_chunks = [chunks[i] for i in top_indices]
    return relevant_chunks

@task
def sliding_window_generation(prompt, context, client, max_length=2048, stride=1000):
    generated_text = ""
    input_text = f"{prompt}\n\n{context}"
    
    for i in range(0, len(input_text), stride):
        window = input_text[i:i+max_length]
        response = client.chat.completions.create(
            model='gpt-4o',
            messages = [
                {'role':'system', 'content': "You are a personal finance assistant designed to provide concise, accurate information based on Ramit Sethi's teachings. Your primary task is to directly answer the user's question using the provided context as general knowledge. Do not offer personalized advice or refer to specific individuals mentioned in the context. Treat the context as background information only, not as a scenario to be addressed. Focus solely on the user's explicit query. Keep your responses brief and to the point, ideally under 150 words. If the user's question cannot be fully answered using the given context, provide a clear, factual response based on general personal finance principles without speculation or elaboration. If you have multiple answers you would like to tell the user, condense them into a single, concise, logical answer for the user."},
                {'role':'user', 'content':window}
            ],
            max_tokens = 512
        )
        generated_text += response.choices[0].message.content.strip()
    return generated_text

@flow(name="Integrated Text Preprocessing and RAG Flow")
def integrated_preprocessing_rag_flow(pdf_path, query):
    nlp = spacy.load('en_core_web_lg')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['actually', 'basically', 'literally', 'you know', 'um', 'uh'])

    extracted_text = extract_pdf_text(pdf_path)
    
    all_episodes_data = extract_metadata(extracted_text)
    sentence_model, client = load_models()
    chunks, chunk_embeddings = preprocess_corpus(all_episodes_data, sentence_model)
    relevant_chunks = retrieve_relevant_chunks(query, chunks, chunk_embeddings, sentence_model)
    context = ' '.join(relevant_chunks)
    print(f"Context given:\n{context}")
    answer = sliding_window_generation(query, context, client)
    return all_episodes_data, answer

In [8]:
if __name__ == "__main__":
    pdf_path = "podcast_transcript_small_version.pdf"
    query = """
        What is Ramit Sethi's most critical lesson for families struggling with money or struggling to spend money?
    """
    try:
        all_episodes_data, answer = integrated_preprocessing_rag_flow(pdf_path, query)
        print("\nQuery:", query)
        print("Answer:", answer)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print(traceback.format_exc())

Extracting pages: 100%|█████████████████████████████████████████████████████████████| 323/323 [00:00<00:00, 364.66it/s]




  "class": algorithms.Blowfish,


Context given:
parents. Ramit Sethi: I think it’s pretty cool that Chantha understands Calvin’s childhood affects him. And that understanding, that’s not going to make their money problems disappear, but at least that understanding is a great sign that the two of them can take this journey together. I will say, though, that I’m not sure Calvin has connected his childhood with his financial behaviors of today, the anxiety, the constant comparing cost of everything, and the stress that is putting on their relationship. When you think about your relationship with money today, what role do you think that your family upbringing had in that relationship? Calvin: It makes me just never want to be in debt and be in that position. Ramit Sethi: Because what? Calvin: I don’t know. That is a wonder because it doesn’t help me if I was at home, but I don’t like—and that’s why I don’t feel good at the end of the month when I’m in the red. Ramit Sethi: What happens if you get into debt? Calvin: It’s h


Query: 
        What is Ramit Sethi's most critical lesson for families struggling with money or struggling to spend money?
    
Answer: Ramit Sethi's most critical lesson for families struggling with money is understanding the psychological and emotional factors that drive their financial behaviors. Recognizing how childhood experiences impact their relationship with money can help individuals understand and change their spending and saving habits. This mutual understanding can be pivotal in addressing financial challenges together.Getting into debt can lead to several challenges, including increased stress and difficulty escaping the debt cycle. Ramit Sethi suggests that effective money management can prevent such issues, emphasizing the importance of having a plan and being conscious of spending. Without careful management, arguments about finances can strain relationships, and continuous financial problems can lead to long-term negative impacts, such as strained family dynamics an