In [1]:
import re
import fitz
import spacy
import nltk
from nltk.corpus import stopwords
from prefect import task, flow
from tqdm import tqdm

In [2]:
def remove_timestamp(text):
    return re.sub(r'\[\d{2}:\d{2}:\d{2}\]', '', text).strip()

text = '''
No Data
'''

cleaned_text = remove_timestamp(text)
print(cleaned_text)

No Data


In [3]:
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [4]:
@task
def extract_pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in tqdm(range(doc.page_count), desc="Extracting pages"):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

@task
def clean_text(text):
    paragraphs = text.split('\n\n')
    cleaned_paragraphs = []
    for paragraph in paragraphs:
        if re.match(r'^[A-Z][a-z]+:', paragraph):
            cleaned_paragraphs.append(paragraph.strip())
        else:
            paragraph = re.sub(r'\s+', ' ', paragraph)
            paragraph = re.sub(r'([.,?!:;])(\S)', r'\1 \2', paragraph)
            paragraph = re.sub(r'\b(ca) (nt)\b', r"can't", paragraph)
            paragraph = re.sub(r'\b(do) (nt)\b', r"don't", paragraph)
            paragraph = re.sub(r'\b(wo) (nt)\b', r"won't", paragraph)
            paragraph = re.sub(r'[^\w\s.,?!:;\'-]', '', paragraph)
            cleaned_paragraphs.append(paragraph.strip())
    return '\n\n'.join(cleaned_paragraphs)

@task
def process_text(text, nlp, stop_words):
    processed_text = []
    for paragraph in text.split('\n\n'):
        if paragraph.startswith("Podcast Episode") or paragraph.startswith("Synopsis:") or "Transcript:" in paragraph:
            processed_text.append(paragraph)
        else:
            doc = nlp(paragraph)
            processed_words = []
            for token in doc:
                if token.text.lower() not in stop_words and len(token.text) > 1:
                    if token.ent_type_ in ['PERSON', 'ORG', 'GPE']:
                        processed_words.append(token.text)
                    else:
                        processed_words.append(token.lemma_.lower())
            processed_text.append(' '.join(processed_words))
    return '\n\n'.join(processed_text)

@task
def extract_metadata(text):
    episodes = re.split(r'Podcast Episode \d+:', text)[1:]
    all_episodes_data = []
    episode_num = 0
    
    for episode_text in episodes:
        episode_num += 1
        lines = episode_text.strip().split('\n')
        episode_info = ""
        transcript = ""
        episode_info = lines[0].strip()
        transcript_start = -1
        for i, line in enumerate(lines):
            if line.startswith("Podcast Transcript:"):
                transcript_start = i
        if transcript_start != -1:
            transcript = '\n'.join(lines[transcript_start+1:]).strip()
        all_episodes_data.append({
            "episode_num" : episode_num,
            "episode_info" : episode_info,
            "transcript" : transcript
        })
    return all_episodes_data

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback
from openai import OpenAI
import os

  from tqdm.autonotebook import tqdm, trange


In [18]:
os.environ['OPENAI_API_KEY'] = 'OPENAI_API_KEY_HERE'

@task
def load_models():
    api_key = os.environ.get('OPENAI_API_KEY')
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    client = OpenAI(api_key=api_key)
    return sentence_model, client

@task
def create_chunks(all_episodes_data, chunk_size=500, overlap=100):
    chunks = []
    for episode in all_episodes_data:
        episode_text = f"Episode {episode['episode_num']}: \n{episode['transcript']}"
        words = episode_text.split()
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i+chunk_size])
            chunks.append(chunk)
    return chunks

@task
def preprocess_corpus(all_episodes_data, sentence_model):
    chunks = create_chunks(all_episodes_data)
    chunk_embeddings = sentence_model.encode(chunks)
    return chunks, chunk_embeddings

@task
def retrieve_relevant_chunks(query, chunks, chunk_embeddings, sentence_model, top_k=4):
    query_embedding = sentence_model.encode([query])
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    relevant_chunks = [chunks[i] for i in top_indices]
    return relevant_chunks

@task
def sliding_window_generation(prompt, context, client, max_length=2048, stride=1000):
    generated_text = ""
    input_text = f"{prompt}\n\n{context}"
    
    for i in range(0, len(input_text), stride):
        window = input_text[i:i+max_length]
        response = client.chat.completions.create(
            model='gpt-4o',
            messages = [
                {'role':'system', 'content': "You are a personal finance assistant designed to provide concise, accurate information based on Ramit Sethi's teachings. Your primary task is to directly answer the user's question using the provided context as general knowledge. Do not offer personalized advice or refer to specific individuals mentioned in the context. Treat the context as background information only, not as a scenario to be addressed. Focus solely on the user's explicit query. Keep your responses brief and to the point, ideally under 150 words. If the user's question cannot be fully answered using the given context, provide a clear, factual response based on general personal finance principles without speculation or elaboration."},
                {'role':'user', 'content':window}
            ],
            max_tokens = 512
        )
        generated_text += response.choices[0].message.content.strip()
    return generated_text

@flow(name="Integrated Text Preprocessing and RAG Flow")
def integrated_preprocessing_rag_flow(pdf_path, query):
    nlp = spacy.load('en_core_web_lg')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['actually', 'basically', 'literally', 'you know', 'um', 'uh'])

    extracted_text = extract_pdf_text(pdf_path)
    
    all_episodes_data = extract_metadata(extracted_text)
    sentence_model, client = load_models()
    chunks, chunk_embeddings = preprocess_corpus(all_episodes_data, sentence_model)
    relevant_chunks = retrieve_relevant_chunks(query, chunks, chunk_embeddings, sentence_model)
    context = ' '.join(relevant_chunks)
    print(f"Context given:\n{context}")
    answer = sliding_window_generation(query, context, client)
    return all_episodes_data, answer

In [14]:
if __name__ == "__main__":
    pdf_path = r"C:\Users\sambi\Documents\School Stuff\Ramit Sethi - I will teach you to be rich.pdf"
    query = """
        What is the conscious spending plan? How does Ramit talk about it and why does it work?
    """
    try:
        all_episodes_data, answer = integrated_preprocessing_rag_flow(pdf_path, query)
        print("\nQuery:", query)
        print("Answer:", answer)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print(traceback.format_exc())

Extracting pages: 100%|█████████████████████████████████████████████████████████████| 323/323 [00:00<00:00, 361.40it/s]




Context given:
leave? Right? Ramit Sethi: Greg has a point. If you’re going to leave in a couple of years, certain renovations might not make financial sense. Now, one thing I notice is that his money lens, the way that he views the world is financial. While Laura’s money lens seems to be toward comfort, right? She wants to be comfortable in the place where she lives. She wants to make it feel like it’s theirs, not just his. When you are working on money with your partner, it’s very important to be transparent about what your money lenses are. For certain things, my money lens is comfort or security. For another part of my life, it’s speed or results, sometimes, security, and sometimes, it’s cost, right? You can Google Ramit Sethi’s money lenses to find out some of the other ways that you might think about money. What do you think the real problem is here, both of you? Laura: We have differences in what we each need to feel comfortable in the home. I want things to be a little bit nice


Query: 
        What are the different money lenses that I can look through? List them all for me.
    
Answer: According to Ramit Sethi, some common money lenses, or ways people view their finances, include:

1. Financial (cost)
2. Comfort
3. Security
4. Speed
5. Results

Different individuals may have unique combinations or additional lenses they prioritize.Building equity through homeownership can be beneficial as it allows you to accumulate value in the property over time, which can be a significant financial asset. In contrast, renting typically means you are not building any equity, as your payments go towards the landlord's mortgage and property upkeep instead.Buying versus renting is a common debate. Buying allows you to build equity in the property over time, potentially benefiting from property value appreciation. In contrast, renting can be less of a financial commitment and offers greater flexibility with less responsibility for maintenance. Each option has its pros and co