### Step 0: Install dependencies

In [None]:
# Uncomment on first run. `%pip` installs into the environment of the running kernel.
#%pip install ipywidgets PyPDF2 faiss-cpu sentence-transformers nltk jupyter ipython
#%pip install openai

### Step 1: Import dependencies

In [None]:
import os
import io
import gc
import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import torch
import time
from tqdm.auto import tqdm
from tkinter import Tk, filedialog
import tkinter as tk
from typing import List, Dict, Any, Tuple
from openai import OpenAI
import re
from collections import defaultdict

### Step 2: Develop RAG System

In [None]:
class RAGFinancialAnalyzer:
    """Retrieval-augmented question answering over financial-statement PDFs.

    Typical use:

    1. ``upload_and_process()`` lets the user pick PDFs, splits each page into
       paragraph chunks, embeds the chunks with a SentenceTransformer model and
       stores the vectors in a FAISS L2 index.
    2. ``analyze(question)`` retrieves the nearest chunks, groups them by
       company and sends them, with their source citations, to an OpenAI chat
       model. The answer and the list of cited sources are printed.

    Attributes:
        chunks: list of ``{"text": str, "metadata": {"source", "page", "company"}}``
            dicts, positionally aligned with the vectors in ``index``.
        index: FAISS index over the chunk embeddings, or ``None`` before processing.
        model: the SentenceTransformer used for chunks and queries, or ``None``.
        client: OpenAI client used for generation.
        company_mappings: filename code -> display name of the company.
    """

    def __init__(self, openai_api_key):
        """Create an empty analyzer and an OpenAI client for ``openai_api_key``."""
        self.chunks = []
        self.index = None
        self.model = None
        self.client = OpenAI(api_key=openai_api_key)
        self.company_mappings = {
            'TRV': 'Travelers',
            'ALL': 'Allstate',
            'CB': 'Chubb',
            'PRG': 'Progressive'
        }
        print("RAG Financial Analyzer initialized successfully.")

    # ---------------------------------------------------------------------------------
    # Chunking & Preprocessing
    # ---------------------------------------------------------------------------------
    def _extract_company_code(self, filename: str) -> str:
        """Return the first company code found in ``filename``, else ``"Unknown"``.

        A code only matches when it is not embedded in a longer run of letters,
        so ``ALL_10K.pdf`` matches ``ALL`` but ``SMALL_CAP.pdf`` does not.
        Matching is case-sensitive.
        """
        for code in self.company_mappings:
            pattern = rf'(?<![A-Za-z]){re.escape(code)}(?![A-Za-z])'
            if re.search(pattern, filename):
                return code
        return "Unknown"

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to one space and drop non-printable-ASCII characters."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\x20-\x7E]', '', text)
        return text.strip()

    def _split_paragraphs(self, text: str, min_chars: int = 50) -> List[str]:
        """Split raw page text into cleaned paragraphs longer than ``min_chars``.

        The split on blank lines happens BEFORE cleaning, because cleaning
        collapses every newline and would otherwise make the split a no-op.
        If no individual paragraph is long enough, the whole cleaned page is
        kept as a single chunk (when it is long enough) so that pages made of
        short fragments are not lost entirely.
        """
        paragraphs = []
        for raw_paragraph in re.split(r'\n\s*\n', text):
            cleaned = self._clean_text(raw_paragraph)
            if len(cleaned) > min_chars:
                paragraphs.append(cleaned)

        if not paragraphs:
            whole_page = self._clean_text(text)
            if len(whole_page) > min_chars:
                paragraphs.append(whole_page)
        return paragraphs

    def _get_company_name(self, company_code: str) -> str:
        """Return the display name for ``company_code``, or the code itself if unmapped."""
        return self.company_mappings.get(company_code, company_code)

    def _group_chunks_by_company(self, chunks: List[Dict]) -> Dict[str, List[Dict]]:
        """Group chunk dicts by company name, preserving the incoming order.

        The company is derived from each chunk's source filename.
        """
        grouped = defaultdict(list)
        for chunk in chunks:
            company_code = self._extract_company_code(chunk["metadata"]["source"])
            grouped[self._get_company_name(company_code)].append(chunk)
        return grouped

    def _organize_by_company(self, chunks: List[Dict]) -> Dict[str, List[str]]:
        """Return, per company, each chunk's text suffixed with ``[Source: ..., Page: ...]``."""
        company_texts = defaultdict(list)
        for company_name, company_chunks in self._group_chunks_by_company(chunks).items():
            for chunk in company_chunks:
                meta = chunk["metadata"]
                company_texts[company_name].append(
                    f"{chunk['text']} [Source: {meta['source']}, Page: {meta['page']}]"
                )
        return company_texts

    def _create_structured_context(self, chunks: List[Dict], max_tokens: int = 4000) -> Tuple[str, List[str]]:
        """Build the prompt context, grouped by company, within a token budget.

        Args:
            chunks: retrieved chunk dicts (``text`` plus ``metadata``).
            max_tokens: approximate budget for the bullet entries, estimated
                at four characters per token.

        Returns:
            ``(context, sources)``. ``context`` has one ``"<company> Analysis:"``
            section per company with at least one included chunk; every bullet
            keeps its ``[Source: file, Page: n]`` tag so the model can cite it.
            ``sources`` lists ``"file (page n)"`` once for each chunk that was
            actually included in ``context``.
        """
        sections = []
        sources = []
        used_tokens = 0.0

        for company, company_chunks in self._group_chunks_by_company(chunks).items():
            bullets = []
            for chunk in company_chunks:
                meta = chunk["metadata"]
                entry = f"- {chunk['text']} [Source: {meta['source']}, Page: {meta['page']}]\n"

                # Rough heuristic: ~4 characters per token.
                entry_tokens = len(entry) / 4
                if used_tokens + entry_tokens > max_tokens:
                    # Stop filling this company; later companies may still fit.
                    break

                bullets.append(entry)
                used_tokens += entry_tokens

                # Record the source only once the chunk is really in the context.
                source = f"{meta['source']} (page {meta['page']})"
                if source not in sources:
                    sources.append(source)

            if bullets:
                sections.append(f"\n{company} Analysis:\n" + "".join(bullets))

        return "".join(sections).strip(), sources

    # ---------------------------------------------------------------------------------
    # Embedding & Indexing
    # ---------------------------------------------------------------------------------
    def _select_pdf_files(self):
        """Show a native file dialog and return the selected PDF paths (possibly empty)."""
        root = tk.Tk()
        try:
            root.withdraw()
            return filedialog.askopenfilenames(
                title="Select Financial Statement PDFs",
                filetypes=[("PDF files", "*.pdf")]
            )
        finally:
            # Always tear the hidden root window down, even if the dialog fails.
            root.destroy()

    def _iter_pdf_chunks(self, file_path: str, filename: str, company_name: str):
        """Yield one chunk dict per usable paragraph of the PDF at ``file_path``.

        Pages without extractable text are skipped. Page numbers are 1-based.
        """
        with open(file_path, 'rb') as file:
            pdf = PdfReader(file)
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not text:
                    continue
                for paragraph in self._split_paragraphs(text):
                    yield {
                        "text": paragraph,
                        "metadata": {
                            "source": filename,
                            "page": page_num + 1,
                            "company": company_name
                        }
                    }

    def upload_and_process(self):
        """Select PDFs, chunk and embed them, and (re)build the search index.

        Chunks from earlier calls are kept; the index is rebuilt over the old
        and new chunks together. ``self.chunks`` and ``self.index`` are only
        replaced after embedding and indexing succeed, so a failure cannot
        leave them out of step with each other. Errors are reported with
        ``print`` rather than raised; a file that fails part-way keeps the
        chunks read before the failure.
        """
        try:
            print("Select your financial statement PDFs...")
            file_paths = self._select_pdf_files()

            if not file_paths:
                print("No files were selected.")
                return

            print(f"\nProcessing {len(file_paths)} documents...")
            start_time = time.time()

            new_chunks = []
            for file_path in file_paths:
                filename = os.path.basename(file_path)
                try:
                    company_code = self._extract_company_code(filename)
                    company_name = self._get_company_name(company_code)
                    print(f"\nProcessing {company_name} ({filename})...")

                    for chunk in self._iter_pdf_chunks(file_path, filename, company_name):
                        new_chunks.append(chunk)

                except Exception as e:
                    print(f"Error processing {filename}: {str(e)}")
                    continue

            if not new_chunks:
                print("No text could be extracted from the documents.")
                return

            all_chunks = self.chunks + new_chunks

            print(f"\nGenerating embeddings for {len(all_chunks)} text segments...")
            if self.model is None:
                # Load the embedding model once and reuse it on later calls.
                self.model = SentenceTransformer('all-MiniLM-L6-v2')
            texts = [chunk["text"] for chunk in all_chunks]
            embeddings = self.model.encode(
                texts,
                batch_size=32,
                show_progress_bar=True
            )

            print("\nBuilding search index...")
            dimension = embeddings.shape[1]
            index = faiss.IndexFlatL2(dimension)
            index.add(embeddings.astype('float32'))

            # Commit both together so chunk positions always match index ids.
            self.chunks = all_chunks
            self.index = index

            del embeddings
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

            processing_time = time.time() - start_time
            print(f"\nProcessing complete! Time taken: {processing_time:.2f} seconds")

        except Exception as e:
            print(f"An error occurred during processing: {str(e)}")

    def analyze(self, question: str, top_k: int = 10):
        """Answer ``question`` from the indexed documents and print the result.

        Args:
            question: the natural-language question to analyze.
            top_k: maximum number of chunks to retrieve. It is capped at the
                number of indexed chunks.

        Prints the model's analysis followed by the sources that were placed
        in the prompt. Returns ``None``; errors are printed, not raised.
        """
        if self.index is None or self.model is None or not self.chunks:
            print("Please process documents first using upload_and_process()")
            return

        try:
            print(f"Analyzing: {question}\n")

            # ---------------------------------------------------------------------------------
            # Retrieval Process
            # ---------------------------------------------------------------------------------
            k = min(int(top_k), int(self.index.ntotal), len(self.chunks))
            if k < 1:
                print("top_k must be at least 1")
                return

            query_embedding = self.model.encode([question])
            D, I = self.index.search(query_embedding.astype('float32'), k)

            # FAISS pads missing neighbours with -1, which Python would read as
            # "last element"; keep only labels that are valid chunk positions.
            selected_chunks = [self.chunks[idx] for idx in I[0] if 0 <= idx < len(self.chunks)]
            if not selected_chunks:
                print("No relevant passages were found for this question.")
                return

            # -----------------------------------------------------------------------------------
            # Augmentation Process
            # -----------------------------------------------------------------------------------
            context, sources = self._create_structured_context(selected_chunks)

            prompt = f"""As an expert financial analyst, provide a detailed analysis of the following financial statements,
organized by company. Focus on specific findings for each company mentioned in the documents.

Question: {question}

{context}

Please provide a comprehensive analysis that:
1. Clearly separates findings by company
2. Includes specific numerical evidence
3. Cites sources (document and page number) for each major point
4. Explains business implications for each company

Analysis:"""

            # -----------------------------------------------------------------------------------
            # Generation Process
            # -----------------------------------------------------------------------------------
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": """You are an expert financial analyst.
                    Organize your analysis by company, clearly stating which company you're discussing for each point.
                    Always cite specific sources and page numbers for your findings.
                    Focus on providing company-specific insights and comparisons where relevant."""},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=1500
            )

            print("Analysis:")
            print("-" * 80)
            print(response.choices[0].message.content)
            print("\nSources Referenced:")
            for source in sources:
                print(f"- {source}")
            print("-" * 80)

        except Exception as e:
            print(f"Error during analysis: {str(e)}")

### Step 3: Insert OpenAI API Key

In [None]:
from getpass import getpass

# Ask for the key through a hidden prompt so it is never echoed into the notebook
openai_api_key = getpass(prompt='Enter your OpenAI API key: ')

# Build the analyzer; its constructor creates the OpenAI client from this key
analyzer = RAGFinancialAnalyzer(openai_api_key=openai_api_key)

print("RAG system initialized successfully!")

Enter your OpenAI API key:  ········


RAG Financial Analyzer initialized successfully.
RAG system initialized successfully!


### Step 4: Upload Financial Reports

In [None]:
# Opens a file-selection dialog for PDFs, then extracts their text, embeds it
# and builds the FAISS search index used by analyze().
analyzer.upload_and_process()

Select your financial statement PDFs...

Processing 4 documents...

Processing Allstate (ALL_10K_FY2023.pdf)...

Processing Chubb (CB_10K_FY2023.pdf)...

Processing Progressive (PRG_10K_FY2023.pdf)...

Processing Travelers (TRV_10K_FY2023.pdf)...

Generating embeddings for 1020 text segments...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]


Building search index...

Processing complete! Time taken: 70.28 seconds


### Step 5: Ask Questions

In [None]:
# Ask one cross-company question; adjacent literals concatenate into a single string.
analyzer.analyze(
    "What are the causes driving the largest amount of losses across all companies? "
    "Please cite all the references from the source. "
    "Make sure I know which company is talked about."
)

Analyzing: What are the causes driving the largest amount of losses across all companies? Please cite all the references from the source. Make sure I know which company is talked about.

Analysis:
--------------------------------------------------------------------------------
Allstate Corporation:

1. Allstate's net loss applicable to common shareholders was $316 million in 2023, which was an improvement from the $1.39 billion net loss in 2022. The company attributes this improvement to better underwriting results and net gains on equity valuations (2023 Form 10-K Consolidated net income, p.38).
2. The company's total revenue increased by 11.1% to $57.09 billion in 2023, primarily due to a 10.4% increase in property and casualty insurance premiums. The company also had net gains on equity valuations in 2023 as compared to losses in 2022 (2023 Form 10-K Total revenue, p.38).
3. Allstate's major risks include insurance and financial services, business strategy and operations, and macro,