In [None]:
import requests
import os
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pptx import partition_pptx
import json

In [9]:
def fetch_documents(urls, save_folder='documents'):
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    for url in urls:
        # Check and handle the Microsoft slide URL explicitly
        if 'microsoft.com' in url and 'SlidesFY25Q2' in url:
            filename = 'SlidesFY25Q2.pptx'
        else:
            filename = url.split('/')[-1].split('?')[0]
            if not filename.endswith('.pdf') and not filename.endswith('.pptx'):
                filename += '.pdf'  # default extension if unclear

        response = requests.get(url, stream=True)

        if response.status_code == 200:
            filepath = os.path.join(save_folder, filename)
            with open(filepath, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f'Downloaded: {filename}')
        else:
            print(f'Failed to download {url} (status code: {response.status_code})')

In [10]:
# Usage
urls = [
    "https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/SlidesFY25Q2",
    "https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q4-2024-Update.pdf",
    "https://s2.q4cdn.com/470004039/files/doc_earnings/2025/q1/filing/10Q-Q1-2025-as-filed.pdf",
    "https://www.apple.com/newsroom/pdfs/fy2025-q1/FY25_Q1_Consolidated_Financial_Statements.pdf",
    "https://s2.q4cdn.com/470004039/files/doc_financials/2021/q4/_10-K-2021-(As-Filed).pdf"
]

fetch_documents(urls)

Downloaded: SlidesFY25Q2.pptx
Downloaded: TSLA-Q4-2024-Update.pdf
Downloaded: 10Q-Q1-2025-as-filed.pdf
Downloaded: FY25_Q1_Consolidated_Financial_Statements.pdf
Downloaded: _10-K-2021-(As-Filed).pdf


In [2]:
# Predefined categories 
CATEGORIES = {
    "Financial Reports": ["annual report", "quarterly report", "earnings"],
    "Investor Presentations": ["presentation", "conference", "slides"],
    "Corporate Governance Documents": ["policy", "charter", "governance"],
    "Press Releases": ["announcement", "merger", "leadership"],
    "Stock Market Information": ["stock price", "dividend", "shareholder"],
    "Corporate Social Responsibility (CSR) Reports": ["sustainability", "ESG", "community"]
}

In [11]:
def extract_elements(filepath):
    if filepath.endswith('.pdf'):
        elements = partition_pdf(filename=filepath,skip_infer_table_types=False, strategy='hi_res')
    elif filepath.endswith('.pptx'):
        elements = partition_pptx(filename=filepath)
    else:
        raise ValueError(f"Unsupported file type: {filepath}")
    return elements

def classify_document(text):
    for category, keywords in CATEGORIES.items():
        if any(keyword.lower() in text.lower() for keyword in keywords):
            return category
    return "Unknown"

def save_extracted_data(filepath, elements, output_folder="extracted_data"):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert elements to dictionaries
    elements_dict = [element.to_dict() for element in elements]
    filename = os.path.basename(filepath).rsplit('.', 1)[0] + '.json'

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as file:
        json.dump(elements_dict, file, ensure_ascii=False, indent=4)




In [15]:
def process_documents(folder='documents'):
    for filename in os.listdir(folder):
        filepath = os.path.join(folder, filename)
        if os.path.isfile(filepath) and filepath.endswith(('.pdf', '.pptx')):
            try:
                elements = extract_elements(filepath)
                full_text = " ".join([element.text for element in elements if hasattr(element, 'text')])
                category = classify_document(full_text)
                save_extracted_data(filepath, elements)
                print(f"{filename}: Extracted and classified as {category}")
            except Exception as e:
                print(f"Failed to process {filename}: {e}")

process_documents()

FY25_Q1_Consolidated_Financial_Statements.pdf: Extracted and classified as Financial Reports
SlidesFY25Q2.pptx: Extracted and classified as Unknown
TSLA-Q4-2024-Update.pdf: Extracted and classified as Financial Reports
_10-K-2021-(As-Filed).pdf: Extracted and classified as Financial Reports
10Q-Q1-2025-as-filed.pdf: Extracted and classified as Financial Reports


In [None]:
tables = [el for el in elements if hasattr(el, 'table') and el.table]
for table in tables:
    print(f"Table found in {filepath}:")
    print(table.table)

In [26]:
import os
import uuid
import json
from dotenv import load_dotenv
import nest_asyncio
from llama_cloud_services import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables and setup
load_dotenv()
nest_asyncio.apply()
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
parser = LlamaParse(result_type="markdown")

documents_folder = 'documents'
output_jsonl = 'parsed_documents.jsonl'

# Define document categories based on keywords
CATEGORIES = {
    "Financial Reports": ["annual report", "quarterly report", "earnings"],
    "Investor Presentations": ["presentation", "conference", "slides"],
    "Corporate Governance Documents": ["policy", "charter", "governance"],
    "Press Releases": ["announcement", "merger", "leadership"],
    "Stock Market Information": ["stock price", "dividend", "shareholder"],
    "Corporate Social Responsibility (CSR) Reports": ["sustainability", "ESG", "community"]
}

# Function to classify documents based on keywords
def classify_document(text):
    for category, keywords in CATEGORIES.items():
        if any(keyword.lower() in text.lower() for keyword in keywords):
            return category
    return "Unknown"

# Initialize LangChain splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Parse documents, chunk content, and write structured output
with open(output_jsonl, 'w', encoding='utf-8') as jsonl_file:
    for file in os.listdir(documents_folder):
        file_path = os.path.join(documents_folder, file)
        if file.lower().endswith(('.pdf', '.pptx')):
            doc_type = 'pdf' if file.lower().endswith('.pdf') else 'ppt'
            try:
                llama_docs = parser.load_data(file_path)
                for doc in llama_docs:
                    content = doc.text
                    page_num = doc.metadata.get('page', 0)
                    source = file
                    chunks = text_splitter.split_text(content)

                    for chunk in chunks:
                        chunk_id = str(uuid.uuid4())
                        content_type = 'table' if '|' in chunk else 'text'
                        category = classify_document(chunk)

                        json_obj = {
                            "chunk_id": chunk_id,
                            "content": chunk,
                            "metadata": {
                                "source": source,
                                "page_num": page_num,
                                "doc_type": doc_type,
                                "content_type": content_type,
                                "category": category
                            }
                        }

                        jsonl_file.write(json.dumps(json_obj, ensure_ascii=False) + '\n')
                print(f"Successfully processed and saved: {file}")
            except Exception as e:
                print(f"Failed to process {file}: {e}")


Started parsing the file under job_id ee9df6e9-6181-4872-ad1f-c0512b5cb7b2
Successfully processed and saved: FY25_Q1_Consolidated_Financial_Statements.pdf
Started parsing the file under job_id b377af2b-cb01-48cc-9966-551389e856e7
Successfully processed and saved: SlidesFY25Q2.pptx
Started parsing the file under job_id 51f0f918-e2ae-46e6-8d91-eabca3fc08c3
Successfully processed and saved: TSLA-Q4-2024-Update.pdf
Started parsing the file under job_id 0b042626-99ef-412b-8b5d-6047f24ca841
Successfully processed and saved: _10-K-2021-(As-Filed).pdf
Started parsing the file under job_id 1bb35f65-02d6-4bb2-9bba-ab2926a2dca2
Successfully processed and saved: 10Q-Q1-2025-as-filed.pdf


In [None]:

import os
import uuid
import json
from dotenv import load_dotenv
import nest_asyncio
from llama_cloud_services import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables and setup
load_dotenv()
nest_asyncio.apply()
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Initialize the parser (markdown or text)
parser = LlamaParse(result_type="markdown")

documents_folder = 'documents'
output_jsonl = 'parsed_documents.jsonl'

# Define document categories based on keywords
CATEGORIES = {
    "Financial Reports": ["annual report", "quarterly report", "earnings"],
    "Investor Presentations": ["presentation", "conference", "slides"],
    "Corporate Governance Documents": ["policy", "charter", "governance"],
    "Press Releases": ["announcement", "merger", "leadership"],
    "Stock Market Information": ["stock price", "dividend", "shareholder"],
    "Corporate Social Responsibility (CSR) Reports": ["sustainability", "ESG", "community"]
}

# Function to classify documents based on keywords
def classify_document(text):
    for category, keywords in CATEGORIES.items():
        if any(keyword.lower() in text.lower() for keyword in keywords):
            return category
    return "Unknown"

# Initialize LangChain splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Parse documents, chunk content, and write structured output
with open(output_jsonl, 'w', encoding='utf-8') as jsonl_file:
    for file in os.listdir(documents_folder):
        file_path = os.path.join(documents_folder, file)
        if file.lower().endswith(('.pdf', '.pptx')):
            doc_type = 'pdf' if file.lower().endswith('.pdf') else 'ppt'
            try:
                llama_docs = parser.load_data(file_path)
                for doc in llama_docs:
                    content = doc.text
                    page_num = doc.metadata.get('page', 0)
                    source = file
                    chunks = text_splitter.split_text(content)

                    for chunk in chunks:
                        chunk_id = str(uuid.uuid4())
                        content_type = 'table' if '|' in chunk else 'text'
                        category = classify_document(chunk)

                        json_obj = {
                            "chunk_id": chunk_id,
                            "content": chunk,
                            "metadata": {
                                "source": source,
                                "page_num": page_num,
                                "doc_type": doc_type,
                                "content_type": content_type,
                                "category": category
                            }
                        }

                        jsonl_file.write(json.dumps(json_obj, ensure_ascii=False) + '\n')
                print(f"Successfully processed and saved: {file}")
            except Exception as e:
                print(f"Failed to process {file}: {e}")


Started parsing the file under job_id dc8653a1-a131-4e68-8f35-d953363e0858
Successfully processed and saved: FY25_Q1_Consolidated_Financial_Statements.pdf
Started parsing the file under job_id 1e1942e7-4f8b-430d-8a47-d5295d0b54e8
Error while parsing the file 'documents/SlidesFY25Q2.pptx': Server disconnected without sending a response.
Successfully processed and saved: SlidesFY25Q2.pptx
Started parsing the file under job_id 940f500f-3001-4db2-8628-71a100223a75
Error while parsing the file 'documents/TSLA-Q4-2024-Update.pdf': Server disconnected without sending a response.
Successfully processed and saved: TSLA-Q4-2024-Update.pdf
Started parsing the file under job_id 388ad148-b11a-4f61-812f-5264b9b1527c
Successfully processed and saved: _10-K-2021-(As-Filed).pdf
Started parsing the file under job_id 84dc2661-a273-4535-b932-124b83705390
Successfully processed and saved: 10Q-Q1-2025-as-filed.pdf
