In [None]:
!pip install anthropic
!pip install pypdf
!pip install pandas
!pip install json

Collecting anthropic
  Downloading anthropic-0.34.2-py3-none-any.whl.metadata (18 kB)
Collecting httpx<1,>=0.23.0 (from anthropic)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from anthropic)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->anthropic)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading anthropic-0.34.2-py3-none-any.whl (891 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m891.9/891.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (

In [None]:
import os
import json
import pandas as pd
from pypdf import PdfReader, PdfWriter
from anthropic import Anthropic
from concurrent.futures import ThreadPoolExecutor
from functools import partial

In [None]:
import os
import json
import pandas as pd
from pypdf import PdfReader, PdfWriter
from anthropic import Client

# Setup the API client with your API key
# NOTE(review): this assignment overwrites any ANTHROPIC_API_KEY already present in
# the process environment with the literal below. Replace the placeholder locally
# and avoid committing a real key with the notebook.
os.environ['ANTHROPIC_API_KEY'] = "your_antropic_api_key"
anthropic_client = Client(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Model identifier passed as `model=` by get_completion
MODEL_NAME = "claude-3-5-sonnet-20240620"

# Define the path for the error log file
# (relative path; log_error appends one line per message to it)
error_log_path = "./error_logs.txt"

# Function to log errors to the error log file
def log_error(message):
    """Append ``message`` followed by a newline to the file at ``error_log_path``.

    The file is opened in append mode on each call, so existing entries are
    preserved.
    """
    with open(error_log_path, 'a') as handle:
        entry = message + '\n'
        handle.write(entry)

# Function to get a completion from the Claude model
def get_completion(client, prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        client: Object exposing ``messages.create(...)``.
        prompt: Text placed in the one user turn of the request.

    Returns:
        The ``text`` attribute of the first item in ``response.content``, or
        ``None`` when the request or the response handling raises. Failures
        are printed and written to the error log instead of being re-raised.
    """
    user_turn = {"role": 'user', "content": prompt}
    try:
        reply = client.messages.create(
            model=MODEL_NAME,
            max_tokens=8192,
            messages=[user_turn],
        )
        # Kept inside the try so an empty or unexpected content list is
        # reported the same way as a failed request.
        first_item = reply.content[0]
        return first_item.text
    except Exception as e:
        failure = f"Error fetching completion from API: {e}"
        print(failure)
        log_error(failure)
        return None

# Function to extract JSON content from a response string
def extract_json_from_response(response):
    """Return the first JSON array embedded in ``response``, parsed.

    The text is scanned for ``[`` characters and a JSON value is decoded in
    place at each candidate. Decoding in place (rather than slicing up to the
    first ``]``) keeps nested arrays and ``]`` characters inside strings
    intact, ignores any text after the array, and skips bracketed prose such
    as ``[note]`` that precedes the data.

    Args:
        response: Completion text that may wrap a JSON array in prose.

    Returns:
        The decoded list, or ``None`` when ``response`` contains no ``[`` or
        no candidate decodes. Failures are printed and written to the error
        log.
    """
    try:
        decoder = json.JSONDecoder()
        last_error = None
        start_index = response.find('[')

        while start_index != -1:
            try:
                # raw_decode stops at the end of the value, so trailing text
                # after the closing bracket is not a parse error.
                json_data, _ = decoder.raw_decode(response, start_index)
                return json_data
            except json.JSONDecodeError as e:
                # Not JSON at this bracket; remember why and try the next one.
                last_error = e
                start_index = response.find('[', start_index + 1)

        if last_error is not None:
            error_message = f"Error parsing JSON content: {last_error}"
        else:
            error_message = "No valid JSON found in the response."
        print(error_message)
        log_error(error_message)
        return None
    except Exception as e:
        # Reached for non-string input (e.g. None) or other unexpected errors.
        error_message = f"Unexpected error while extracting JSON: {e}"
        print(error_message)
        log_error(error_message)
        return None

# Function to split PDF into individual pages
def split_pdf(pdf_path, split_folder):
    """Write each page of ``pdf_path`` into ``split_folder`` as its own PDF.

    Output files are named ``<pdf stem>-NNNN.pdf``, where ``NNNN`` is the
    1-based page number zero-padded to four digits. Any exception is printed
    and written to the error log; nothing is returned.
    """
    try:
        source = PdfReader(pdf_path)
        stem, _ext = os.path.splitext(os.path.basename(pdf_path))

        for page_number, page in enumerate(source.pages, start=1):
            single_page = PdfWriter()
            single_page.add_page(page)

            # Zero-padded page numbers keep the files in page order when sorted by name.
            target = os.path.join(split_folder, f'{stem}-{page_number:04d}.pdf')
            with open(target, 'wb') as output_pdf:
                single_page.write(output_pdf)

            print(f"Saved split PDF: {target}")
    except Exception as e:
        failure = f"Error splitting PDF {pdf_path}: {e}"
        print(failure)
        log_error(failure)

# Function to extract JSON data from a single PDF page
def extract_json_from_page(pdf_page_path, output_folder):
    """Turn one PDF file into a JSON file using the model.

    The text of every page in ``pdf_page_path`` is concatenated, embedded in
    the extraction prompt and sent through ``get_completion``; the JSON array
    found in the reply is written to ``output_folder`` under the PDF's base
    name with a ``.json`` extension.

    Returns:
        Path of the JSON file written, or ``None`` when the API call failed,
        no JSON could be extracted, or any exception occurred (all of which
        are logged).
    """
    try:
        page_texts = [page.extract_text() for page in PdfReader(pdf_page_path).pages]
        text = ''.join(page_texts)

        # Prompt sent to Claude; the page text is wrapped in <doc> tags.
        prompt = f"""You are a seasoned data scientist at a fortune 500 company, Here is a document: <doc>{text}</doc>. Detect tables in the documents. Give all the transactions made in the document without making any assumptions and don't give samples. Only give everything that is given in the document. Convert to a completed JSON format with containing all of its required details"""

        completion = get_completion(anthropic_client, prompt)
        if completion is None:
            log_error(f"Skipping file {pdf_page_path} due to API error.")
            return None

        parsed = extract_json_from_response(completion)
        if parsed is None:
            log_error(f"Skipping file {pdf_page_path} due to JSON extraction issues.")
            return None

        # Output name mirrors the page file name (e.g. pdfname-0001.json).
        stem = os.path.splitext(os.path.basename(pdf_page_path))[0]
        json_path = os.path.join(output_folder, f'{stem}.json')

        with open(json_path, 'w') as out_file:
            json.dump(parsed, out_file, indent=4)  # indented for readability

        return json_path
    except Exception as e:
        failure = f"Error processing PDF page {pdf_page_path}: {e}"
        print(failure)
        log_error(failure)
        return None

# Function to merge multiple JSON files into a single DataFrame and save as CSV
def merge_json_to_csv(json_folder, csv_output_folder, csv_filename):
    """Combine every ``.json`` file in ``json_folder`` into one CSV.

    Files are read in sorted name order, flattened with
    ``pd.json_normalize`` and concatenated in a single ``pd.concat`` call.
    Concatenating once avoids re-copying the accumulated rows for every file
    and avoids concatenating onto an empty DataFrame.

    Args:
        json_folder: Folder containing the per-page JSON files.
        csv_output_folder: Existing folder the CSV is written to.
        csv_filename: File name of the CSV inside ``csv_output_folder``.

    Returns:
        None. On any exception the error is printed and logged and no CSV is
        produced. When there is no data at all, an empty CSV is still written.
    """
    try:
        json_files = sorted(f for f in os.listdir(json_folder) if f.endswith('.json'))

        frames = []
        for json_file in json_files:
            json_path = os.path.join(json_folder, json_file)
            with open(json_path, 'r') as f:
                data = json.load(f)
            df = pd.json_normalize(data)
            # Empty frames add no rows or columns; skip them so they never
            # take part in the concatenation.
            if not df.empty:
                frames.append(df)

        merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Generate CSV file path
        csv_path = os.path.join(csv_output_folder, csv_filename)

        # Save merged DataFrame as CSV
        merged_df.to_csv(csv_path, index=False)
        print(f"Merged CSV saved: {csv_path}")
    except Exception as e:
        error_message = f"Error merging JSON to CSV: {e}"
        print(error_message)
        log_error(error_message)

# Main function to process PDFs
def process_pdfs(pdf_folder, output_folder):
    """Run the split -> JSON -> CSV pipeline for every PDF in ``pdf_folder``.

    For each PDF a folder named after it is created under ``output_folder``
    with three sub-folders: ``split_pdf`` (one file per page),
    ``converted_json`` (one JSON per page) and ``converted_csv`` (the merged
    CSV, named ``<pdf name>_merged.csv``).

    Args:
        pdf_folder: Folder scanned (non-recursively) for PDF files.
        output_folder: Root folder under which per-PDF folders are created.
    """
    # Sorted for a reproducible processing order; the extension check is
    # case-insensitive so files such as "SCAN.PDF" are not silently skipped.
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Create output folders for each PDF
        pdf_name = os.path.splitext(pdf_file)[0]
        pdf_output_folder = os.path.join(output_folder, pdf_name)
        split_folder = os.path.join(pdf_output_folder, 'split_pdf')
        converted_json_folder = os.path.join(pdf_output_folder, 'converted_json')
        converted_csv_folder = os.path.join(pdf_output_folder, 'converted_csv')
        os.makedirs(split_folder, exist_ok=True)
        os.makedirs(converted_json_folder, exist_ok=True)
        os.makedirs(converted_csv_folder, exist_ok=True)

        # Split PDF into individual pages
        split_pdf(pdf_path, split_folder)

        # Only hand PDFs to the page extractor: the folder may also contain
        # stray non-PDF files (editor or OS metadata) that would fail to parse.
        split_pages = sorted(
            f for f in os.listdir(split_folder) if f.lower().endswith('.pdf')
        )
        for split_pdf_file in split_pages:
            split_pdf_path = os.path.join(split_folder, split_pdf_file)
            extract_json_from_page(split_pdf_path, converted_json_folder)

        # Merge JSON files and convert to CSV
        merge_json_to_csv(converted_json_folder, converted_csv_folder, f'{pdf_name}_merged.csv')

# Specify the PDF folder and desired output location
# Input folder scanned by process_pdfs for PDF files.
pdf_folder = "./PDFs"
# Output root: process_pdfs creates one sub-folder per PDF here, containing
# split_pdf/, converted_json/ and converted_csv/.
output_folder = "./"

# Process the PDFs
# Runs the whole pipeline when the cell executes; each split page results in
# one get_completion call.
process_pdfs(pdf_folder, output_folder)


Saved split PDF: ./a12/split_pdf/a12-0001.pdf
Saved split PDF: ./a12/split_pdf/a12-0002.pdf
Saved split PDF: ./a12/split_pdf/a12-0003.pdf
Saved split PDF: ./a12/split_pdf/a12-0004.pdf
Saved split PDF: ./a12/split_pdf/a12-0005.pdf
Saved split PDF: ./a12/split_pdf/a12-0006.pdf
Saved split PDF: ./a12/split_pdf/a12-0007.pdf
Saved split PDF: ./a12/split_pdf/a12-0008.pdf
Saved split PDF: ./a12/split_pdf/a12-0009.pdf
Saved split PDF: ./a12/split_pdf/a12-0010.pdf
Saved split PDF: ./a12/split_pdf/a12-0011.pdf
Saved split PDF: ./a12/split_pdf/a12-0012.pdf
Saved split PDF: ./a12/split_pdf/a12-0013.pdf
Saved split PDF: ./a12/split_pdf/a12-0014.pdf
Saved split PDF: ./a12/split_pdf/a12-0015.pdf
Saved split PDF: ./a12/split_pdf/a12-0016.pdf
Saved split PDF: ./a12/split_pdf/a12-0017.pdf
Saved split PDF: ./a12/split_pdf/a12-0018.pdf
Saved split PDF: ./a12/split_pdf/a12-0019.pdf
Saved split PDF: ./a12/split_pdf/a12-0020.pdf
Saved split PDF: ./a12/split_pdf/a12-0021.pdf
Saved split PDF: ./a12/split_pdf/a

  merged_df = pd.concat([merged_df, df], ignore_index=True)


# Old code, kept for reference only. DON'T RUN!


In [None]:

import os
import json
import pandas as pd
from pypdf import PdfReader
from anthropic import Client

# Setup the API client with your API key
# NOTE(review): this assignment overwrites any ANTHROPIC_API_KEY already present in
# the process environment with the literal below. Replace the placeholder locally
# and avoid committing a real key with the notebook.
os.environ['ANTHROPIC_API_KEY'] = "your_anthropic_api_key"
anthropic_client = Client(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Model identifier passed as `model=` by get_completion
MODEL_NAME = "claude-3-5-sonnet-20240620"

# Define the path for the error log file
# (relative path; log_error appends one line per message to it)
error_log_path = "./error_logs.txt"

# Function to log errors to the error log file
def log_error(message):
    """Append ``message`` followed by a newline to the file at ``error_log_path``.

    The file is opened in append mode on each call, so existing entries are
    preserved.
    """
    with open(error_log_path, 'a') as handle:
        entry = message + '\n'
        handle.write(entry)

# Function to get a completion from the Claude model
def get_completion(client, prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        client: Object exposing ``messages.create(...)``.
        prompt: Text placed in the one user turn of the request.

    Returns:
        The ``text`` attribute of the first item in ``response.content``, or
        ``None`` when the request or the response handling raises. Failures
        are printed and written to the error log instead of being re-raised.
    """
    user_turn = {"role": 'user', "content": prompt}
    try:
        reply = client.messages.create(
            model=MODEL_NAME,
            max_tokens=8192,
            messages=[user_turn],
        )
        # Kept inside the try so an empty or unexpected content list is
        # reported the same way as a failed request.
        first_item = reply.content[0]
        return first_item.text
    except Exception as e:
        failure = f"Error fetching completion from API: {e}"
        print(failure)
        log_error(failure)
        return None

# Function to extract JSON content from a response string
def extract_json_from_response(response):
    """Return the first JSON array embedded in ``response``, parsed.

    The text is scanned for ``[`` characters and a JSON value is decoded in
    place at each candidate. Decoding in place (rather than slicing up to the
    first ``]``) keeps nested arrays and ``]`` characters inside strings
    intact, ignores any text after the array, and skips bracketed prose such
    as ``[note]`` that precedes the data.

    Args:
        response: Completion text that may wrap a JSON array in prose.

    Returns:
        The decoded list, or ``None`` when ``response`` contains no ``[`` or
        no candidate decodes. Failures are printed and written to the error
        log.
    """
    try:
        decoder = json.JSONDecoder()
        last_error = None
        start_index = response.find('[')

        while start_index != -1:
            try:
                # raw_decode stops at the end of the value, so trailing text
                # after the closing bracket is not a parse error.
                json_data, _ = decoder.raw_decode(response, start_index)
                return json_data
            except json.JSONDecodeError as e:
                # Not JSON at this bracket; remember why and try the next one.
                last_error = e
                start_index = response.find('[', start_index + 1)

        if last_error is not None:
            error_message = f"Error parsing JSON content: {last_error}"
        else:
            error_message = "No valid JSON found in the response."
        print(error_message)
        log_error(error_message)
        return None
    except Exception as e:
        # Reached for non-string input (e.g. None) or other unexpected errors.
        error_message = f"Unexpected error while extracting JSON: {e}"
        print(error_message)
        log_error(error_message)
        return None

# Function to extract JSON data from PDF
def extract_json_from_pdf(pdf_path, output_folder):
    """Turn a whole PDF into one JSON file using the model.

    The text of every page in ``pdf_path`` is concatenated, embedded in the
    extraction prompt and sent through ``get_completion``; the JSON array
    found in the reply is written to ``<output_folder>/extracted_json/`` under
    the PDF's base name with a ``.json`` extension.

    Returns:
        Path of the JSON file written, or ``None`` when the API call failed,
        no JSON could be extracted, or any exception occurred (all of which
        are logged).
    """
    try:
        page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
        text = ''.join(page_texts)

        # Prompt sent to Claude; the document text is wrapped in <doc> tags.
        prompt = f"""You are a seasoned data scientist at a fortune 500 company, Here is a document: <doc>{text}</doc>. Detect tables in the documents. Give all the transactions made in the document without making any assumptions and don't give samples. Only give everything that is given in the document. Convert to a completed JSON format with containing all of its required details"""

        completion = get_completion(anthropic_client, prompt)
        if completion is None:
            log_error(f"Skipping file {pdf_path} due to API error.")
            return None

        parsed = extract_json_from_response(completion)
        if parsed is None:
            log_error(f"Skipping file {pdf_path} due to JSON extraction issues.")
            return None

        # The JSON file takes its name from the PDF.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        json_path = os.path.join(output_folder, 'extracted_json', f'{stem}.json')

        with open(json_path, 'w') as out_file:
            json.dump(parsed, out_file, indent=4)  # indented for readability

        return json_path
    except Exception as e:
        failure = f"Error processing PDF {pdf_path}: {e}"
        print(failure)
        log_error(failure)
        return None

# Function to convert JSON to CSV
def convert_json_to_csv(json_path, csv_output_folder):
    """Flatten the JSON file at ``json_path`` into a CSV of the same base name.

    The CSV is written to ``<csv_output_folder>/converted_csv/``. A falsy
    ``json_path`` (the upstream extraction returned nothing) is logged and
    skipped. Any exception is printed and logged; nothing is returned.
    """
    try:
        if json_path:
            with open(json_path, 'r') as source:
                records = json.load(source)

            table = pd.json_normalize(records)

            # CSV keeps the JSON file's base name, with a .csv extension.
            stem = os.path.splitext(os.path.basename(json_path))[0]
            csv_path = os.path.join(csv_output_folder, 'converted_csv', stem + '.csv')

            table.to_csv(csv_path, index=False)
        else:
            log_error("Skipping CSV conversion due to JSON parsing error.")
    except Exception as e:
        failure = f"Error converting JSON to CSV for {json_path}: {e}"
        print(failure)
        log_error(failure)

# Main function to process PDFs
def process_pdfs(pdf_folder, output_folder):
    """Extract JSON and CSV for every ``.pdf`` file found in ``pdf_folder``.

    For each PDF a folder named after it is created under ``output_folder``
    with ``extracted_json`` and ``converted_csv`` sub-folders, then the PDF is
    passed to ``extract_json_from_pdf`` and its result to
    ``convert_json_to_csv``.
    """
    for pdf_file in os.listdir(pdf_folder):
        if not pdf_file.endswith('.pdf'):
            continue

        source_path = os.path.join(pdf_folder, pdf_file)

        # One output folder per PDF, named after the file without its extension.
        stem = os.path.splitext(pdf_file)[0]
        target_root = os.path.join(output_folder, stem)
        for subfolder in ('extracted_json', 'converted_csv'):
            os.makedirs(os.path.join(target_root, subfolder), exist_ok=True)

        # Extract JSON, then convert whatever was produced to CSV.
        produced_json = extract_json_from_pdf(source_path, target_root)
        convert_json_to_csv(produced_json, target_root)

# Specify the PDF folder and desired output location
# Input folder scanned by process_pdfs for files ending in '.pdf'.
pdf_folder = "./PDFs"
# Output root: process_pdfs creates one sub-folder per PDF here, containing
# extracted_json/ and converted_csv/.
output_folder = "./"

# Process the PDFs
# Runs the whole pipeline when the cell executes; each PDF results in one
# get_completion call.
process_pdfs(pdf_folder, output_folder)


Error parsing JSON content: Expecting value: line 1 column 1 (char 0)
