
# Script Description
This script extracts the Distribution Date, the Determination Date, and the Total Principal Funds Available from downloaded PDF files and writes them to a CSV file. It involves the following steps:

1. Reading PDF files
2. Identifying the relevant sections
3. Extracting and processing the data
4. Storing the extracted information




In [None]:
import fitz  # PyMuPDF
import re
import os
import csv

# Input directory holding the downloaded PDF statements, and the CSV file the
# extracted rows are written to. Both are absolute, machine-specific paths.
pdf_directory = r"C:\Users\Rex Fuentes\Documents\Data Engineering Case Study\citigroup_case_study\certfc8_holdrs_st8mnt"  # Replace with the path to your directory containing the PDFs
csv_filename = r"C:\Users\Rex Fuentes\Documents\Data Engineering Case Study\citigroup_case_study\extracted_data.csv"  # Replace with the desired output CSV path

# Regular expression patterns; each one captures its value in group 1.
# Date (M/D/YYYY) that follows the "Distribution Date:" label.
distribution_date_pattern = r"Distribution Date:\s*(\d{1,2}/\d{1,2}/\d{4})"
# Date (M/D/YYYY) that comes *before* the "Determination Date:" label in the extracted text.
determination_date_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s*Determination Date:"
# Amount made of digits and commas with exactly two decimals; a leading "$" is optional.
total_principal_pattern = r"Total Principal Funds Available:\s*\$?([\d,]+\.\d{2})"

def extract_data_from_pdf(pdf_path):
    """Extract the two statement dates and the total principal from one PDF.

    The distribution and determination dates are searched for on the first
    page only. The total principal amount is taken from the first page (in
    document order) on which ``total_principal_pattern`` matches.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(distribution_date, determination_date, total_principal)`` tuple
        of strings exactly as they appear in the PDF text. Any value that
        cannot be located (including when the document has no pages) is the
        placeholder string ``"Not found"``.
    """
    distribution_date = "Not found"
    determination_date = "Not found"
    total_principal = "Not found"

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        # Iterating the document (rather than indexing page 0 up front) keeps
        # a zero-page PDF from raising and extracts each page's text only once.
        for page_index, page in enumerate(pdf_document):
            page_text = page.get_text()

            if page_index == 0:
                distribution_date_match = re.search(distribution_date_pattern, page_text)
                if distribution_date_match:
                    distribution_date = distribution_date_match.group(1)

                determination_date_match = re.search(determination_date_pattern, page_text)
                if determination_date_match:
                    determination_date = determination_date_match.group(1)

            total_principal_match = re.search(total_principal_pattern, page_text)
            if total_principal_match:
                total_principal = total_principal_match.group(1)
                break  # Stop after finding the first match

    return distribution_date, determination_date, total_principal

# Write the header row, then one row per PDF found in pdf_directory.
# newline='' lets the csv module control line endings itself.
with open(csv_filename, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Filename', 'Distribution Date', 'Determination Date', 'Total Principal Funds Available'])

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the CSV reproducible from one run to the next.
    for filename in sorted(os.listdir(pdf_directory)):
        # Skip anything that is not a PDF (extension check is case-insensitive).
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_directory, filename)
        # Extract the data
        distribution_date, determination_date, total_principal = extract_data_from_pdf(pdf_path)
        # Write the data to the CSV file
        csv_writer.writerow([filename, distribution_date, determination_date, total_principal])

print("Extraction complete. Data saved to", csv_filename)


### This script extracts the data from the downloaded PDF files and loads it into the database.

In [None]:
import fitz  # PyMuPDF
import re
import os
import psycopg2
from psycopg2 import sql
from datetime import datetime
import configparser


# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with
# a confusing KeyError on the first section lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read from the current working directory")

# Directory where PDF files are stored
pdf_directory = config['files']['pdf_directory']

# PostgreSQL database connection parameters (all taken from the [database] section)
db_params = {
    'database': config['database']['database'],
    'user': config['database']['user'],
    'password': config['database']['password'],
    'host': config['database']['host'],
    'port': config['database']['port']
}

# Connect to your PostgreSQL database
conn = psycopg2.connect(**db_params)

# Open a cursor to perform database operations
cur = conn.cursor()

# Function to create the table if it doesn't exist
def create_table_if_not_exists(cur):
    """Issue the DDL that creates ``certificate_holder_statements`` if absent.

    Args:
        cur: Cursor-like object; only its ``execute`` method is used.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS certificate_holder_statements (
            filename TEXT,
            distribution_date DATE,
            determination_date DATE,
            total_principal_funds_available NUMERIC
        );
    """
    cur.execute(ddl)

# Create the target table up front so the inserts further down have somewhere to go
create_table_if_not_exists(cur)

# Regular expression patterns applied to the PDF text; each captures its value in group 1.
# Amount made of digits and commas; the decimal point and decimals are optional, as is a leading "$".
total_principal_pattern = r"Total Principal Funds Available\s*:\s*\$?([0-9,]+\.?[0-9]*)"
# Date (M/D/YYYY) separated from the "Distribution Date:" label by at least one whitespace character.
distribution_date_pattern = r"Distribution Date:\s+(\d{1,2}/\d{1,2}/\d{4})"
# Date (M/D/YYYY) that comes *before* the "Determination Date:" label in the extracted text.
determination_date_pattern = r"(\d{1,2}/\d{1,2}/\d{4})\s*Determination Date:"

# Function to extract data from the PDF
def extract_data_from_pdf(pdf_path):
    """Extract the two statement dates and the total principal from one PDF.

    The distribution and determination dates are searched for on the first
    page only. The total principal amount is taken from the first page (in
    document order) on which ``total_principal_pattern`` matches.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(distribution_date, determination_date, total_principal)`` tuple.
        The dates are the raw strings captured from the PDF text and the
        total is a ``float`` with thousands separators removed. Any value
        that cannot be located (including when the document has no pages)
        is ``None``.
    """
    distribution_date = None
    determination_date = None
    total_principal = None

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        # Iterating the document (rather than indexing page 0 up front) keeps
        # a zero-page PDF from raising and extracts each page's text only once.
        for page_index, page in enumerate(pdf_document):
            page_text = page.get_text("text")

            if page_index == 0:
                distribution_date_match = re.search(distribution_date_pattern, page_text, re.MULTILINE)
                if distribution_date_match:
                    distribution_date = distribution_date_match.group(1)

                determination_date_match = re.search(determination_date_pattern, page_text, re.MULTILINE)
                if determination_date_match:
                    determination_date = determination_date_match.group(1)

            total_principal_match = re.search(total_principal_pattern, page_text, re.MULTILINE)
            if total_principal_match:
                # Strip thousands separators so the captured text parses as a number.
                total_principal = float(total_principal_match.group(1).replace(',', ''))
                break

    return distribution_date, determination_date, total_principal
    
def convert_date_format(date_string):
    """Parse an ``MM/DD/YYYY`` string into a ``datetime.date``.

    Args:
        date_string: Date text such as ``"1/25/2024"``, or ``None`` when the
            date was not found in the PDF.

    Returns:
        The parsed ``date``, or ``None`` when ``date_string`` is ``None`` or
        does not match the ``%m/%d/%Y`` format.
    """
    # The extractor reports a missing date as None; strptime(None, ...) would
    # raise TypeError (not ValueError) and abort the whole load.
    if date_string is None:
        return None

    try:
        return datetime.strptime(date_string, '%m/%d/%Y').date()
    except ValueError:
        return None  # unparseable text is stored as NULL rather than failing the run
        
# Parameterized insert statement; built once because it is the same for every row.
insert_stmt = """
    INSERT INTO certificate_holder_statements (filename, distribution_date, determination_date, total_principal_funds_available)
    VALUES (%s, %s, %s, %s);
"""

try:
    # Iterate over each PDF file in the directory. os.listdir() returns entries
    # in arbitrary order; sorting makes the insertion order reproducible.
    for filename in sorted(os.listdir(pdf_directory)):
        # Skip anything that is not a PDF (extension check is case-insensitive).
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_directory, filename)
        distribution_date, determination_date, total_principal = extract_data_from_pdf(pdf_path)

        # Convert dates to the correct format for PostgreSQL
        distribution_date = convert_date_format(distribution_date)
        determination_date = convert_date_format(determination_date)

        # Execute the insert statement
        cur.execute(insert_stmt, (filename, distribution_date, determination_date, total_principal))

    # Commit changes to the database only after every file was processed
    conn.commit()
except Exception:
    # Discard the partial batch explicitly, then let the error propagate.
    conn.rollback()
    raise
finally:
    # Close communication with the database even when a file fails, so the
    # connection is not left open (e.g. in a long-lived notebook kernel).
    cur.close()
    conn.close()

print("Data insertion complete.")
