In [6]:
import pandas as pd
import pdfplumber

Extract Schedule

In [10]:
import pdfplumber
import pandas as pd

# Open the PDF file using pdfplumber
file_path = '/Users/petershmorhun/Desktop/SARScraper/SARPDFs/(U)AAG_MSAR_Dec_2023.pdf'

def find_schedule_events_table(pdf):
    # Iterate through the pages to find the "(U) Schedule Events" section
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if "(U) Schedule Events" in text:
            # If the target header is found, extract tables on that page
            tables = page.extract_tables()
            if tables:
                # Assume the first table found is the desired one (based on typical format)
                return tables[0], page_num
    return None, None

# Open the PDF and find the "Schedule Events" table
with pdfplumber.open(file_path) as pdf:
    schedule_table, schedule_page_num = find_schedule_events_table(pdf)

# If the table is found, reformat it to the desired structure
if schedule_table:
    # Manually reformat the extracted table to match the structure
    # Assuming the extracted table has a similar format as displayed in the example
    headers = schedule_table[0]  # First row is considered the header
    data_rows = schedule_table[1:]  # Remaining rows are the data
    
    # Create a DataFrame from the extracted data
    df_schedule_events = pd.DataFrame(data_rows, columns=headers)

    # Clean up the DataFrame to reflect the correct structure
    df_schedule_events.columns = [
        "Events", "Type", "Objective (APB Change 1)", "Threshold (APB Change 1)",
        "Current Estimate 12/31/2023", "Actual"
    ]
    
    # Save the DataFrame as a CSV file
    file_name = '/Users/petershmorhun/Desktop/SARScraper/SARPDFs/SARCSVs/AAG_Schedule.csv'
    df_schedule_events.to_csv(file_name, index=False)
    
    print(f"Schedule Events table extracted from page {schedule_page_num + 1} and saved as {file_name}")
else:
    print("Could not find the '(U) Schedule Events' table in the PDF.")

Schedule Events table extracted from page 10 and saved as /Users/petershmorhun/Desktop/SARScraper/SARPDFs/SARCSVs/AAG_Schedule.csv


Extract Peformance

In [11]:
import pdfplumber
import pandas as pd

# Open the PDF file using pdfplumber
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

def extract_performance_attributes_table(pdf, start_page, end_page):
    # List to hold the extracted data
    extracted_data = []

    # Iterate through the specified page range
    for page_num in range(start_page - 1, end_page):  # pdfplumber uses zero-based index
        page = pdf.pages[page_num]
        tables = page.extract_tables()
        if tables:
            # Assume the first table found on each page is part of the "Performance Attributes" table
            extracted_data.extend(tables[0])  # Append rows from the first table

    # Return the combined data
    return extracted_data

# Open the PDF and extract the "Performance Attributes" table from pages 11 to 12
with pdfplumber.open(file_path) as pdf:
    performance_attributes_data = extract_performance_attributes_table(pdf, 11, 12)

# If data is found, format it into a DataFrame
if performance_attributes_data:
    # The first row should be the header, and the rest are data rows
    headers = performance_attributes_data[0]
    data_rows = performance_attributes_data[1:]

    # Create a DataFrame from the extracted data
    df_performance_attributes = pd.DataFrame(data_rows, columns=headers)

    # Save the DataFrame as a CSV file
    file_name = '/Users/petershmorhun/Desktop/SARScraper/SARPDFs/SARCSVs/AAG_Performance.csv'
    df_performance_attributes.to_csv(file_name, index=False)

    print(f"Performance Attributes table extracted from pages 11-12 and saved as {file_name}")
else:
    print("Could not find the 'Performance Attributes' table in the specified page range.")

Performance Attributes table extracted from pages 11-12 and saved as /Users/petershmorhun/Desktop/SARScraper/SARPDFs/SARCSVs/AAG_Performance.csv


Let's go with this:

| Attribute | Current Estimate | Demonstrated Performance | APB Objective |  ABP Threshold | KPP/KSA |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Aircraft Interoperability | Will meet threshold. Meets threshold requirements for C-2A, E- 2C, E-2D, F/A- 18E/F, EA- 18G and T- 45C. F-35C risk reduction testing conducted in FY 2022; follow-on compatibility testing with deadloads conducted in 2023; manned compatibility testing commenced in January 2024; Aircraft Recovery Bulletin (ARB) expected in FY 2024.  | Hookload limits and G-load limits demonstrated to be within limits as defined in ARB NO. 35-12 E. | The hookload limits and G-load limits applicable to each aircraft listed in the Development Threshold plus those listed in Table 2 shall not be exceeded when each aircraft engages the AAG at up to its maximum weight, net applied thrust, and maximum aircraft engaging velocity. | The hookload limits and G-load limits applicable to C-2A,E-2 Type/Model/Series (TMS), F/A-18, EA-18 TMS, F-35, and T45 aircraft shall not be exceeded when each aircraft engages the AAG at up to its maximum weight, net applied thrust, and maximum aircraft engaging velocity. | KPP |

In [12]:
import pdfplumber
import pandas as pd

# Function to extract the "Performance Attributes" table from the PDF across pages 11 and 12
def extract_performance_attributes_table(pdf, start_page, end_page):
    extracted_data = []

    # Iterate through the specified page range
    for page_num in range(start_page - 1, end_page):
        page = pdf.pages[page_num]
        tables = page.extract_tables()
        if tables:
            extracted_data.extend(tables[0])  # Assume the first table found on each page is relevant

    return extracted_data

# Open the PDF file and extract data from pages 11 to 12
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
with pdfplumber.open(file_path) as pdf:
    raw_data = extract_performance_attributes_table(pdf, 11, 12)

# Manually cleaning and structuring the extracted data
structured_data = []
current_attribute = None

for i in range(0, len(raw_data), 2):  # Group rows in pairs for "Objective" and "Threshold"
    if i + 1 < len(raw_data):
        row1 = raw_data[i]
        row2 = raw_data[i + 1]

        # Set attribute name based on the first descriptive row
        current_attribute = row1[0].split("\n")[0] if row1[0] else 'Unknown Attribute'

        structured_data.append({
            "Attribute": current_attribute,
            "Current Estimate": row1[1] if len(row1) > 1 else None,
            "Demonstrated Performance": row1[2] if len(row1) > 2 else None,
            "APB Objective": row1[2] if len(row1) > 2 else None,
            "APB Threshold": row2[2] if len(row2) > 2 else None,
            "KPP/KSA": row2[3] if len(row2) > 3 else None
        })

# Convert to DataFrame
df_performance_attributes = pd.DataFrame(structured_data)

# Display or save the DataFrame
df_performance_attributes.to_csv('AAG_Performance_Attributes.csv', index=False)

In [13]:
import pdfplumber
import pandas as pd

def extract_table_after_signal(pdf, signal_text):
    """
    Extract the first table found immediately after the specified signal text.
    """
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if signal_text in text:
            # Extract the tables from the current page
            tables = page.extract_tables()
            if tables:
                # Assume the first table is the one following the signal text
                return tables[0], page_num + 1  # Return the table and page number (1-based index)
    return None, None

# Open the PDF file and search for the table following "(U) Total Acquisition Estimates and Quantities"
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
signal_text = '(U) Total Acquisition Estimates and Quantities'

with pdfplumber.open(file_path) as pdf:
    table_data, page_number = extract_table_after_signal(pdf, signal_text)

# Process the table data if found
if table_data:
    # Convert the extracted table to a DataFrame
    df_table = pd.DataFrame(table_data[1:], columns=table_data[0])  # First row as header
    # Save the DataFrame to a CSV file
    output_file = 'AAG_Total_Acquisition_Estimates.csv'
    df_table.to_csv(output_file, index=False)
    print(f"Table found on page {page_number} and saved as {output_file}")
else:
    print("Table not found with the given signal text.")

Table found on page 14 and saved as AAG_Total_Acquisition_Estimates.csv


In [14]:

def extract_contract_data(pdf, start_page, end_page):
    """
    Extract contract information from pages specified (e.g., 22 and 23).
    """
    extracted_data = []

    # Iterate through the specified page range
    for page_num in range(start_page - 1, end_page):  # pdfplumber uses zero-based indexing
        page = pdf.pages[page_num]
        text = page.extract_text()
        tables = page.extract_tables()

        # Example text parsing logic (adjust as needed to extract specific details)
        if tables:
            # Assuming one table per page based on the example
            table = tables[0]
            for row in table[1:]:  # Skip the header row
                contract_name = row[0]  # Contract title
                contract_number = row[1]  # Contract number
                contractor = row[2]  # Contractor name
                # Additional details would need to be parsed from the text

                # Example structured data (using placeholders for now)
                extracted_data.append({
                    "Contract Name": contract_name,
                    "Contract Number": contract_number,
                    "Contractor": contractor,
                    "Contractor Location": "San Diego, CA",  # Placeholder
                    "Contract Type": "Firm Fixed Price",  # Placeholder
                    "Award Date": "2022-01-15",  # Placeholder
                    "Definitization Date": "2022-02-01",  # Placeholder
                    "Initial Contract Price Target": 150000000.0,  # Placeholder
                    "Initial Contract Price Ceiling": None,
                    "Initial Contract Quantity": 0,
                    "Current Contract Price Target": 175000000.0,  # Placeholder
                    "Current Contract Price Ceiling": None,
                    "Current Contract Quantity": 0,
                    "Contractor's Estimated Price at Completion": 180000000.0,  # Placeholder
                    "PM's Estimated Price at Completion": 185000000.0,  # Placeholder
                    "Cost Variance": 8000000.0,  # Placeholder
                    "Schedule Variance": -3000000.0,  # Placeholder
                    "Cost Variance Explanation": "Explanation about cost variance goes here...",  # Placeholder
                    "Schedule Variance Explanation": "Explanation about schedule variance goes here...",  # Placeholder
                    "Variance Explanation": "<div>Explanation about both variances goes here...</div>",  # Placeholder
                    "Contract Comments": None
                })

    return extracted_data

# Open the PDF file and extract contract data from pages 22 and 23
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

with pdfplumber.open(file_path) as pdf:
    contracts_data = extract_contract_data(pdf, 22, 23)

# Convert the extracted data to a DataFrame and save it to a CSV file
df_contracts = pd.DataFrame(contracts_data)
output_file = 'Contracts_Data.csv'
df_contracts.to_csv(output_file, index=False)

print(f"Contracts data extracted and saved to {output_file}")

Contracts data extracted and saved to Contracts_Data.csv


In [15]:
import pdfplumber
import pandas as pd

def extract_lrip_data(pdf, page_num):
    """
    Extract the Low-Rate Initial Production (LRIP) data from the specified page.
    """
    page = pdf.pages[page_num - 1]  # pdfplumber uses zero-based indexing
    tables = page.extract_tables()

    if tables:
        # Assume the first table found on the page is the LRIP table
        table = tables[0]
        # Parse the relevant details from the table
        lrip_data = []
        for row in table[1:]:  # Skip the header row
            initial_approval_date = row[0]
            initial_quantity = int(row[1]) if row[1].isdigit() else None
            initial_reference = row[2]
            initial_start_year = row[3]
            initial_end_year = row[4]
            current_approval_date = row[5]
            current_quantity = int(row[6]) if row[6].isdigit() else None
            current_reference = row[7]
            current_start_year = row[8]
            current_end_year = row[9]
            notes = row[10] if len(row) > 10 else None

            # Create a structured dictionary based on the desired output
            lrip_data.append({
                "ID": 2,  # Placeholder; will need to be set dynamically if used in a loop
                "SubmissionID": 4,  # Placeholder
                "SubProgramID": 285,  # Placeholder
                "InitialApprovalDate": initial_approval_date,
                "InitialQuantity": initial_quantity,
                "InitialReference": initial_reference,
                "InitialStartYear": initial_start_year,
                "InitialEndYear": initial_end_year,
                "CurrentApprovalDate": current_approval_date,
                "CurrentQuantity": current_quantity,
                "CurrentReference": current_reference,
                "CurrentStartYear": current_start_year,
                "CurrentEndYear": current_end_year,
                "Notes": notes
            })

        return lrip_data

    return []

# Open the PDF file and extract LRIP data from page 24
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

with pdfplumber.open(file_path) as pdf:
    lrip_data = extract_lrip_data(pdf, 24)

# Convert the extracted data to a DataFrame and save it to a CSV file
df_lrip = pd.DataFrame(lrip_data)
output_file = 'LRIP_Data.csv'
df_lrip.to_csv(output_file, index=False)

print(f"Low-Rate Initial Production data extracted and saved to {output_file}")

IndexError: list index out of range

In [16]:
import pdfplumber
import pandas as pd

def extract_lrip_table(pdf, page_num):
    """
    Extracts the Low-Rate Initial Production (LRIP) table from the specified page.
    """
    page = pdf.pages[page_num - 1]  # Zero-based indexing
    tables = page.extract_tables()

    if tables:
        # Assume the first table found on the page is the LRIP table
        return tables[0]
    return None

def parse_lrip_table(raw_table):
    """
    Parses the raw LRIP table data into the desired structured format.
    """
    return [{
        "ID": 2,  # Placeholder; can be dynamically assigned if necessary
        "SubmissionID": 4,  # Placeholder; to be adjusted based on actual requirements
        "SubProgramID": 285,  # Placeholder; replace with the correct SubProgramID if needed
        "InitialApprovalDate": raw_table[2][1],  # Original Date
        "InitialQuantity": int(raw_table[1][1]),  # Original Quantity
        "InitialReference": raw_table[3][1],  # Original Reference
        "InitialStartYear": raw_table[4][1].split(" - ")[0],  # Start year from LRIP Period
        "InitialEndYear": raw_table[4][1].split(" - ")[1],  # End year from LRIP Period
        "CurrentApprovalDate": raw_table[2][2],  # Current Date
        "CurrentQuantity": int(raw_table[1][2]),  # Current Quantity
        "CurrentReference": raw_table[3][2],  # Current Reference
        "CurrentStartYear": raw_table[4][2].split(" - ")[0],  # Start year from LRIP Period
        "CurrentEndYear": raw_table[4][2].split(" - ")[1],  # End year from LRIP Period
        "Notes": None  # Assuming no notes are present
    }]

# Open the PDF file and extract LRIP data from page 24
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

with pdfplumber.open(file_path) as pdf:
    raw_table_page_24 = extract_lrip_table(pdf, 24)

# Parse the table into the desired structured format
if raw_table_page_24:
    lrip_parsed_data = parse_lrip_table(raw_table_page_24)
    # Convert to DataFrame and save to a CSV file
    df_lrip_parsed = pd.DataFrame(lrip_parsed_data)
    output_file = 'LRIP_Data.csv'
    df_lrip_parsed.to_csv(output_file, index=False)
    print(f"Low-Rate Initial Production data extracted and saved to {output_file}")
else:
    print("No table found on page 24.")

Low-Rate Initial Production data extracted and saved to LRIP_Data.csv


In [None]:
import pdfplumber
import pandas as pd

def make_columns_unique(columns):
    """
    Ensure unique column names by appending suffixes if needed.
    """
    seen = {}
    for i, col in enumerate(columns):
        if col in seen:
            seen[col] += 1
            columns[i] = f"{col}_{seen[col]}"
        else:
            seen[col] = 0
    return columns

def extract_tables_with_column_adjustment(pdf, pages):
    """
    Extract tables from specified pages and handle column adjustments.
    """
    all_tables = []
    for page_num in pages:
        page = pdf.pages[page_num - 1]  # Zero-based indexing for pages in pdfplumber
        tables = page.extract_tables()

        if tables:
            # Extract the "Code" from the first row (if present)
            code = tables[0][0][0] if tables[0] and len(tables[0][0]) > 0 else ""

            # Get the header row and adjust if needed
            header = make_columns_unique(tables[0][0])
            num_columns = len(header)

            # Process the remaining rows
            data_rows = []
            for row in tables[0][1:]:
                # Ensure each row has the same number of columns as the header
                if len(row) < num_columns:
                    row.extend([None] * (num_columns - len(row)))  # Pad with None
                elif len(row) > num_columns:
                    row = row[:num_columns]  # Truncate to match the header length
                data_rows.append(row)

            # Create a DataFrame using the adjusted rows
            df = pd.DataFrame(data_rows, columns=header)

            # Add the "Code" column to every row
            df["Code"] = code

            # Append the DataFrame to the list
            all_tables.append(df)

    return all_tables

# Open the PDF file and extract tables from pages 40, 41, and 42
file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
with pdfplumber.open(file_path) as pdf:
    tables_list = extract_tables_with_column_adjustment(pdf, [40, 41, 42])

# Combine the tables into a single DataFrame
merged_df = pd.concat(tables_list, ignore_index=True)

# Clean the merged DataFrame by replacing '-' with NaN
merged_df.replace('-', pd.NA, inplace=True)

# Output the cleaned DataFrame to a CSV file for inspection
output_file_path = '/mnt/data/merged_cleaned_table.csv'
merged_df.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}")