In [2]:
import pandas as pd
import pdfplumber

Extract Schedule

In [3]:
import pdfplumber
import pandas as pd

# Open the PDF file using pdfplumber
file_path = 'C:/Users/PShmorhun/Desktop/Git/sarscraper-oct/SARPDFs/(U)AAG_MSAR_Dec_2023.pdf'

def find_schedule_events_table(pdf):
    # Iterate through the pages to find the "(U) Schedule Events" section
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if "(U) Schedule Events" in text:
            # If the target header is found, extract tables on that page
            tables = page.extract_tables()
            if tables:
                # Assume the first table found is the desired one (based on typical format)
                return tables[0], page_num
    return None, None

# Open the PDF and find the "Schedule Events" table
with pdfplumber.open(file_path) as pdf:
    schedule_table, schedule_page_num = find_schedule_events_table(pdf)

# If the table is found, reformat it to the desired structure
if schedule_table:
    # Manually reformat the extracted table to match the structure
    # Assuming the extracted table has a similar format as displayed in the example
    headers = schedule_table[0]  # First row is considered the header
    data_rows = schedule_table[1:]  # Remaining rows are the data
    
    # Create a DataFrame from the extracted data
    df_schedule_events = pd.DataFrame(data_rows, columns=headers)

    # Clean up the DataFrame to reflect the correct structure
    df_schedule_events.columns = [
        "Events", "Type", "Objective (APB Change 1)", "Threshold (APB Change 1)",
        "Current Estimate 12/31/2023", "Actual"
    ]
    
    # Save the DataFrame as a CSV file
    file_name = 'C:/Users/PShmorhun/Desktop/Git/sarscraper-oct/CSVs/AAG/AAG_Schedule.csv'
    df_schedule_events.to_csv(file_name, index=False)
    df_schedule_events.head()
    print(f"Schedule Events table extracted from page {schedule_page_num + 1} and saved as {file_name}")
else:
    print("Could not find the '(U) Schedule Events' table in the PDF.")
df_schedule_events.head()


Schedule Events table extracted from page 10 and saved as C:/Users/PShmorhun/Desktop/Git/sarscraper-oct/CSVs/AAG/AAG_Schedule.csv


Unnamed: 0,Events,Type,Objective (APB Change 1),Threshold (APB Change 1),Current Estimate 12/31/2023,Actual
0,Milestone A,MS A,Jul 2003,Jul 2003,-,16 Jul 2003
1,Milestone B,MS B,Feb 2005,Feb 2005,-,10 Feb 2005
2,IT-B3 JCTS complete,DT&E,Aug 2019,Aug 2019,-,11 Jul 2019
3,IT-B4 RALS complete,DT&E,Oct 2019,Oct 2019,-,24 Oct 2019
4,IOC,IOC,Jul 2021,Jan 2022,-,30 Apr 2021


Extract Peformance

In [4]:
import pdfplumber
import pandas as pd

# Function to extract the "Performance Attributes" table from the PDF across pages 11 and 12
def extract_performance_attributes_table(pdf, start_page, end_page):
    extracted_data = []

    # Iterate through the specified page range
    for page_num in range(start_page - 1, end_page):
        page = pdf.pages[page_num]
        tables = page.extract_tables()
        if tables:
            extracted_data.extend(tables[0])  # Assume the first table found on each page is relevant

    return extracted_data

# Open the PDF file and extract data from pages 11 to 12
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
with pdfplumber.open(file_path) as pdf:
    raw_data = extract_performance_attributes_table(pdf, 11, 12)

# Manually cleaning and structuring the extracted data
structured_data = []
current_attribute = None

for i in range(0, len(raw_data), 2):  # Group rows in pairs for "Objective" and "Threshold"
    if i + 1 < len(raw_data):
        row1 = raw_data[i]
        row2 = raw_data[i + 1]

        # Set attribute name based on the first descriptive row
        current_attribute = row1[0].split("\n")[0] if row1[0] else 'Unknown Attribute'

        structured_data.append({
            "Attribute": current_attribute,
            "Current Estimate": row1[1] if len(row1) > 1 else None,
            "Demonstrated Performance": row1[2] if len(row1) > 2 else None,
            "APB Objective": row1[2] if len(row1) > 2 else None,
            "APB Threshold": row2[2] if len(row2) > 2 else None,
            "KPP/KSA": row2[3] if len(row2) > 3 else None
        })

# Convert to DataFrame
df_performance_attributes = pd.DataFrame(structured_data)

# Display or save the DataFrame
#df_performance_attributes.to_csv('AAG_Performance_Attributes.csv', index=False)

df_performance_attributes.head()

Unnamed: 0,Attribute,Current Estimate,Demonstrated Performance,APB Objective,APB Threshold,KPP/KSA
0,Aircraft Interoperability,,,,Will meet threshold. Meets threshold requireme...,
1,Demonstrated Performance,,Hookload limits and G-load limits demonstrated...,Hookload limits and G-load limits demonstrated...,The hookload limits and G-load limits applicab...,
2,2/5/2020,Threshold,The hookload limits and G-load limits applicab...,The hookload limits and G-load limits applicab...,,KPP
3,Current Estimate,,Will meet threshold. Mitigations include new C...,Will meet threshold. Mitigations include new C...,Cycle time of 37 seconds demonstrated during\n...,
4,APB Change 1,Objective,30 Seconds,30 Seconds,35 Seconds,


In [5]:
#general format/total acquistition estiamtes

def extract_table_after_signal(pdf, signal_text):
    """
    Extract the first table found immediately after the specified signal text.
    """
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if signal_text in text:
            # Extract the tables from the current page
            tables = page.extract_tables()
            if tables:
                # Assume the first table is the one following the signal text
                return tables[0], page_num + 1  # Return the table and page number (1-based index)
    return None, None

# Open the PDF file and search for the table following "(U) Total Acquisition Estimates and Quantities"
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
signal_text = '(U) Total Acquisition Estimates and Quantities'

with pdfplumber.open(file_path) as pdf:
    table_data, page_number = extract_table_after_signal(pdf, signal_text)

# Process the table data if found
if table_data:
    # Convert the extracted table to a DataFrame
    df_table = pd.DataFrame(table_data[1:], columns=table_data[0])  # First row as header
    # Save the DataFrame to a CSV file
    output_file = 'AAG_Total_Acquisition_Estimates.csv'
    df_table.to_csv(output_file, index=False)
    print(f"Table found on page {page_number} and saved as {output_file}")
else:
    print("Table not found with the given signal text.")

df_table

Table found on page 14 and saved as AAG_Total_Acquisition_Estimates.csv


Unnamed: 0,Category ($M) Base Year: 2017,APB Change 1\n(Current)\n2/5/2020\nCY$ obs\nObjective / Threshold,None,Current Estimate\nPB 2025\nCY$ obs / TY$ obs,None.1
0,RDT&E,1550.1,1705.1,1407.9,1422.2
1,Procurement,1114.8,1226.3,"1,314.6*",1567.4
2,MILCON,16.9,18.6,16.9,15.4
3,O&M,0.0,0.0,0.0,0.0
4,R&MF,-,-,0.0,0.0
5,Total Acquisition,2681.8,-,2739.4,3005.0
6,Program Acquisition Unit Cost,670.450,737.495,684.856,751.26
7,Average Procurement Unit Cost,278.700,306.570,328.648*,391.85
8,Program End-Item Quantity,,,,
9,Development,0,,-,


In [6]:

def extract_contract_data(pdf, start_page, end_page):
    """
    Extract contract information from pages specified (e.g., 22 and 23).
    """
    extracted_data = []

    # Iterate through the specified page range
    for page_num in range(start_page - 1, end_page):  # pdfplumber uses zero-based indexing
        page = pdf.pages[page_num]
        text = page.extract_text()
        tables = page.extract_tables()

        # Example text parsing logic (adjust as needed to extract specific details)
        if tables:
            # Assuming one table per page based on the example
            table = tables[0]
            for row in table[1:]:  # Skip the header row
                contract_name = row[0]  # Contract title
                contract_number = row[1]  # Contract number
                contractor = row[2]  # Contractor name
                # Additional details would need to be parsed from the text

                # Example structured data (using placeholders for now)
                extracted_data.append({
                    "Contract Name": contract_name,
                    "Contract Number": contract_number,
                    "Contractor": contractor,
                    "Contractor Location": "San Diego, CA",  # Placeholder
                    "Contract Type": "Firm Fixed Price",  # Placeholder
                    "Award Date": "2022-01-15",  # Placeholder
                    "Definitization Date": "2022-02-01",  # Placeholder
                    "Initial Contract Price Target": 150000000.0,  # Placeholder
                    "Initial Contract Price Ceiling": None,
                    "Initial Contract Quantity": 0,
                    "Current Contract Price Target": 175000000.0,  # Placeholder
                    "Current Contract Price Ceiling": None,
                    "Current Contract Quantity": 0,
                    "Contractor's Estimated Price at Completion": 180000000.0,  # Placeholder
                    "PM's Estimated Price at Completion": 185000000.0,  # Placeholder
                    "Cost Variance": 8000000.0,  # Placeholder
                    "Schedule Variance": -3000000.0,  # Placeholder
                    "Cost Variance Explanation": "Explanation about cost variance goes here...",  # Placeholder
                    "Schedule Variance Explanation": "Explanation about schedule variance goes here...",  # Placeholder
                    "Variance Explanation": "<div>Explanation about both variances goes here...</div>",  # Placeholder
                    "Contract Comments": None
                })

    return extracted_data

# Open the PDF file and extract contract data from pages 22 and 23
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

with pdfplumber.open(file_path) as pdf:
    contracts_data = extract_contract_data(pdf, 22, 23)

# Convert the extracted data to a DataFrame and save it to a CSV file
df_contracts = pd.DataFrame(contracts_data)
#output_file = 'Contracts_Data.csv'
#df_contracts.to_csv(output_file, index=False)

print(f"Contracts data extracted and saved to {output_file}")
df_contracts

Contracts data extracted and saved to AAG_Total_Acquisition_Estimates.csv


Unnamed: 0,Contract Name,Contract Number,Contractor,Contractor Location,Contract Type,Award Date,Definitization Date,Initial Contract Price Target,Initial Contract Price Ceiling,Initial Contract Quantity,...,Current Contract Price Ceiling,Current Contract Quantity,Contractor's Estimated Price at Completion,PM's Estimated Price at Completion,Cost Variance,Schedule Variance,Cost Variance Explanation,Schedule Variance Explanation,Variance Explanation,Contract Comments
0,AAG/EMALS CVN 79/80\nProduction,N0001914C0037,General Atomics,"San Diego, CA",Firm Fixed Price,2022-01-15,2022-02-01,150000000.0,,0,...,,0,180000000.0,185000000.0,8000000.0,-3000000.0,Explanation about cost variance goes here...,Explanation about schedule variance goes here...,<div>Explanation about both variances goes her...,
1,AAG/EMALS CVN 81 Pre-\nproduction Planning/\nP...,N0001922C0033,General Atomics,"San Diego, CA",Firm Fixed Price,2022-01-15,2022-02-01,150000000.0,,0,...,,0,180000000.0,185000000.0,8000000.0,-3000000.0,Explanation about cost variance goes here...,Explanation about schedule variance goes here...,<div>Explanation about both variances goes her...,


In [7]:
import pdfplumber

def extract_lrip_table(pdf, page_num):
    """
    Extracts the Low-Rate Initial Production (LRIP) table from the specified page.
    """
    page = pdf.pages[page_num - 1]  # Zero-based indexing
    tables = page.extract_tables()

    if tables:
        # Assume the first table found on the page is the LRIP table
        return tables[0]
    return None

def parse_lrip_table(raw_table):
    """
    Parses the raw LRIP table data into the desired structured format.
    """
    return [{
        "ID": 2,  # Placeholder; can be dynamically assigned if necessary
        "SubmissionID": 4,  # Placeholder; to be adjusted based on actual requirements
        "SubProgramID": 285,  # Placeholder; replace with the correct SubProgramID if needed
        "InitialApprovalDate": raw_table[2][1],  # Original Date
        "InitialQuantity": int(raw_table[1][1]),  # Original Quantity
        "InitialReference": raw_table[3][1],  # Original Reference
        "InitialStartYear": raw_table[4][1].split(" - ")[0],  # Start year from LRIP Period
        "InitialEndYear": raw_table[4][1].split(" - ")[1],  # End year from LRIP Period
        "CurrentApprovalDate": raw_table[2][2],  # Current Date
        "CurrentQuantity": int(raw_table[1][2]),  # Current Quantity
        "CurrentReference": raw_table[3][2],  # Current Reference
        "CurrentStartYear": raw_table[4][2].split(" - ")[0],  # Start year from LRIP Period
        "CurrentEndYear": raw_table[4][2].split(" - ")[1],  # End year from LRIP Period
        "Notes": None  # Assuming no notes are present
    }]

# Open the PDF file and extract LRIP data from page 24
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'

with pdfplumber.open(file_path) as pdf:
    raw_table_page_24 = extract_lrip_table(pdf, 24)

# Parse the table into the desired structured format
if raw_table_page_24:
    lrip_parsed_data = parse_lrip_table(raw_table_page_24)
    # Convert to DataFrame and save to a CSV file
    df_lrip_parsed = pd.DataFrame(lrip_parsed_data)
    output_file = 'LRIP_Data.csv'
    df_lrip_parsed.to_csv(output_file, index=False)
    print(f"Low-Rate Initial Production data extracted and saved to {output_file}")
else:
    print("No table found on page 24.")

df_lrip_parsed.head()   

Low-Rate Initial Production data extracted and saved to LRIP_Data.csv


Unnamed: 0,ID,SubmissionID,SubProgramID,InitialApprovalDate,InitialQuantity,InitialReference,InitialStartYear,InitialEndYear,CurrentApprovalDate,CurrentQuantity,CurrentReference,CurrentStartYear,CurrentEndYear,Notes
0,2,4,285,2/10/2005,5,Milestone B ADM,FY 2009,2012,12/22/2015,2,Revision to Milestone B ADM,FY 2009,2014,


In [19]:
import pdfplumber
import pandas as pd

file_path = 'C:/Users/PShmorhun/Desktop/Git/sarscraper-oct/SARPDFs/(U)AAG_MSAR_Dec_2023.pdf'

def make_columns_unique(columns):
    """
    Ensure unique column names by appending suffixes if needed.
    """
    seen = {}
    for i, col in enumerate(columns):
        if col in seen:
            seen[col] += 1
            columns[i] = f"{col}_{seen[col]}"
        else:
            seen[col] = 0
    return columns

def extract_tables_with_column_adjustment(pdf, pages):
    """
    Extract tables from specified pages and handle column adjustments.
    """
    all_tables = []
    for page_num in pages:
        page = pdf.pages[page_num - 1]  # Zero-based indexing for pages in pdfplumber
        tables = page.extract_tables()

        if tables:
            # Extract the "Code" from the first row (if present)
            code = tables[0][0][0] if tables[0] and len(tables[0][0]) > 0 else ""

            # Get the header row and adjust if needed
            header = make_columns_unique(tables[0][0])
            num_columns = len(header)

            # Process the remaining rows
            data_rows = []
            for row in tables[0][1:]:
                # Ensure each row has the same number of columns as the header
                if len(row) < num_columns:
                    row.extend([None] * (num_columns - len(row)))  # Pad with None
                elif len(row) > num_columns:
                    row = row[:num_columns]  # Truncate to match the header length
                data_rows.append(row)

            # Create a DataFrame using the adjusted rows
            df = pd.DataFrame(data_rows, columns=header)

            # Add the "Code" column to every row
            df["Code"] = code

            # Append the DataFrame to the list
            all_tables.append(df)

    return all_tables

# Open the PDF file and extract tables from pages 40, 41, and 42
#file_path = '/mnt/data/(U)AAG_MSAR_Dec_2023.pdf'
with pdfplumber.open(file_path) as pdf:
    tables_list = extract_tables_with_column_adjustment(pdf, [40, 41, 42])

# Combine the tables into a single DataFrame
merged_df = pd.concat(tables_list, ignore_index=True)

# Clean the merged DataFrame by replacing '-' with NaN
#merged_df.replace('-', pd.NA, inplace=True)

# Output the cleaned DataFrame to a CSV file for inspection
#output_file_path = '/mnt/data/merged_cleaned_table.csv'
#merged_df.to_csv(output_file_path, index=False)
#print(f"Data saved to {output_file_path}")

Let's clean the funding table

In [101]:
#function is still janky
def extract_funding(filepath, pagenum, dfname):
    with pdfplumber.open(file_path) as pdf:
        page = pdf.pages[pagenum]
        table = page.extract_table()
        dfname = pd.DataFrame(table)
        code = table[0][0]
        table['Code'] = code
        table = table[1:]
        table.columns = funding_column_names
        table = table[2:]
    return dfname


In [121]:
with pdfplumber.open(file_path) as pdf:
        page = pdf.pages[39]
        table = page.extract_table()
        table = pd.DataFrame(table)
        codeval = table[0][0]
        table['Code'] = codeval
        table = table[1:]
        columns = table.loc[1]
        columns = columns.str.replace('\n', ' ', regex=False)
        table.columns = columns
        table = table[2:]

In [123]:
codeval

'1810N - Other Procurement, Navy'

In [128]:
with pdfplumber.open(file_path) as pdf:
        page = pdf.pages[40]
        table2 = page.extract_table()
        table2 = pd.DataFrame(table2)
        codeval = table2[0][0]
        table2['Code'] = codeval
        table2 = table2[1:]
        columns = table2.loc[1]
        columns = columns.str.replace('\n', ' ', regex=False)
        table2.columns = columns
        table2 = table2[2:]

In [114]:
with pdfplumber.open(file_path) as pdf:
        page = pdf.pages[41]
        table3 = page.extract_table()
        table3 = pd.DataFrame(table3)
        code = table3[0][0]
        table3['Code'] = code
        table3 = table3[1:]
        columns = table3.loc[1]
        columns = columns.str.replace('\n', ' ', regex=False)
        table3.columns = columns
        table3 = table3[2:]

In [118]:
code

'1810N - Other Procurement, Navy'

In [129]:
table2

1,fiscal year,End Item Recurring Flyaway,Non-End Item Recurring Flyaway,Non- Recurring Flyaway,Initial Spares,Depot Activation,Other/ Unallocated,Total TY($M),Weighted Rate,Total CY2017 ($M),"1611N (BLS Hist) - Shipbuilding and Conversion, Navy"
3,2003,,,,,,,-,0.701354,-,1611N (BLS Hist) - Shipbuilding and Conversion...
4,2004,,,,,,,-,0.726797,-,1611N (BLS Hist) - Shipbuilding and Conversion...
5,2005,,,,,,,-,0.759012,-,1611N (BLS Hist) - Shipbuilding and Conversion...
6,2006,,,,,,,-,0.78578,-,1611N (BLS Hist) - Shipbuilding and Conversion...
7,2007,,,,,,,-,0.821887,-,1611N (BLS Hist) - Shipbuilding and Conversion...
8,2008,0.71,,,,,,0.7,0.84988,0.8,1611N (BLS Hist) - Shipbuilding and Conversion...
9,2009,52.35,,,,,,52.4,0.875879,59.8,1611N (BLS Hist) - Shipbuilding and Conversion...
10,2010,36.32,,,,,,36.3,0.90631,40.1,1611N (BLS Hist) - Shipbuilding and Conversion...
11,2011,44.23,,,,,,44.2,0.936042,47.3,1611N (BLS Hist) - Shipbuilding and Conversion...
12,2012,20.26,,,,,,20.3,0.957515,21.2,1611N (BLS Hist) - Shipbuilding and Conversion...


In [125]:
table2

1,fiscal year,End Item Recurring Flyaway,Non-End Item Recurring Flyaway,Non- Recurring Flyaway,Initial Spares,Depot Activation,Other/ Unallocated,Total TY($M),Weighted Rate,Total CY2017 ($M),"1611N (BLS Hist) - Shipbuilding and Conversion, Navy",Code
3,2003,,,,,,,-,0.701354,-,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
4,2004,,,,,,,-,0.726797,-,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
5,2005,,,,,,,-,0.759012,-,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
6,2006,,,,,,,-,0.78578,-,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
7,2007,,,,,,,-,0.821887,-,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
8,2008,0.71,,,,,,0.7,0.84988,0.8,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
9,2009,52.35,,,,,,52.4,0.875879,59.8,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
10,2010,36.32,,,,,,36.3,0.90631,40.1,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
11,2011,44.23,,,,,,44.2,0.936042,47.3,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"
12,2012,20.26,,,,,,20.3,0.957515,21.2,1611N (BLS Hist) - Shipbuilding and Conversion...,"1810N - Other Procurement, Navy"


In [126]:
table3

1,fiscal year,Non-End End Item Item Non- Recurring Recurring Recurring Depot Flyaway Flyaway Flyaway Initial Spares Activation,Other/ Unallocated,Total TY($M),Weighted Rate,Total CY2017 ($M),"1205N - Military Construction, Navy"
3,2003,,,-,0.798714,-,"1205N - Military Construction, Navy"
4,2004,,,-,0.819628,-,"1205N - Military Construction, Navy"
5,2005,,,-,0.842891,-,"1205N - Military Construction, Navy"
6,2006,,,-,0.865753,-,"1205N - Military Construction, Navy"
7,2007,,,-,0.883359,-,"1205N - Military Construction, Navy"
8,2008,,,-,0.898414,-,"1205N - Military Construction, Navy"
9,2009,,15.4,15.4,0.910724,16.9,"1205N - Military Construction, Navy"


In [127]:
table4

1,fiscal year,AAG,Unnamed: 3,Unnamed: 4,Total,"1611N (BLS Hist) - Shipbuilding and Conversion, Navy"
3,Undistributed,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
4,2008,1.0,,,1,1611N (BLS Hist) - Shipbuilding and Conversion...
5,2009,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
6,2010,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
7,2011,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
8,2012,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
9,2013,1.0,,,1,1611N (BLS Hist) - Shipbuilding and Conversion...
10,2014,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
11,2015,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
12,2016,,,,-,1611N (BLS Hist) - Shipbuilding and Conversion...
