In [None]:
import json
import pandas as pd

In [None]:
# Convert JSON Responses
# Each entry of extracted_sections.json carries a "Response" field holding a
# JSON *string*; decode it and collect every "sections" list into one flat list.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's locale default.
with open("extracted_sections.json", "r", encoding="utf-8") as f:
    pages_data = json.load(f)

sections_data = []
for page in pages_data:
    try:
        response_dict = json.loads(page["Response"])
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a "Response" that is not a str/bytes value (e.g. None);
        # report it like any other unparsable page instead of aborting the merge.
        print(f"Error parsing JSON for page {page['Page']}")
        continue
    if "sections" in response_dict:
        sections_data.extend(response_dict["sections"])

# Persist the merged list so the intermediate result can be inspected on disk.
with open("merged_sections.json", "w", encoding="utf-8") as f:
    json.dump({"sections": sections_data}, f, indent=4)

# Convert JSON to DataFrame
df = pd.DataFrame(sections_data)

# Handle "use_prev_section" placeholders from GPT parsed text.
# A placeholder row holds text that continues the closest preceding real
# section (a section spanning multiple pages). Walk every row once, remember
# the most recent real section, and append each placeholder's body to it.
# The scan covers the final row as well, so a continuation at the very end of
# the document is merged and removed like any other.
to_drop = []
anchor_idx = None  # index of the most recent non-placeholder row
for idx in range(len(df)):
    if df.at[idx, "Section"] != "use_prev_section":
        anchor_idx = idx
    elif anchor_idx is not None:
        df.at[anchor_idx, "Section Body Text"] += "\n" + df.at[idx, "Section Body Text"]
        to_drop.append(idx)
    # Placeholders that appear before any real section have nothing to attach
    # to; they are left in place rather than silently discarding their text.

# Drop all merged placeholder rows from final DataFrame
df = df.drop(index=to_drop).reset_index(drop=True)

In [None]:
# Assign Parent Section: remove last occurrence of '.' and all digits after it
def compute_parent_section(section):
    """Return the parent of a dotted section identifier.

    The parent is everything before the last '.', e.g. "1.2.3" -> "1.2".
    An identifier without any '.' has no parent and yields "".
    """
    if "." not in section:
        return ""
    head, _tail = section.rsplit(".", 1)
    return head
df["Parent Section"] = df["Section"].apply(compute_parent_section)

# Assign Level: count number of decimals in Section
df["Level"] = df["Section"].apply(lambda x: x.count("."))

# Define last page number of document.
# Coerce to numeric first: taking max() of the raw column would compare
# string page numbers lexicographically ("9" > "10").
start_pages = pd.to_numeric(df["Start Page"], errors="coerce")
if start_pages.isna().all():
    raise ValueError("No numeric 'Start Page' values found; cannot determine the last page")
last_page = int(start_pages.max())

# Create subset DF for End Page computations
df_sub = df[["Parent Section", "Section", "Start Page", "Level", "Starts On New Page"]].copy()
df_sub["Original Index"] = df_sub.index
df_sub["Start Page"] = start_pages.astype("Int64")
df_sub["End Page"] = df_sub["Start Page"]
# Capture, in original (document) order, where the row that follows each
# section starts; the final row gets a sentinel one past the last page.
df_sub["Next Seq Start Page"] = df_sub["Start Page"].shift(-1).fillna(last_page + 1)
df_sub["Next Seq Starts On New Page"] = df_sub["Starts On New Page"].shift(-1).fillna(False).astype(bool)


def _natural_section_key(section):
    """Build a sort key that orders dotted identifiers numerically ("1.2" before "1.10").

    Decimal components compare as integers and sort ahead of non-numeric
    components, which compare as plain strings.
    """
    return tuple(
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in section.split(".")
    )


# Sort DataFrame by Level, Parent Section, and Section Number.
# A plain string sort would place "1.10" ahead of "1.2" and pair rows with the
# wrong sibling, so the positional order is derived from numeric-aware keys.
levels = df_sub["Level"].tolist()
parent_keys = df_sub["Parent Section"].map(_natural_section_key).tolist()
section_keys = df_sub["Section"].map(_natural_section_key).tolist()
sorted_positions = sorted(
    range(len(df_sub)),
    key=lambda pos: (levels[pos], parent_keys[pos], section_keys[pos]),
)
df_sub = df_sub.iloc[sorted_positions].reset_index(drop=True)

# Loop through the DataFrame to assign End Page.
# Every row except the last is compared with the row that follows it in sorted
# order; the last row is handled separately after the loop.
for i in range(len(df_sub) - 1):
    curr_row = df_sub.iloc[i]
    next_row = df_sub.iloc[i + 1]

    if curr_row["Level"] != next_row["Level"]:
        # Different Level: last row of its level in sorted order.
        end_page = last_page
    elif curr_row["Parent Section"] == next_row["Parent Section"]:
        # Same Level & Same Parent: bounded by the next sibling's start page.
        next_start = next_row["Start Page"]
        end_page = next_start - 1 if next_row["Starts On New Page"] else next_start
    else:
        # Same Level but Different Parent: bounded by whatever row follows
        # this one in the original document order.
        next_start = curr_row["Next Seq Start Page"]
        end_page = next_start - 1 if curr_row["Next Seq Starts On New Page"] else next_start

    df_sub.at[i, "End Page"] = end_page

# The final row in sorted order has no successor to compare against.
df_sub.at[len(df_sub) - 1, "End Page"] = last_page

# Restore original index order before merging
df_sub = df_sub.sort_values(by="Original Index").set_index("Original Index")
df["End Page"] = df_sub["End Page"]
df = df[["Parent Section", "Section", "Section Title", "Section Body Text", "Start Page", "End Page"]]

# Save final DataFrame as CSV
df.to_csv("final_output_from_json_GPT.csv", index=False, encoding="utf-8-sig")