In [1]:
#############################################################
#  CELL 1: Markdown
#############################################################

# Example text for a Markdown cell.
# NOTE: this is a code cell whose only statement is the bare string literal
# below, so running it makes the notebook echo the string's repr as the cell
# output instead of rendering it. To get formatted headings and lists, paste
# the text between the triple quotes into a real Markdown cell.
"""
# Data Quality Report for N-PX CSV Outputs

This notebook performs basic data quality checks on the CSV files generated
by the N-PX parsing workflow. We’ll look for:

1. **Row counts**: Ensure each table has at least some rows (unless empty is valid).
2. **Column-level stats**: For each column, how many nulls or missing values?
3. **Duplicate checks**: Are there any suspicious duplicates (e.g., form_id repeated with same data)?
4. **Value distributions**: (Optional) Look at distinct categories in certain key fields.

Feel free to add more custom checks, such as:
- Checking valid ranges for numeric columns.
- Checking that foreign key relationships align (e.g., `vote_id` in `proxy_voting_record_category` is found in `proxy_voting_record`).
- Checking that enumerated columns contain only valid values.

Let's begin!
"""


"\n# Data Quality Report for N-PX CSV Outputs\n\nThis notebook performs basic data quality checks on the CSV files generated\nby the N-PX parsing workflow. We’ll look for:\n\n1. **Row counts**: Ensure each table has at least some rows (unless empty is valid).\n2. **Column-level stats**: For each column, how many nulls or missing values?\n3. **Duplicate checks**: Are there any suspicious duplicates (e.g., form_id repeated with same data)?\n4. **Value distributions**: (Optional) Look at distinct categories in certain key fields.\n\nFeel free to add more custom checks, such as:\n- Checking valid ranges for numeric columns.\n- Checking that foreign key relationships align (e.g., `vote_id` in `proxy_voting_record_category` is found in `proxy_voting_record`).\n- Checking that enumerated columns contain only valid values.\n\nLet's begin!\n"

In [2]:
#############################################################
#  CELL 2: Imports & Setup
#############################################################

import os
import pandas as pd

# Folder that holds the CSV files to check. A relative path is resolved
# against the notebook's current working directory.
CSV_FOLDER = "output"

# Logical table name -> CSV file name inside CSV_FOLDER.
CSV_FILES = {
    "form_npx": "form_npx.csv",
    "institutional_manager": "institutional_manager.csv",
    "series": "series.csv",
    "proxy_voting_record": "proxy_voting_record.csv",
    "matter_category": "matter_category.csv",
    "proxy_voting_record_category": "proxy_voting_record_category.csv",
    "voting_record_manager": "voting_record_manager.csv",
    "voting_record_series": "voting_record_series.csv",
}

# Load every table into a dict of DataFrames keyed by table name.
# A missing file is stored as None (rather than left out) so that every key of
# CSV_FILES can still be looked up in `dfs` by the cells that follow.
dfs = {}
for table_name, file_name in CSV_FILES.items():
    path = os.path.join(CSV_FOLDER, file_name)
    # isfile (not exists): a directory that happens to carry the CSV's name
    # must be reported as missing instead of being handed to read_csv.
    if os.path.isfile(path):
        # low_memory=False makes pandas infer each column's dtype from the
        # whole file instead of chunk by chunk. Chunked inference is what
        # yields mixed-type columns (and a DtypeWarning) on the larger tables.
        df = pd.read_csv(path, low_memory=False)
        dfs[table_name] = df
    else:
        print(f"[Warning] {file_name} not found in {CSV_FOLDER}!")
        dfs[table_name] = None

# One summary line per table so gaps are visible before any check runs.
print("DataFrames loaded:")
for k, v in dfs.items():
    if v is not None:
        print(f" - {k}: {len(v)} rows")
    else:
        print(f" - {k}: No data (file missing).")


  df = pd.read_csv(path)


DataFrames loaded:
 - form_npx: 100 rows
 - institutional_manager: 28 rows
 - series: 110 rows
 - proxy_voting_record: 185277 rows
 - matter_category: 15 rows
 - proxy_voting_record_category: 195763 rows
 - voting_record_manager: 52029 rows
 - voting_record_series: 177519 rows


In [3]:
#############################################################
#  CELL 3: Basic Data Quality Checks
#############################################################

# We’ll create a function to do quick checks on a given DataFrame.
def data_quality_report(df, df_name, pk_columns=None):
    """
    Print a data quality summary for one table.

    Sections printed, in order:
    1. Number of rows & columns
    2. Column data types & non-null counts (``DataFrame.info``)
    3. Missing values per column (absolute count / percent)
    4. Sample records (first 3 rows)
    5. Number of fully duplicated rows
    6. Duplicate check on ``pk_columns`` (only when ``pk_columns`` is given)

    Parameters
    ----------
    df : pandas.DataFrame or None
        Table to inspect. For ``None`` or an empty frame only the header and
        a short notice are printed.
    df_name : str
        Label used in the report header and in warning messages.
    pk_columns : str or sequence of str, optional
        Column name(s) that together are expected to identify a row uniquely.
        When omitted (the default) section 6 is skipped.

    Returns
    -------
    None
        The report is written to stdout; nothing is returned and ``df`` is
        not modified.
    """
    print(f"\n=== Data Quality Report: {df_name} ===")

    if df is None or df.empty:
        print("No data or empty DataFrame!")
        return

    # 1) Shape
    print(f"Shape: {df.shape[0]} rows x {df.shape[1]} columns")

    # 2) Info
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    print("\n--- Info ---")
    df.info()

    # 3) Missing values
    print("\n--- Missing Values (absolute / percent) ---")
    is_missing = df.isna()  # computed once, reused for both statistics
    missing_count = is_missing.sum()
    missing_percent = (is_missing.mean() * 100).round(2)
    for col in df.columns:
        print(f"   {col}: {missing_count[col]} / {missing_percent[col]}%")

    # 4) Quick sample
    print("\n--- Sample Rows ---")
    print(df.head(3))

    # 5) Fully duplicated rows. Only reported here, never removed.
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        print(f"\n[Warning] {dup_count} duplicate row(s) found in {df_name}.")
    else:
        print("\nNo duplicate rows found.")

    # 6) Optional key uniqueness check.
    if pk_columns is None:
        return
    # Accept a single column name as well as a sequence of names.
    key_cols = [pk_columns] if isinstance(pk_columns, str) else list(pk_columns)
    if not key_cols:
        return
    absent = [col for col in key_cols if col not in df.columns]
    if absent:
        print(f"[Warning] Key column(s) {absent} not found in {df_name}; key check skipped.")
        return
    key_label = ", ".join(key_cols)
    pk_dup_count = df.duplicated(subset=key_cols).sum()
    if pk_dup_count > 0:
        print(f"[Warning] {key_label} has {pk_dup_count} duplicates!")
    else:
        print(f"{key_label} column is unique.")


# Print one quality report for every entry in dfs (missing tables included,
# so their absence shows up in the output as well).
for table_name in dfs:
    df = dfs[table_name]
    data_quality_report(df, table_name)



=== Data Quality Report: form_npx ===
Shape: 100 rows x 39 columns

--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   form_type                       100 non-null    object 
 1   registrant_type                 100 non-null    object 
 2   live_test_flag                  100 non-null    object 
 3   cik                             100 non-null    int64  
 4   phone_number                    100 non-null    object 
 5   investment_company_type         30 non-null     object 
 6   conformed_period                100 non-null    object 
 7   year_or_quarter                 100 non-null    object 
 8   report_calendar_year            99 non-null     float64
 9   report_quarter_year             1 non-null      float64
 10  report_type                     100 non-null    object 
 11  confidential_tre

In [4]:
#############################################################
#  CELL 4: Additional Cross-Checks (Optional)
#############################################################

# Example cross-check: every non-null proxy_voting_record_category.vote_id
# must also appear in proxy_voting_record.vote_id.

# Upper bound on how many offending IDs are printed, so a badly broken export
# cannot flood the cell output.
MAX_MISSING_IDS_SHOWN = 20

parent_records = dfs["proxy_voting_record"]
category_records = dfs["proxy_voting_record_category"]

if (
    parent_records is None
    or parent_records.empty
    or category_records is None
    or category_records.empty
):
    # Say so explicitly: without this message a skipped check produces no
    # output at all and cannot be told apart from a cell that never ran.
    print("\n[Cross-check] Skipped: proxy_voting_record or proxy_voting_record_category has no data.")
else:
    # Put them in sets for quick membership test
    valid_vote_ids = set(parent_records["vote_id"].dropna().unique())
    cat_vote_ids = set(category_records["vote_id"].dropna().unique())

    missing_vote_ids = cat_vote_ids - valid_vote_ids
    if missing_vote_ids:
        print(f"\n[Cross-check] Found {len(missing_vote_ids)} vote_id(s) in proxy_voting_record_category not in proxy_voting_record.")
        # Sort by text form only to make the preview deterministic between
        # runs; sets have no stable order and the IDs may not be mutually
        # comparable.
        preview = sorted(missing_vote_ids, key=str)[:MAX_MISSING_IDS_SHOWN]
        print("Missing IDs:", ", ".join(str(vote_id) for vote_id in preview))
        hidden = len(missing_vote_ids) - len(preview)
        if hidden > 0:
            print(f"... and {hidden} more")
    else:
        print("\n[Cross-check] All category vote_ids exist in proxy_voting_record!")

# You can similarly check that 'form_id' in institutional_manager, series, etc.
# actually appears in form_npx.



[Cross-check] All category vote_ids exist in proxy_voting_record!
