In [23]:
import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.utils.initial_generation_statistics import missing_person_patterns


In [24]:
# COMBINE MP
import glob
import pandas as pd
import os
import numpy as np

def concatenate_full_data_csvs(output_filename='../../data/full/fake_mp_data_correct.csv'):
    file_paths = glob.glob('../../data/**/processed/add_circumstances/mp_full_data.csv', recursive=True)
    print(file_paths)
    if not file_paths:
        print("No 'dataframe_circumstances.csv' files found in any subdirectories.")
        return
    
    filtered_file_paths = [
        f for f in file_paths
        if 'processed' in os.path.normpath(f).replace(os.sep, '/')
    ]
    
    if not filtered_file_paths:
        print("No 'full_data.csv' files matching the 'add_cir' pattern were found.")
        return
    
    print(f"Found {len(filtered_file_paths)} matching files:")
    for f in filtered_file_paths:
        print(f"- {f}")
    
    all_dataframes = []
    for file_path in filtered_file_paths:
        try:
            df = pd.read_csv(file_path)
            all_dataframes.append(df)
        except Exception as e:
            print(f"Error reading {file_path}: {e}. Skipping this file.")
    
    if not all_dataframes:
        print("No DataFrames were successfully read to concatenate.")
        return
    
    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    
    # Convert float columns that are actually integers back to integers
    for col in combined_df.columns:
        if combined_df[col].dtype == 'float64':
            # Check if all non-null values are whole numbers
            non_null_values = combined_df[col].dropna()
            if len(non_null_values) > 0 and all(non_null_values == non_null_values.astype(int)):
                # Convert to nullable integer type to handle potential NaN values
                combined_df[col] = combined_df[col].astype('Int64')
    
    # Save the combined DataFrame to a new CSV file
    try:
        combined_df.to_csv(output_filename, index=False)
        print(f"\nAll matching 'full_data.csv' files have been concatenated into '{output_filename}'.")
        print(f"Combined DataFrame shape: {combined_df.shape}")
        
        # Print data types for verification
        print("\nColumn data types:")
        for col, dtype in combined_df.dtypes.items():
            print(f"  {col}: {dtype}")
            
    except Exception as e:
        print(f"Error saving the combined CSV to '{output_filename}': {e}")

    ids = set()
    for f in filtered_file_paths:
        ids.add(f.split('/')[3])
    return ids
id_mp=concatenate_full_data_csvs()
id_mp

['../../data/4386/processed/add_circumstances/mp_full_data.csv', '../../data/1864/processed/add_circumstances/mp_full_data.csv', '../../data/5433/processed/add_circumstances/mp_full_data.csv', '../../data/8542/processed/add_circumstances/mp_full_data.csv', '../../data/4512/processed/add_circumstances/mp_full_data.csv', '../../data/4182/processed/add_circumstances/mp_full_data.csv', '../../data/6681/processed/add_circumstances/mp_full_data.csv', '../../data/157/processed/add_circumstances/mp_full_data.csv', '../../data/2470/processed/add_circumstances/mp_full_data.csv', '../../data/2817/processed/add_circumstances/mp_full_data.csv', '../../data/8723/processed/add_circumstances/mp_full_data.csv', '../../data/6827/processed/add_circumstances/mp_full_data.csv', '../../data/8940/processed/add_circumstances/mp_full_data.csv', '../../data/2674/processed/add_circumstances/mp_full_data.csv', '../../data/1839/processed/add_circumstances/mp_full_data.csv', '../../data/3938/processed/add_circumsta

{'1019',
 '1077',
 '1143',
 '1160',
 '1188',
 '1195',
 '1210',
 '1216',
 '1274',
 '1276',
 '1289',
 '1314',
 '1375',
 '1381',
 '1466',
 '1488',
 '157',
 '1612',
 '1618',
 '1634',
 '1682',
 '1724',
 '1727',
 '1790',
 '1799',
 '1839',
 '1864',
 '190',
 '1940',
 '1953',
 '1961',
 '1976',
 '1989',
 '2043',
 '2063',
 '2075',
 '212',
 '2280',
 '2324',
 '2334',
 '2338',
 '2454',
 '2458',
 '2470',
 '2516',
 '2528',
 '253',
 '2641',
 '266',
 '2667',
 '2674',
 '2681',
 '2686',
 '270',
 '2738',
 '281',
 '2817',
 '283',
 '2854',
 '2910',
 '2915',
 '2916',
 '2923',
 '2937',
 '2943',
 '2971',
 '3051',
 '3145',
 '3148',
 '3171',
 '3183',
 '3199',
 '3213',
 '3298',
 '3299',
 '3319',
 '3394',
 '3419',
 '3465',
 '3518',
 '3520',
 '3602',
 '3614',
 '3633',
 '3651',
 '3705',
 '3717',
 '3795',
 '3809',
 '3811',
 '3815',
 '3827',
 '3853',
 '386',
 '3891',
 '3938',
 '3947',
 '3956',
 '4009',
 '403',
 '4115',
 '4142',
 '416',
 '4182',
 '42',
 '422',
 '4260',
 '4295',
 '4314',
 '4350',
 '4386',
 '4392',
 '4412

In [25]:
import glob
import pandas as pd
import os

def concatenate_full_data_csvs(output_filename='../../data/full/fake_vp_data_correct.csv'):
    file_paths = glob.glob('../../data/**/processed/vpd/vpd_full_data.csv', recursive=True)
    print(file_paths)
    if not file_paths:
        print("No 'full_data.csv' files found in any subdirectories.")
        return

    filtered_file_paths = [
        f for f in file_paths
        if 'vpd' in os.path.normpath(f).replace(os.sep, '/')
    ]

    if not filtered_file_paths:
        print("No 'full_data.csv' files matching the 'add_cir' pattern were found.")
        print("Ensure your directory structure is like 'your_id-add_cir/full_data.csv'.")
        return

    print(f"Found {len(filtered_file_paths)} matching files:")
    for f in filtered_file_paths:
        print(f"- {f}")

    all_dataframes = []
    # Read each filtered CSV file into a pandas DataFrame
    for file_path in filtered_file_paths:
        try:
            df = pd.read_csv(file_path)
            all_dataframes.append(df)
        except Exception as e:
            print(f"Error reading {file_path}: {e}. Skipping this file.")

    if not all_dataframes:
        print("No DataFrames were successfully read to concatenate.")
        return

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    for col in combined_df.columns:
        if combined_df[col].dtype == 'float64':
            # Check if all non-null values are whole numbers
            non_null_values = combined_df[col].dropna()
            if len(non_null_values) > 0 and all(non_null_values == non_null_values.astype(int)):
                # Convert to nullable integer type to handle potential NaN values
                combined_df[col] = combined_df[col].astype('Int64')
    # Save the combined DataFrame to a new CSV file
    try:
        start_number = 100001

        # Fix for the incident ids
        new_id_sequence = np.arange(start_number, start_number + len(combined_df))
        combined_df['VPD_NOMINALINCIDENTID_PK'] = new_id_sequence
        combined_df
        combined_df.to_csv(output_filename, index=False)
        print(f"\nAll matching 'full_data.csv' files have been concatenated into '{output_filename}'.")
        print(f"Combined DataFrame shape: {combined_df.shape}")
    except Exception as e:
        print(f"Error saving the combined CSV to '{output_filename}': {e}")
    
    ids = set()
    for f in filtered_file_paths:
        ids.add(f.split('/')[3])
    return ids
ids_vp = concatenate_full_data_csvs()

['../../data/4386/processed/vpd/vpd_full_data.csv', '../../data/1864/processed/vpd/vpd_full_data.csv', '../../data/5433/processed/vpd/vpd_full_data.csv', '../../data/8542/processed/vpd/vpd_full_data.csv', '../../data/4512/processed/vpd/vpd_full_data.csv', '../../data/4182/processed/vpd/vpd_full_data.csv', '../../data/6681/processed/vpd/vpd_full_data.csv', '../../data/157/processed/vpd/vpd_full_data.csv', '../../data/2470/processed/vpd/vpd_full_data.csv', '../../data/2817/processed/vpd/vpd_full_data.csv', '../../data/8723/processed/vpd/vpd_full_data.csv', '../../data/6827/processed/vpd/vpd_full_data.csv', '../../data/8940/processed/vpd/vpd_full_data.csv', '../../data/2674/processed/vpd/vpd_full_data.csv', '../../data/1839/processed/vpd/vpd_full_data.csv', '../../data/3938/processed/vpd/vpd_full_data.csv', '../../data/1466/processed/vpd/vpd_full_data.csv', '../../data/962/processed/vpd/vpd_full_data.csv', '../../data/9062/processed/vpd/vpd_full_data.csv', '../../data/6875/processed/vpd/v

In [1]:
# sanity check to see if generated for all
list(id_mp-ids_vp)

NameError: name 'id_mp' is not defined