In [95]:
import os
import pandas as pd
import numpy as np
import random
import logging

# Do not truncate columns when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)

In [13]:
# Route WARNING-and-above records from the root logger to a log file, each
# line prefixed with a timestamp and the level name.
logging.basicConfig(filename='household_data_mismatches.log', level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')

In [109]:
def process_census_data(df):
    """
    Process the census data to create a lookup for working probabilities based on gender and prefecture.

    The prefecture key is the first two characters of the part of '地域名'
    that precedes the first "-"; the gender key is the part of '男女' that
    precedes the first "_". The input DataFrame is not modified.

    Parameters:
    - df: DataFrame containing the census data, with the columns '地域名',
      '男女' and '労働力率'.

    Returns:
    - A dictionary with (gender, prefecture) tuples as keys and the
      corresponding '労働力率' values as values.
    """
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain 'pref' and 'gender' columns.
    keyed = df.assign(
        pref=df['地域名'].str.split("-").str[0].str[:2],
        gender=df['男女'].str.split("_").str[0],
    )
    return keyed.set_index(['gender', 'pref'])['労働力率'].to_dict()

def transform_pref_code(city_code):
    """
    Transform the city code to a two-character prefecture code.

    The code is left-padded with zeros to five characters (restoring a
    leading zero lost when the code was stored as a number) and its first
    two characters are returned.

    Parameters:
    - city_code: The city code as an integer or string (e.g. 1101, '01101').
      Whole-number floats such as 13101.0 are accepted as well.

    Returns:
    - The prefecture code as a string.
    """
    # A float such as 13101.0 would stringify as '13101.0'; drop the
    # fractional part first so the digits line up.
    if isinstance(city_code, float) and city_code.is_integer():
        city_code = int(city_code)
    city_code_str = str(city_code).strip()
    return city_code_str.zfill(5)[:2]

def get_occupation(age, gender, prefecture, working_probability_lookup):
    """
    Determine the working status based on age, gender, and prefecture.

    Ages up to 18 map to fixed codes by age band (<=4: 10, <=6: 11, <=12: 12,
    <=15: 13, <=18: 14). Ages 19-64 draw randomly between codes 23 and 21,
    where the looked-up rate is the probability of drawing 23. Ages above 64
    always return 23.

    Parameters:
    - age: Age of the individual.
    - gender: Gender of the individual (converted with str(int(gender)) to
      form the lookup key).
    - prefecture: Prefecture code of the individual's location.
    - working_probability_lookup: A dictionary mapping (gender, prefecture)
      string tuples to a rate expressed in percent (0-100).

    Returns:
    - An occupation code.
    """
    if age <= 4:
        return 10
    elif age <= 6:
        return 11
    elif age <= 12:
        return 12
    elif age <= 15:
        return 13
    elif age <= 18:
        return 14
    elif age <= 64:
        key = (str(int(gender)), str(prefecture))
        rate = working_probability_lookup.get(key)
        if rate is None:
            # No census entry for this key: report it and fall back to an
            # even split. The fallback is already a probability, so it must
            # not be divided by 100 like the percentage values are.
            print(key)
            working_probability = 0.5
        else:
            working_probability = rate / 100.0
        # NOTE(review): code 23 is drawn with the looked-up rate and is also
        # the fixed code for ages above 64 — confirm that the 23/21 order
        # matches the intended meaning of the codes.
        return np.random.choice([23, 21], p=[working_probability, 1 - working_probability])
    else:
        return 23

def create_individual(base_info, age, gender, pref_code, person_counter, working_probability_lookup):
    """
    Build one person's record from the household's shared fields.

    Parameters:
    - base_info: A dictionary of fields shared by every member of the household; it is copied, not modified.
    - age: Base age of the individual; a random integer from 0 to 4 (inclusive) is added to it.
    - gender: Gender of the individual.
    - pref_code: Prefecture code of the individual's location.
    - person_counter: Counter value stored, as a string, in 'person_id'.
    - working_probability_lookup: A dictionary for looking up working probabilities.

    Returns:
    - A new dictionary holding the shared fields plus 'person_id', 'age',
      'gender' and 'occupation'.
    """
    # Spread the base age by a random 0-4 offset before deriving the occupation.
    jittered_age = age + random.randint(0, 4)

    record = base_info.copy()
    record.update({
        'person_id': f'{person_counter}',
        'age': jittered_age,
        'gender': gender,
        'occupation': get_occupation(jittered_age, gender, pref_code, working_probability_lookup),
    })
    return record

def generate_individuals(household_data, city_code, working_probability_lookup, person_counter):
    """
    Generate individual records from household data.

    For every household row, one record is created for each of the roles
    'hh', 'spouse', 'parent_male' and 'parent_female' whose age and gender
    columns are both present, and for each child / other member whose gender
    and age cells are both present. A warning is logged when the number of
    records created for a household differs from its 'n_household_member'.

    Parameters:
    - household_data: DataFrame containing household data.
    - city_code: City code for the household location.
    - working_probability_lookup: A dictionary for looking up working probabilities.
    - person_counter: First value to use for 'person_id'; it is incremented
      for every individual created and keeps running across households.

    Returns:
    - A DataFrame with individual records.
    """
    individuals = []
    columns = household_data.columns
    n_columns = len(columns)

    # These depend only on the city and the column layout, not on the row, so
    # compute them once rather than once per household.
    pref_code = transform_pref_code(city_code)
    member_column_ranges = []
    for member_type, col_step in [('child', 4), ('others', 5)]:
        start_index = columns.get_loc(f"gender_code_1st_{member_type}")
        if member_type == 'child':
            end_index = columns.get_loc(f"{member_type}_counts_15th_grandchild")
        else:
            end_index = columns.get_loc(f"generation_flag_15th_{member_type}")
        # Each member entry starts with its gender column followed by its age
        # column; col_step is the distance between consecutive entries.
        member_column_ranges.append(range(start_index, end_index, col_step))

    for _, row in household_data.iterrows():
        base_info = {
            'household_id': f'{city_code}_{int(row["gid"])}',
            'family_type': row['family_group_code'],
            'city_code': row['city_code'],
            'lon': row['lon'],
            'lat': row['lat'],
        }
        # Remember where this household starts so its members can be counted
        # independently of the running person_counter.
        members_before = len(individuals)

        # Process householder, spouse and parents
        for role in ['hh', 'spouse', 'parent_male', 'parent_female']:
            age_col = f'age_code_{role}'
            gender_col = f'gender_code_{role}'

            if pd.notna(row[age_col]) and pd.notna(row[gender_col]):
                individual = create_individual(base_info, row[age_col], row[gender_col], pref_code, person_counter, working_probability_lookup)
                individuals.append(individual)
                person_counter += 1

        # Process children and other members if applicable
        for column_range in member_column_ranges:
            for i in column_range:
                if i + 2 >= n_columns:  # Ensure we don't go beyond the DataFrame's columns
                    break

                gender, age = row.iloc[i], row.iloc[i + 1]
                if pd.notna(gender) and pd.notna(age):
                    individual = create_individual(base_info, age, gender, pref_code, person_counter, working_probability_lookup)
                    individuals.append(individual)
                    person_counter += 1

        # Check if the number of individuals generated for THIS household
        # matches the expected number (person_counter is cumulative and must
        # not be used for this comparison).
        generated = len(individuals) - members_before
        if generated != int(row['n_household_member']):
            logging.warning(f"Mismatch in household {row['gid']} in city {city_code}: Expected {row['n_household_member']}, got {generated}")

    return pd.DataFrame(individuals)

In [None]:
household_path = '/mnt/large/data/household_estimation_data_2020/世帯推計データ(加工済み)_202401/sisetu_plus_all/'
pseudo_pop_path = '/mnt/large/data/PseudoPFLOW/ver2.0/Population/'

# Load census data for working probabilities
census_data_path = '/mnt/large/data/PseudoPFLOW/Processing/2020NationalCensusLabor.csv'
census_data = pd.read_csv(census_data_path)
working_probability_lookup = process_census_data(census_data)

# Column dtypes enforced on every generated person table before it is written.
dtype_conversion = {
    'household_id': 'str',
    'family_type': 'Int64',
    'city_code': 'Int64',
    'age': 'Int64', 
    'gender': 'Int64'
}

# Starting person_id passed to generate_individuals for every file.
person_counter = 1

for subdir, dirs, files in os.walk(household_path):
    dirs.sort()  # Sort directories in-place so the walk order is deterministic
    files.sort()  # Sort files in-place

    # Mirror the input folder name under the output root.
    pref_folder = os.path.basename(subdir)
    pseudo_dir_path = os.path.join(pseudo_pop_path, pref_folder)

    # Check if the directory exists, if not, create it
    if not os.path.exists(pseudo_dir_path):
        os.makedirs(pseudo_dir_path)
        print(f"Created directory: {pseudo_dir_path}")

    for file in files:
        # Anything that is not a CSV (hidden files, notes, ...) would make
        # read_csv fail and abort the whole run, so skip it explicitly.
        if not file.lower().endswith('.csv'):
            print(f"Skipping non-CSV file: {file}")
            continue

        print(file, person_counter)
        # The city code is the part of the file name before the first "_".
        city_code = file.split('_')[0]
        household_data_path = os.path.join(subdir, file)
        household_data = pd.read_csv(household_data_path)

        # Generate individual records
        try:
            individuals_df = generate_individuals(household_data, city_code, working_probability_lookup, person_counter)
            individuals_df = individuals_df.astype(dtype_conversion)
            individuals_df.to_csv(os.path.join(pseudo_dir_path, 'person_' + city_code + '.csv'), index=False)
        except KeyError as e:
            # The missing column may belong to the household input or to the
            # generated table (e.g. when no individuals were produced).
            print(f"Error: {e}. An expected column is missing while processing file {file}.")

01101_household_estimation_data_2020.csv 1
01102_household_estimation_data_2020.csv 1
01103_household_estimation_data_2020.csv 1
01104_household_estimation_data_2020.csv 1
01105_household_estimation_data_2020.csv 1
01106_household_estimation_data_2020.csv 1
01107_household_estimation_data_2020.csv 1
01108_household_estimation_data_2020.csv 1
01109_household_estimation_data_2020.csv 1
01110_household_estimation_data_2020.csv 1
01202_household_estimation_data_2020.csv 1
01203_household_estimation_data_2020.csv 1
01204_household_estimation_data_2020.csv 1
01205_household_estimation_data_2020.csv 1
01206_household_estimation_data_2020.csv 1
01207_household_estimation_data_2020.csv 1
01208_household_estimation_data_2020.csv 1
01209_household_estimation_data_2020.csv 1
01210_household_estimation_data_2020.csv 1
01211_household_estimation_data_2020.csv 1
01212_household_estimation_data_2020.csv 1
01213_household_estimation_data_2020.csv 1
01214_household_estimation_data_2020.csv 1
01215_house