In [1]:
import os
import json
import pandas as pd

# Lift pandas' display limits so rendered DataFrames show every column and row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Nested columns to flatten, in the order their expanded columns are appended.
# The flag marks columns whose cells hold a *list* of JSON objects (one output
# row per element) rather than a single JSON object.
_NESTED_COLUMNS = (
    ('branding', True),
    ('description', False),
    ('flags', False),
    ('lead_attributes', False),
    ('location.address', False),
    ('location.county', False),
    ('products', False),
    ('location.address.coordinate', False),
    ('source.agents', True),
    ('other_listings.rdc', True),
)


def _expand_nested_column(df, column, explode=False):
    """Replace `column` with one `'<column>.<key>'` column per nested key.

    Parameters:
    - df: DataFrame whose index is a default RangeIndex.
    - column: str, name of the column to expand; if absent, `df` is returned as is.
    - explode: bool, when True each list element in `column` first gets its own
      row (the other columns are repeated for that row).

    Returns:
    - DataFrame without `column`, plus the expanded columns (if any).
    """
    if column not in df.columns:
        return df

    if explode:
        # Explode the whole frame (not just the column) so every other field
        # stays attached to the element it belongs to; ignore_index keeps a
        # clean RangeIndex for the positional join below.
        df = df.explode(column, ignore_index=True)

    # Entries that are not dicts (None, NaN, scalars) contribute an empty row.
    records = [value if isinstance(value, dict) else {} for value in df[column]]
    expanded = pd.json_normalize(records)

    df = df.drop(columns=[column])
    if expanded.shape[1] == 0:
        # Nothing to expand: the column held no JSON objects at all.
        return df

    # `records` was built row-by-row from `df`, so positions line up exactly.
    expanded.index = df.index
    expanded.columns = [f'{column}.{name}' for name in expanded.columns]
    return df.join(expanded)


def _cast_boolean_columns(df):
    """Cast object columns whose non-null values are all Python bools.

    Columns without missing values become `bool`; columns with missing values
    become pandas' nullable `boolean` dtype so the gaps stay missing instead of
    being coerced to True/False. All-null columns are left untouched.
    """
    for column in df.select_dtypes(include=['object']).columns:
        non_null = df[column].dropna()
        if non_null.empty:
            continue
        if not all(isinstance(value, bool) for value in non_null):
            continue
        target_dtype = 'bool' if len(non_null) == len(df) else 'boolean'
        df[column] = df[column].astype(target_dtype)
    return df


def process_json_files(directory_path, output_csv_path):
    """
    Process JSON files in the specified directory, normalize the data, and save it to a CSV file.

    Every `*.json` file in `directory_path` is read (as UTF-8, in sorted file-name
    order), the records under `data -> results` are flattened with
    `pd.json_normalize`, and the nested columns listed in `_NESTED_COLUMNS` are
    expanded into `'<column>.<key>'` columns. List-valued columns ('branding',
    'source.agents', 'other_listings.rdc') produce one output row per list
    element, so a record with several elements appears on several rows. The
    'tags' column is not normalized.

    Files without any records under `data -> results` are skipped with a
    message. If no file yields any records, nothing is written.

    Parameters:
    - directory_path: str, path to the directory containing JSON files.
    - output_csv_path: str, path to save the output CSV file.

    Returns:
    - None
    """
    # List all JSON files in the specified directory (sorted for a stable row order)
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.json'))
    print("Files in directory:")
    print(files)

    # One flattened DataFrame per input file
    df_list = []

    for file_name in files:
        file_path = os.path.join(directory_path, file_name)

        # Read explicitly as UTF-8 rather than the platform's default encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        payload = data.get('data') if isinstance(data, dict) else None
        results = payload.get('results') if isinstance(payload, dict) else None
        if not results:
            print(f"Skipping {file_name}: no results found")
            continue

        # First level normalization
        df = pd.json_normalize(results)

        # Second and third level normalization
        for column, is_list in _NESTED_COLUMNS:
            df = _expand_nested_column(df, column, explode=is_list)

        # Handle duplicate office_name values (order-preserving de-duplication)
        if 'source.agents.office_name' in df.columns:
            df['source.agents.office_name'] = df['source.agents.office_name'].apply(
                lambda names: list(dict.fromkeys(names)) if isinstance(names, list) else names
            )

        df_list.append(df)

    if not df_list:
        print("No data found; nothing was saved.")
        return

    # Combine all dataframes
    df = pd.concat(df_list, ignore_index=True)

    # Cast once on the combined frame so a column's dtype does not depend on
    # which file happened to contain missing values.
    df = _cast_boolean_columns(df)

    # Save the combined DataFrame to a CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"Data saved to {output_csv_path}")

In [5]:
# Example run. NOTE: both paths are absolute, machine-specific Windows paths;
# the CSV is written into the same folder that holds the input JSON files.
directory_path = r'C:\Users\16476\Downloads\ML-supervised_real_estate_data\data'
output_csv_path = r'C:\Users\16476\Downloads\ML-supervised_real_estate_data\data\processed_data.csv'
process_json_files(directory_path=directory_path, output_csv_path=output_csv_path)

Files in directory:
['AK_Juneau_0.json', 'AK_Juneau_1.json', 'AK_Juneau_2.json', 'AK_Juneau_3.json', 'AK_Juneau_4.json', 'AL_Montgomery_0.json', 'AL_Montgomery_1.json', 'AL_Montgomery_2.json', 'AL_Montgomery_3.json', 'AL_Montgomery_4.json', 'AR_LittleRock_0.json', 'AR_LittleRock_1.json', 'AR_LittleRock_2.json', 'AR_LittleRock_3.json', 'AR_LittleRock_4.json', 'AZ_Phoenix_0.json', 'AZ_Phoenix_1.json', 'AZ_Phoenix_2.json', 'AZ_Phoenix_3.json', 'AZ_Phoenix_4.json', 'CA_Sacramento_0.json', 'CA_Sacramento_1.json', 'CA_Sacramento_2.json', 'CA_Sacramento_3.json', 'CA_Sacramento_4.json', 'CO_Denver_0.json', 'CO_Denver_1.json', 'CO_Denver_2.json', 'CO_Denver_3.json', 'CO_Denver_4.json', 'CT_Hartford_0.json', 'CT_Hartford_1.json', 'CT_Hartford_2.json', 'CT_Hartford_3.json', 'CT_Hartford_4.json', 'DE_Dover_0.json', 'DE_Dover_1.json', 'DE_Dover_2.json', 'DE_Dover_3.json', 'DE_Dover_4.json', 'FL_Tallahassee_0.json', 'FL_Tallahassee_1.json', 'FL_Tallahassee_2.json', 'FL_Tallahassee_3.json', 'FL_Talla