In [1]:
'''
This script takes raw data files (all sharing identical field names) collected via Zillow and
originally grouped into individual directories by listing type (apartments, condos, etc.), and does
the following:

1. Combines all raw files in each directory into one CSV per directory. (In the case of houses,
   that is over 60 files.)
2. Adds a "Listing Type" column to each combined CSV, set to the name of the directory that each
   listing lives in.
3. Further aggregates the combined CSVs from each directory, now with a "Listing Type" column, into
   one large CSV.
'''

import os
import pandas as pd

# 1.

# Directories containing the "raw" CSV files, one directory per listing type
directories = [
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/apartments',
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/condos',
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/houses',
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/manufactured',
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/multi',
    'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/townhouses'
]

# Destination for the per-directory combined files. It is the same for every
# directory, so it is set once up front and created if it does not exist yet
# (to_csv does not create missing parent directories).
output_directory = 'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/1aggregated'
os.makedirs(output_directory, exist_ok=True)

# For each directory: read every CSV in it, concatenate the dataframes along rows,
# then save the combined raw data as "<directory name>.csv" in the output directory
for directory in directories:
    # os.listdir returns names in arbitrary order; sorting keeps the row order of
    # the combined file reproducible from run to run
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

    # pd.concat raises an opaque "No objects to concatenate" on an empty list,
    # so fail early with a message that names the offending directory
    if not csv_files:
        raise ValueError('No CSV files found in {}'.format(directory))

    dataframes = [pd.read_csv(os.path.join(directory, csv_file)) for csv_file in csv_files]

    combined_data = pd.concat(dataframes, ignore_index=True)

    output_file = os.path.join(output_directory, os.path.basename(directory) + '.csv')
    combined_data.to_csv(output_file, index=False)

# 2.

# Directory holding the combined CSV files (one per listing type), and the files to process
directory = 'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/1aggregated'

file_names = [
    'apartments.csv',
    'condos.csv',
    'houses.csv',
    'manufactured.csv',
    'multi.csv',
    'townhouses.csv'
]

# For each file: derive the listing type from the file name (the name without ".csv"),
# load the CSV, attach the listing type as a "Listing Type" column, and write the
# result back over the same file
for file_name in file_names:
    listing_type = file_name.replace('.csv', '')
    file_path = os.path.join(directory, file_name)

    df = pd.read_csv(file_path).assign(**{'Listing Type': listing_type})
    df.to_csv(file_path, index=False)

# 3.

# Directory holding the per-listing-type CSVs (expected to carry the "Listing Type"
# column by this point)
directory = 'C:/Users/ryanm/Desktop/Projects/3/Datasets/raw/1aggregated'

# os.listdir returns names in arbitrary order; sorting keeps the row order of the
# final file reproducible from run to run
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory
if not csv_files:
    raise ValueError('No CSV files found in {}'.format(directory))

# Read each CSV into a dataframe, concatenate them all along rows, then save
dataframes = [pd.read_csv(os.path.join(directory, csv_file)) for csv_file in csv_files]

combined_data = pd.concat(dataframes, ignore_index=True)

combined_data.to_csv('C:/Users/ryanm/Desktop/Projects/3/Datasets/full_housing_data.csv', index=False)

print("Success")

Success
