# code for DP05

### compare row 1 across all files

In [2]:
import os
import pandas as pd

# Define directory
source_dir = r"H:\GY350\File Downloads"

# List to store file groups
file_groups = []
headers_checked = {}

# Iterate over DP05 files
for filename in os.listdir(source_dir):
    if filename.startswith("DP05") and filename.endswith(".csv"):
        file_path = os.path.join(source_dir, filename)
        
        try:
            # Read only the first row (headers)
            df = pd.read_csv(file_path, nrows=1, header=None)
            header_signature = tuple(df.iloc[0].dropna())  # Convert header row to tuple for uniqueness
            
            # Check if the header already exists
            if header_signature in headers_checked:
                headers_checked[header_signature].append(filename)
            else:
                headers_checked[header_signature] = [filename]
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# Convert to list format
file_groups = list(headers_checked.values())

# Output results
for i, group in enumerate(file_groups, 1):
    print(f"Group {i}:")
    print(group)
    print("-")

Group 1:
['DP05_06_2010.csv', 'DP05_32_2010.csv']
-
Group 2:
['DP05_06_2011.csv', 'DP05_06_2012.csv', 'DP05_06_2013.csv', 'DP05_06_2014.csv', 'DP05_32_2011.csv', 'DP05_32_2012.csv', 'DP05_32_2013.csv', 'DP05_32_2014.csv']
-
Group 3:
['DP05_06_2015.csv', 'DP05_06_2016.csv', 'DP05_32_2015.csv', 'DP05_32_2016.csv']
-
Group 4:
['DP05_06_2017.csv', 'DP05_06_2019.csv', 'DP05_32_2017.csv', 'DP05_32_2019.csv']
-
Group 5:
['DP05_06_2018.csv', 'DP05_32_2018.csv']
-


### after manually adding headers to first item of each group, automate replicating for rest of groups

In [3]:
import os
import pandas as pd

# Define directory
source_dir = r"H:\GY350\File Downloads"

# Groups from previous run
file_groups = [
    ['DP05_06_2010.csv', 'DP05_32_2010.csv'],
    ['DP05_06_2011.csv', 'DP05_06_2012.csv', 'DP05_06_2013.csv', 'DP05_06_2014.csv', 'DP05_32_2011.csv', 'DP05_32_2012.csv', 'DP05_32_2013.csv', 'DP05_32_2014.csv'],
    ['DP05_06_2015.csv', 'DP05_06_2016.csv', 'DP05_32_2015.csv', 'DP05_32_2016.csv'],
    ['DP05_06_2017.csv', 'DP05_06_2019.csv', 'DP05_32_2017.csv', 'DP05_32_2019.csv'],
    ['DP05_06_2018.csv', 'DP05_32_2018.csv']
]

# Iterate over each group and apply the row copying
for group in file_groups:
    reference_file = os.path.join(source_dir, group[0])  # First file in the group
    
    # Read the reference file
    df_ref = pd.read_csv(reference_file, header=None)
    new_row = df_ref.iloc[1].copy()  # Copy manually added second row
    
    # Apply to rest of the group
    for filename in group[1:]:
        file_path = os.path.join(source_dir, filename)
        df = pd.read_csv(file_path, header=None)
        
        # Insert the copied row as a new second row
        df.loc[len(df)] = None  # Create an empty row at the end to shift everything
        df = df.sort_index().reset_index(drop=True)  # Reset index to shift rows correctly
        df.loc[1] = new_row  # Assign new row to second row
        
        # Save the updated file
        df.to_csv(file_path, index=False, header=False)
        print(f"Updated: {filename} with new row from {group[0]}")


  df_ref = pd.read_csv(reference_file, header=None)


Updated: DP05_32_2010.csv with new row from DP05_06_2010.csv


  df_ref = pd.read_csv(reference_file, header=None)
  df = pd.read_csv(file_path, header=None)


Updated: DP05_06_2012.csv with new row from DP05_06_2011.csv


  df = pd.read_csv(file_path, header=None)


Updated: DP05_06_2013.csv with new row from DP05_06_2011.csv


  df = pd.read_csv(file_path, header=None)


Updated: DP05_06_2014.csv with new row from DP05_06_2011.csv
Updated: DP05_32_2011.csv with new row from DP05_06_2011.csv
Updated: DP05_32_2012.csv with new row from DP05_06_2011.csv
Updated: DP05_32_2013.csv with new row from DP05_06_2011.csv
Updated: DP05_32_2014.csv with new row from DP05_06_2011.csv


  df_ref = pd.read_csv(reference_file, header=None)
  df = pd.read_csv(file_path, header=None)


Updated: DP05_06_2016.csv with new row from DP05_06_2015.csv
Updated: DP05_32_2015.csv with new row from DP05_06_2015.csv
Updated: DP05_32_2016.csv with new row from DP05_06_2015.csv


  df_ref = pd.read_csv(reference_file, header=None)
  df = pd.read_csv(file_path, header=None)


Updated: DP05_06_2019.csv with new row from DP05_06_2017.csv
Updated: DP05_32_2017.csv with new row from DP05_06_2017.csv
Updated: DP05_32_2019.csv with new row from DP05_06_2017.csv


  df_ref = pd.read_csv(reference_file, header=None)


Updated: DP05_32_2018.csv with new row from DP05_06_2018.csv


### remove unecessary columns

In [4]:
import os
import pandas as pd

# Define directories
source_dir = r"H:\GY350\File Downloads"
destination_dir = r"H:\GY350\CSVs Cleaned"
os.makedirs(destination_dir, exist_ok=True)  # Ensure the directory exists

# Iterate over DP05 files
for filename in os.listdir(source_dir):
    if filename.startswith("DP05") and filename.endswith(".csv"):
        source_file = os.path.join(source_dir, filename)
        destination_file = os.path.join(destination_dir, filename)
        
        if not os.path.exists(source_file):
            print(f"File not found: {source_file}")
            continue
        
        # Load and process the CSV
        cleaned_df = pd.read_csv(source_file, header=None)
        
        # Identify columns to keep (columns where row 2 has a value)
        columns_to_keep = cleaned_df.iloc[1].notna()
        
        # Drop columns that do not meet the condition
        cleaned_df = cleaned_df.loc[:, columns_to_keep]
        
        # Drop the first row (original row 1)
        cleaned_df = cleaned_df.iloc[1:].reset_index(drop=True)
        
        # Save cleaned CSV
        cleaned_df.to_csv(destination_file, index=False, header=False)
        
        print(f"Cleaned CSV saved at: {destination_file}")

  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2010.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2011.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2012.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2013.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2014.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2015.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2016.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2017.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2018.csv


  cleaned_df = pd.read_csv(source_file, header=None)


Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_06_2019.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2010.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2011.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2012.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2013.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2014.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2015.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2016.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2017.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2018.csv
Cleaned CSV saved at: H:\GY350\CSVs Cleaned\DP05_32_2019.csv


### check all columns present

In [5]:
import os
import pandas as pd

# Define directory
cleaned_dir = r"H:\GY350\CSVs Cleaned"

# Store column sets
column_sets = {}
num_columns = {}

# Iterate over DP05 files
for filename in os.listdir(cleaned_dir):
    if filename.startswith("DP05") and filename.endswith(".csv"):
        file_path = os.path.join(cleaned_dir, filename)
        
        try:
            # Read CSV
            df = pd.read_csv(file_path, header=None)
            columns_set = set(df.columns)
            num_cols = len(df.columns)
            
            # Store column details
            column_sets[filename] = columns_set
            num_columns[filename] = num_cols
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# Check for column consistency
reference_file = next(iter(column_sets))  # Pick first file as reference
reference_columns = column_sets[reference_file]
reference_num_cols = num_columns[reference_file]

consistent = True
for filename, col_set in column_sets.items():
    if col_set != reference_columns:
        print(f"Column mismatch in {filename}")
        consistent = False
    if num_columns[filename] != reference_num_cols:
        print(f"Column count mismatch in {filename}: {num_columns[filename]} instead of {reference_num_cols}")
        consistent = False

if consistent:
    print("All DP05 files have the same number of columns and column names (regardless of order).")
else:
    print("Some DP05 files have inconsistencies in column count or names.")

All DP05 files have the same number of columns and column names (regardless of order).


### check each year has same number of tracts

In [6]:
import os
import pandas as pd

# Define directory
cleaned_dir = r"H:\GY350\CSVs Cleaned"

# Store row counts separately for DP05_06 and DP05_32
row_counts_06 = {}
row_counts_32 = {}

# Iterate over DP05 files
for filename in os.listdir(cleaned_dir):
    if filename.startswith("DP05_06") and filename.endswith(".csv"):
        file_path = os.path.join(cleaned_dir, filename)
        try:
            df = pd.read_csv(file_path, header=None)
            row_counts_06[filename] = len(df)
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    elif filename.startswith("DP05_32") and filename.endswith(".csv"):
        file_path = os.path.join(cleaned_dir, filename)
        try:
            df = pd.read_csv(file_path, header=None)
            row_counts_32[filename] = len(df)
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# Check for consistency in DP05_06
if len(set(row_counts_06.values())) == 1:
    print("All DP05_06 files have the same number of rows.")
else:
    print("Inconsistencies in row counts for DP05_06 files:")
    for file, count in row_counts_06.items():
        print(f"{file}: {count} rows")

# Check for consistency in DP05_32
if len(set(row_counts_32.values())) == 1:
    print("All DP05_32 files have the same number of rows.")
else:
    print("Inconsistencies in row counts for DP05_32 files:")
    for file, count in row_counts_32.items():
        print(f"{file}: {count} rows")

All DP05_06 files have the same number of rows.
All DP05_32 files have the same number of rows.
