In [123]:
import pandas as pd

# Load the data from CSV file
df = pd.read_csv('source files/ACSST1Y2021.S2503-2023-04-21T042648.csv')

# Filter the columns containing "Renter-occupied housing units!!Estimate"
renter_columns = [col for col in df.columns if "Renter-occupied housing units!!Estimate" in col]
renter_df = df[['Label (Grouping)'] + renter_columns]

# Remove extra spaces from the 'Label (Grouping)' column and rename it to 'households'
renter_df['Label (Grouping)'] = renter_df['Label (Grouping)'].str.strip()
renter_df = renter_df.rename(columns={'Label (Grouping)': 'households'})

# Filter rows based on the specified text chunks
desired_labels = [
    'Less than $5,000',
    '$5,000 to $9,999',
    '$10,000 to $14,999',
    '$15,000 to $19,999',
    'Occupied housing units',
    '$20,000 to $24,999',
    '$25,000 to $34,999',
    '$50,000 to $74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 or more'

]

filtered_df = renter_df[renter_df['households'].isin(desired_labels)]

# Convert string values with commas to integers
for col in filtered_df.columns[1:]:
    filtered_df[col] = filtered_df[col].apply(lambda x: int(x.replace(',', '')) if isinstance(x, str) else x)

# Create a DataFrame without the 'Occupied housing units' row for the total calculation
calculation_df = filtered_df[filtered_df['households'] != 'Occupied housing units']

# Calculate the total for each column (except the first one) and create a new row with the "total" label
total_row = pd.DataFrame({'households': 'Total'}, index=[0])
for col in calculation_df.columns[1:]:
    total_row[col] = calculation_df[col].sum()

# Append the total row to the filtered DataFrame
filtered_df = pd.concat([filtered_df, total_row], ignore_index=True)

# Find the index of the "Total" and "Occupied housing units" rows
total_index = filtered_df.index[filtered_df['households'] == 'Total'][0]
occupied_index = filtered_df.index[filtered_df['households'] == 'Occupied housing units'][0]

# Calculate the percentage for each column (except the first one) and create a new row with the "percentage" label
percentage_row = pd.DataFrame({'households': 'Percentage under 24,999'}, index=[0])
for col in filtered_df.columns[1:]:
    percentage = (filtered_df.loc[total_index, col] / filtered_df.loc[occupied_index, col]) * 100
    percentage_row[col] = round(percentage)

# Append the percentage row to the filtered DataFrame
filtered_df = pd.concat([filtered_df, percentage_row], ignore_index=True)

# Strip out the ", New Jersey!!Renter-occupied housing units!!Estimate" from each column header except the first
filtered_df.columns = ['households'] + [col.replace(', New Jersey!!Renter-occupied housing units!!Estimate', '') for col in filtered_df.columns[1:]]
filtered_df.columns = ['households'] + [col.replace('!!Renter-occupied housing units!!Estimate', '') for col in filtered_df.columns[1:]]


# Format the numbers as strings with commas (except for the first column)
formatted_df = filtered_df.copy()
formatted_df.iloc[:, 1:] = formatted_df.iloc[:, 1:].applymap(lambda x: '{:,}'.format(x))


# Strip the word "County", remove spaces, and change the remaining text to uppercase
formatted_df.columns = ['households'] + [col.replace(' County', '').upper() for col in formatted_df.columns[1:]]


# Transpose the DataFrame
transposed_df = formatted_df.T


# Update the index with the desired text
transposed_df.index = transposed_df.index.to_series().replace({'households': 'COUNTY',
                                                               'Total': 'TotalBelow25000',
                                                               'Occupied housing units': 'Occupied rental housing units'})

# Find and replace the desired text in the transposed DataFrame
transposed_df = transposed_df.replace({'households': 'COUNTY',
                                       'Total': 'TotalBelow25000',
                                       'Occupied housing units': 'Occupied rental housing units'})


# Save the resulting DataFrame to a CSV file
transposed_df.to_csv('renter_household_income_filtered.csv', header=False)

# Display the result
display(pd.concat([transposed_df.head(50)]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  renter_df['Label (Grouping)'] = renter_df['Label (Grouping)'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = filtered_df[col].apply(lambda x: int(x.replace(',', '')) if isinstance(x, str) else x)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
COUNTY,Occupied rental housing units,"Less than $5,000","$5,000 to $9,999","$10,000 to $14,999","$15,000 to $19,999","$20,000 to $24,999","$25,000 to $34,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 or more","$50,000 to $74,999",TotalBelow25000,"Percentage under 24,999"
NEW JERSEY,1244971,66661,46056,81661,66926,60494,117830,216411,146478,157918,135531,211793,1307759,105
ATLANTIC,37101,2842,1435,3289,3238,3156,5426,5300,3148,2836,2062,4990,37722,102
BERGEN,117928,6936,3683,4251,4934,4158,10165,23789,14736,18145,14638,23108,128543,109
BURLINGTON,39718,2275,1309,2076,1488,1817,4755,6645,6087,4992,3196,6383,41023,103
CAMDEN,69773,3730,4021,4836,4905,3924,7218,13266,6410,8302,3591,12808,73011,105
CAPE MAY,10697,443,417,1072,430,1029,1969,1712,1154,804,343,1331,10704,100
CUMBERLAND,18617,1370,832,2043,878,1389,3218,3790,1699,329,445,3754,19747,106
ESSEX,178779,12895,9138,13239,14079,9479,16942,34325,17562,16616,11790,34049,190114,106
GLOUCESTER,21823,1043,565,2299,1767,1129,2132,3879,2430,1992,2115,3879,23230,106
