In [1]:
import pandas as pd
import re
from collections import defaultdict

In [2]:
# Relative path of the raw census CSV that the next cell loads with pd.read_csv.
census_file_path = "census_data.csv"

In [3]:
# Load the raw census table into memory using pandas' default CSV parsing.
census_df = pd.read_csv(filepath_or_buffer=census_file_path)

In [4]:
# Preview the first five raw rows to check the load.
census_df.head(n=5)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE042020,POPESTIMATE2020
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4920706,4921532
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,...,2645,2629,2610,2602,2587,2578,2565,2555,2555,2553
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,...,4453,4430,4399,4371,4335,4304,4285,4254,4224,4211
3,162,1,0,484,0,0,0,A,Addison town,Alabama,...,745,744,742,734,734,728,725,723,719,717
4,162,1,0,676,0,0,0,A,Akron town,Alabama,...,347,344,338,338,335,332,332,328,328,327


In [5]:
# Raw columns that are dropped before reshaping (passed to DataFrame.drop below).
not_used_columns = [
    "STATE",
    "COUNTY",
    "PLACE",
    "COUSUB",
    "CONCIT",
    "PRIMGEO_FLAG",
    "FUNCSTAT",
    "CENSUS2010POP",
    "ESTIMATESBASE2010",
    "POPESTIMATE042020",
]

In [6]:
# Build a new frame without the unused columns; census_df itself is left intact.
# `columns=` already targets the column axis, so no separate `axis` is needed.
cleaned_census_df = census_df.drop(columns=not_used_columns)

In [7]:
# Preview the first five rows after the column drop.
cleaned_census_df.head(n=5)

Unnamed: 0,SUMLEV,NAME,STNAME,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
0,40,Alabama,Alabama,4785514,4799642,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4921532
1,162,Abbeville city,Alabama,2699,2694,2645,2629,2610,2602,2587,2578,2565,2555,2553
2,162,Adamsville city,Alabama,4481,4474,4453,4430,4399,4371,4335,4304,4285,4254,4211
3,162,Addison town,Alabama,751,750,745,744,742,734,734,728,725,723,717
4,162,Akron town,Alabama,355,347,347,344,338,338,335,332,332,328,327


In [8]:
# Give the three identifier columns lower-case, descriptive names (in place).
cleaned_census_df.rename(
    columns={"NAME": "city", "STNAME": "state", "SUMLEV": "sum_level"},
    inplace=True,
)

In [9]:
columns = cleaned_census_df.columns

# Collect every POPESTIMATE<digits> column and map it to "<digits>-07-01",
# then apply all renames in a single in-place call.
date_column_names = {}
for column in columns:
    if "POPESTIMATE" not in column:
        continue
    # Strip everything that is not a digit, leaving only the year part.
    year_value = re.sub("[^0-9]", "", column)
    date_column_names[column] = f"{year_value}-07-01"

cleaned_census_df.rename(columns=date_column_names, inplace=True)
cleaned_census_df.head()

Unnamed: 0,sum_level,city,state,2010-07-01,2011-07-01,2012-07-01,2013-07-01,2014-07-01,2015-07-01,2016-07-01,2017-07-01,2018-07-01,2019-07-01,2020-07-01
0,40,Alabama,Alabama,4785514,4799642,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4921532
1,162,Abbeville city,Alabama,2699,2694,2645,2629,2610,2602,2587,2578,2565,2555,2553
2,162,Adamsville city,Alabama,4481,4474,4453,4430,4399,4371,4335,4304,4285,4254,4211
3,162,Addison town,Alabama,751,750,745,744,742,734,734,728,725,723,717
4,162,Akron town,Alabama,355,347,347,344,338,338,335,332,332,328,327


In [10]:
# Lookup from summary-level code (as a string) to its descriptive label;
# used further down to replace the codes in the `sum_level` column.
sum_levels = dict(
    [
        ("40", "State"),
        ("50", "County"),
        ("61", "Minor Civil Division"),
        ("71", "Minor Civil Division place part"),
        ("157", "County place part"),
        ("162", "Incorporated place"),
        ("170", "Consolidated city"),
        ("172", "Consolidated city -- place within consolidated city"),
    ]
)

In [13]:
# Convert each summary-level code to its string form so it can match the
# string keys of `sum_levels`.
cleaned_census_df["sum_level"] = cleaned_census_df["sum_level"].map(str)

In [20]:
# Swap the summary-level codes for their labels, restricted to the
# `sum_level` column by the nested-dict form of replace (done in place).
cleaned_census_df.replace(
    to_replace={"sum_level": sum_levels},
    inplace=True,
)

In [21]:
def condense_date_columns(df):
    """Reshape a wide frame (one column per date) into long format.

    Columns whose label starts with a letter are treated as identifier
    columns and are repeated on every output row. Every other column is
    treated as a date column and is unpivoted into a ``year`` /
    ``population`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding identifier columns plus one column per date.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Long frame with the identifier columns (original order), followed by
        ``year`` (the date column label as a string) and ``population`` (the
        cell value). Rows are ordered date column by date column, keeping the
        input row order within each date, and the index is a fresh 0..n-1
        range. If there are no date columns the result has the same columns
        and zero rows.
    """
    # Split the labels by their first character. str()/slicing keeps the check
    # safe for empty or non-string labels, which plain `label[0]` is not.
    non_date_columns = [col for col in df.columns if str(col)[:1].isalpha()]
    date_columns = [col for col in df.columns if not str(col)[:1].isalpha()]

    # Nothing to unpivot: return an empty frame that still has the long schema.
    if not date_columns:
        return pd.DataFrame(columns=[*non_date_columns, "year", "population"])

    # melt pairs identifier values with date values by row position, so the
    # result is correct even when df's index is not the default 0..n-1 range
    # (e.g. after filtering or sorting).
    long_df = df.melt(
        id_vars=non_date_columns,
        value_vars=date_columns,
        var_name="year",
        value_name="population",
    )

    # The date label is always reported as a string.
    long_df["year"] = long_df["year"].astype(str)
    return long_df

In [22]:
# Reshape the cleaned wide table into one row per (place, date) pair.
final_census_df = condense_date_columns(df=cleaned_census_df)

In [24]:
# Columns kept in the final table, in output order.
reordered_columns = [
    "city",
    "state",
    "sum_level",
    "year",
    "population",
]

In [25]:
# Keep only the listed columns, in that order, then preview the result.
final_census_df = final_census_df.loc[:, reordered_columns]
final_census_df.head(n=5)

Unnamed: 0,city,state,sum_level,year,population
0,Alabama,Alabama,State,2010-07-01,4785514
1,Abbeville city,Alabama,Incorporated place,2010-07-01,2699
2,Adamsville city,Alabama,Incorporated place,2010-07-01,4481
3,Addison town,Alabama,Incorporated place,2010-07-01,751
4,Akron town,Alabama,Incorporated place,2010-07-01,355


In [26]:
# Write the long-format table to disk without the index column.
final_census_df.to_csv(
    path_or_buf="resources/final_census_data.csv",
    index=False,
)