In [None]:
import pandas as pd

## Importing Census Data Sets

In [None]:
#Load the raw county-level census CSV for each year, oldest first
census_files = ('CountyCensus2018.csv',
                'CountyCensus2019.csv',
                'CountyCensus2020.csv',
                'CountyCensus2021.csv')
c2018, c2019, c2020, c2021 = [pd.read_csv(path) for path in census_files]

In [None]:
#List of Tennessee Counties (95 entries).
#The census functions below label the i-th five-column slice of a raw frame
#with county[i], so the order here has to match the order of the county column
#groups in the CSV files.
#NOTE(review): presumably the CSVs list counties alphabetically like this list - confirm.
county = ['Anderson County', 'Bedford County', 'Benton County',
          'Bledsoe County', 'Blount County', 'Bradley County',
          'Campbell County', 'Cannon County', 'Carroll County',
          'Carter County', 'Cheatham County', 'Chester County',
          'Claiborne County', 'Clay County', 'Cocke County',
          'Coffee County', 'Crockett County', 'Cumberland County',
          'Davidson County', 'Decatur County', 'DeKalb County',
          'Dickson County', 'Dyer County', 'Fayette County',
          'Fentress County', 'Franklin County', 'Gibson County',
          'Giles County', 'Grainger County', 'Greene County',
          'Grundy County', 'Hamblen County', 'Hamilton County',
          'Hancock County', 'Hardeman County', 'Hardin County',
          'Hawkins County', 'Haywood County', 'Henderson County',
          'Henry County', 'Hickman County', 'Houston County',
          'Humphreys County', 'Jackson County', 'Jefferson County',
          'Johnson County', 'Knox County', 'Lake County',
          'Lauderdale County', 'Lawrence County', 'Lewis County',
          'Lincoln County', 'Loudon County', 'Macon County',
          'Madison County', 'Marion County', 'Marshall County',
          'Maury County', 'McMinn County', 'McNairy County',
          'Meigs County', 'Monroe County', 'Montgomery County',
          'Moore County', 'Morgan County', 'Obion County',
          'Overton County', 'Perry County', 'Pickett County',
          'Polk County', 'Putnam County', 'Rhea County',
          'Roane County', 'Robertson County', 'Rutherford County',
          'Scott County', 'Sequatchie County', 'Sevier County',
          'Shelby County', 'Smith County', 'Stewart County',
          'Sullivan County', 'Sumner County', 'Tipton County',
          'Trousdale County', 'Unicoi County', 'Union County',
          'Van Buren County', 'Warren County', 'Washington County',
          'Wayne County', 'Weakley County', 'White County',
          'Williamson County', 'Wilson County']


In [None]:
#List of updated column names (72 labels).
#Each census function writes this list into the 'Label (Grouping)' column of a
#raw frame, so it needs exactly one entry per row of that frame, in row order.
#The functions then pick row ranges by position (0:1, 0:13, 39:41, 49:53, 66:),
#so inserting, removing or reordering entries shifts every one of those ranges.
#Less-indented entries are section headings; more-indented entries are the
#categories inside that section.
col = ['population',
            'age', 
                    '<6', '6-18', '19-25', '26-34', '35-44', '45-54', '55-64', '65-74', '>75', '<19','19-64', '>65',
            'sex',
                    'male', 'female',
            'race', 
                    'white','african american','american indian and alaska native','asian','native hawaiian and pacific islander','other', 'multiple','hispanic or latino','not hispanic or latino',
            'living arrangements', 
                    #NOTE(review): 'family housholds' looks like a misspelling of 'households'; left unchanged because the string is runtime data
                    'family housholds','married families','other families','male no spouse','female no spouse','non-family',
            'citizenship status', 
                    'native','foreign', 'naturalized','not citizen',
            'disability status',
                    'disability', 'no disability',
            'education level',
                'population(26 over)',
                    'less than high school',
                    'high school',
                    'some college',
                    'bachelors degree or higher', 
            'employment status',
                'population(19-64)',
                    'in labor force', 'employed','unemployed', 'not in labor force',
           'work experience',
                #NOTE(review): '19-65' here versus '19-64' under employment status - possibly a typo, confirm against the source table
                'population(19-65)',
                    'full time','not full time','no work',
           'household income',
                'total population', 
                    'Under 25000','25000-49999', '50000-74999','75000-99999', 'over 100000',
           'income poverty ratio',
                'total population poverty status',
       '<138',
       '138-399',
       '>400',
       '<100']

In [None]:
#Column names applied, in order, to every five-column county slice
new_names = [
    'population',
    'insured',
    'percent_insured',
    'uninsured',
    'percent_uninsured',
]

## Cleaning Functions

In [None]:
#Creates year column
def add_year_column(df, year):
    """Add (or overwrite) a 'year' column on ``df`` holding ``year`` in every row.

    The frame is modified in place and the same object is returned, so the
    call works both as a bare statement and on the right-hand side of an
    assignment.
    """
    df['year'] = year
    return df

In [None]:
#Cleaning the Columns
def type_cleaning(df):
    """Convert the five census value columns of ``df`` to numeric types, in place.

    'population', 'insured' and 'uninsured' have their thousands separators
    (',') removed and are parsed with ``pd.to_numeric``. 'percent_insured' and
    'percent_uninsured' have their '%' sign removed and are cast to float.
    Missing values stay missing (NaN).

    Columns that are already numeric are left without any text stripping, so
    the function can safely be called again on a frame it has already cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns named above.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If one of the five columns is missing.
    ValueError
        If a value is still not a number after the formatting is stripped.
    """
    def _strip(series, symbol):
        #The .str accessor only exists for text columns; a column that is
        #already numeric has nothing left to strip.
        if pd.api.types.is_numeric_dtype(series):
            return series
        #regex=False: ',' and '%' are literal characters, not patterns
        return series.str.replace(symbol, '', regex=False)

    #Count columns: drop thousands separators, then parse as numbers
    for name in ('population', 'insured', 'uninsured'):
        df[name] = pd.to_numeric(_strip(df[name], ','))

    #Percentage columns: drop the percent sign, then cast to float
    for name in ('percent_insured', 'percent_uninsured'):
        df[name] = _strip(df[name], '%').astype(float)

In [None]:
#Slices the dataframes in groups of 5 by county
def slice_dataframe(df, group_size=5):
    """Split ``df`` into consecutive groups of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows and index are kept as they are.
    group_size : int, default 5
        Number of columns per group. The last group is shorter when the
        column count is not a multiple of ``group_size``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per group, in column order. Each frame is an independent
        copy, so callers may add columns or drop rows on it without touching
        ``df``. An input with no columns gives an empty list.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")
    num_columns = df.shape[1]
    #iloc clamps the end of the slice, so the final (possibly short) group
    #needs no special handling.
    return [df.iloc[:, start:start + group_size].copy()
            for start in range(0, num_columns, group_size)]

In [None]:
#Fill nan values in uninsured column
def fill_uninsured(df):
    """Fill missing 'uninsured' counts and store the column as integers, in place.

    A missing count is estimated as ``percent_uninsured * population / 100``,
    rounded to the nearest whole number. Rows that already have a count are
    left unchanged, even when their percentage or population is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'uninsured', 'percent_uninsured' and 'population'
        columns.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If a count is still missing after filling (its percentage or
        population is missing too), because the column cannot then be stored
        as integers.
    """
    #Filling NaN values in the 'uninsured' column
    if df['uninsured'].isna().any():
        #Keep the estimate as float: it may contain NaN for rows that do not
        #need filling, and casting those to int here would raise.
        estimate = (df['percent_uninsured'].astype(float)
                    * df['population'].astype(float) / 100).round(decimals=0)
        df['uninsured'] = df['uninsured'].fillna(estimate)

    #Converting 'uninsured' column to int
    df['uninsured'] = df['uninsured'].astype(int)

In [None]:
#rename columns
def rename_columns(dataframes, new_column_names):
    """Give every frame in ``dataframes`` the same column names.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames to relabel; each one is modified in place.
    new_column_names : list-like
        New labels. Must have as many entries as each frame has columns,
        otherwise pandas raises ``ValueError``.

    Returns
    -------
    The ``dataframes`` object that was passed in (not a copy).
    """
    for frame in dataframes:
        frame.columns = new_column_names

    return dataframes

In [None]:
#merge dataframes
def merge_dataframes(dataframes):
    """Return the given frames placed side by side (concatenated along the columns)."""
    return pd.concat(dataframes, axis=1)

## Total Census Dataframe

In [None]:
def general_census(df, year):
    """Build the county-level totals table for one census year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw census frame for one year, with one row per entry of ``col`` and
        five columns per county. The frame is not modified.
        NOTE(review): assumes 'Label (Grouping)' is the first column - confirm.
    year
        Value written to the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        The 'population' row of every county slice, with the columns of
        ``new_names`` plus 'county' and 'year', converted to numeric types.
    """
    #Work on a copy so relabelling does not overwrite the caller's raw frame
    df = df.copy()
    #Change the labels
    df['Label (Grouping)'] = col
    #Select rows 0-1 ('population' and the 'age' heading) via a transpose round trip.
    #NOTE(review): the round trip is kept from the original code; it appears to
    #leave every column as object dtype, which type_cleaning's text handling
    #expects - confirm before replacing it with a direct row slice.
    df = df.transpose().loc[:, 0:1].transpose()
    #Set the labels to the index
    df = df.set_index(df.columns[0])
    #Slice the data frame between counties and rename each slice's columns
    new = rename_columns(slice_dataframe(df), new_names)
    frames = []
    for i, part in enumerate(new):
        #Create county column; assign() returns a new frame, so the slice
        #itself is never modified
        part = part.assign(county=county[i])
        #Discard the labels, then drop row 1 (the NaN value row)
        part = part.reset_index(drop=True).drop(1).reset_index(drop=True)
        frames.append(part)
    #Combine all of the individual dataframes
    census_general = pd.concat(frames)
    #Add a year column to the final dataframe
    add_year_column(census_general, year)
    #Cleaning the percentage columns and column data types
    type_cleaning(census_general)
    #Fill in the null values in the uninsured column
    fill_uninsured(census_general)
    #Return the final dataframe
    return census_general

## Age Census Dataframe

In [None]:
def age_census(df, year):
    """Build the county-by-age-group table for one census year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw census frame for one year, with one row per entry of ``col`` and
        five columns per county. The frame is not modified.
        NOTE(review): assumes 'Label (Grouping)' is the first column - confirm.
    year
        Value written to the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per county and age group, with an 'age' column, the columns
        of ``new_names``, 'county' and 'year'. The total-population row of
        each county is labelled 'all'.
    """
    #Work on a copy so relabelling does not overwrite the caller's raw frame
    df = df.copy()
    #Change the labels
    df['Label (Grouping)'] = col
    #Select rows 0-13 (population, the 'age' heading and the age groups) via a
    #transpose round trip.
    #NOTE(review): the round trip is kept from the original code; it appears to
    #leave every column as object dtype, which type_cleaning's text handling
    #expects - confirm before replacing it with a direct row slice.
    df = df.transpose().loc[:, 0:13].transpose()
    #Set the labels to the index
    df = df.set_index(df.columns[0])
    #Slice the data frame between counties and rename each slice's columns
    new = rename_columns(slice_dataframe(df), new_names)
    frames = []
    for i, part in enumerate(new):
        part = (
            part.assign(county=county[i])                     #county column
                .reset_index()                                #labels become a column
                .rename(columns={'Label (Grouping)': 'age'})  #... named 'age'
                .drop(1)                                      #drop the NaN value row
                .reset_index(drop=True)
        )
        #Change the population label
        part.at[0, 'age'] = 'all'
        frames.append(part)
    #Combine all of the individual dataframes
    census_age = pd.concat(frames)
    #Add a year column to the final dataframe
    add_year_column(census_age, year)
    #Cleaning the percentage columns and column data types
    type_cleaning(census_age)
    #Fill in the null values in the uninsured column
    fill_uninsured(census_age)
    #Return the final dataframe
    return census_age

## Employment Census Dataframe

In [None]:
def employment_census(df, year):
    """Build the county-by-employment-status table for one census year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw census frame for one year, with one row per entry of ``col`` and
        five columns per county. The frame is not modified.
        NOTE(review): assumes 'Label (Grouping)' is the first column - confirm.
    year
        Value written to the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per county and employment category, with an
        'employment_status (19-64)' column, the columns of ``new_names``,
        'county' and 'year'. The first row of each county (the 19-64
        population total) is labelled 'all'.
    """
    #Work on a copy so relabelling does not overwrite the caller's raw frame
    df = df.copy()
    #Change the labels
    df['Label (Grouping)'] = col
    #Select rows 49-53 (the 19-64 population and its employment categories)
    #via a transpose round trip.
    #NOTE(review): the round trip is kept from the original code; it appears to
    #leave every column as object dtype, which type_cleaning's text handling
    #expects - confirm before replacing it with a direct row slice.
    df = df.transpose().loc[:, 49:53].transpose()
    #Set the labels to the index
    df = df.set_index(df.columns[0])
    #Slice the data frame between counties and rename each slice's columns
    new = rename_columns(slice_dataframe(df), new_names)
    frames = []
    for i, part in enumerate(new):
        #Create county and category columns; every step returns a new frame,
        #so the slice itself is never modified
        part = (
            part.assign(county=county[i])
                .reset_index()
                .rename(columns={'Label (Grouping)': 'employment_status (19-64)'})
        )
        #Change the population label
        part.at[0, 'employment_status (19-64)'] = 'all'
        frames.append(part)
    #Combine all of the individual dataframes
    census_employment = pd.concat(frames)
    #Add a year column to the final dataframe
    add_year_column(census_employment, year)
    #Cleaning the percentage columns and column data types
    type_cleaning(census_employment)
    #Fill in the null values in the uninsured column
    fill_uninsured(census_employment)
    #Return the final dataframe
    return census_employment

## Disability Census Dataframe

In [None]:
def disability_census(df, year):
    """Build the county-by-disability-status table for one census year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw census frame for one year, with one row per entry of ``col`` and
        five columns per county. The frame is not modified.
        NOTE(review): assumes 'Label (Grouping)' is the first column - confirm.
    year
        Value written to the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per county and disability category, with a
        'disability_status' column, the columns of ``new_names``, 'county'
        and 'year'.
    """
    #Work on a copy so relabelling does not overwrite the caller's raw frame
    df = df.copy()
    #Change the labels
    df['Label (Grouping)'] = col
    #Select rows 39-41 (the 'disability status' heading and its two
    #categories) via a transpose round trip.
    #NOTE(review): the round trip is kept from the original code; it appears to
    #leave every column as object dtype, which type_cleaning's text handling
    #expects - confirm before replacing it with a direct row slice.
    df = df.transpose().loc[:, 39:41].transpose()
    #Set the labels to index
    df = df.set_index(df.columns[0])
    #Slice the dataframe between counties and rename each slice's columns
    new = rename_columns(slice_dataframe(df), new_names)
    frames = []
    for i, part in enumerate(new):
        part = (
            part.assign(county=county[i])                                   #county column
                .reset_index()                                              #labels become a column
                .rename(columns={'Label (Grouping)': 'disability_status'})  #... with this name
                .drop(0)                                                    #drop the NaN value row
                .reset_index(drop=True)
        )
        frames.append(part)
    #Combine all individual dataframes
    census_disability = pd.concat(frames)
    #Add a year column to the final dataframe
    census_disability = add_year_column(census_disability, year)
    #Cleaning the percentage columns and column data types
    type_cleaning(census_disability)
    #Fill uninsured null values
    fill_uninsured(census_disability)
    #Return the final dataframe
    return census_disability

## Poverty Level Census Dataframe

In [None]:
def poverty_census(df, year):
    """Build the county-by-poverty-level table for one census year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw census frame for one year, with one row per entry of ``col`` and
        five columns per county. The frame is not modified.
        NOTE(review): assumes 'Label (Grouping)' is the first column - confirm.
    year
        Value written to the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per county and income-to-poverty band, with a
        'poverty_level (%)' column, the columns of ``new_names``, 'county'
        and 'year'. The total row of each county is labelled 'all'.
    """
    #Work on a copy so relabelling does not overwrite the caller's raw frame
    df = df.copy()
    #Change the labels
    df['Label (Grouping)'] = col
    #Select row 66 onward (the 'income poverty ratio' heading, the total and
    #the poverty bands) via a transpose round trip.
    #NOTE(review): the round trip is kept from the original code; it appears to
    #leave every column as object dtype, which type_cleaning's text handling
    #expects - confirm before replacing it with a direct row slice.
    df = df.transpose().loc[:, 66:].transpose()
    #Set the labels to index
    df = df.set_index(df.columns[0])
    #Slice the dataframe between counties and rename each slice's columns
    new = rename_columns(slice_dataframe(df), new_names)
    frames = []
    for i, part in enumerate(new):
        part = (
            part.assign(county=county[i])                                   #county column
                .reset_index()                                              #labels become a column
                .rename(columns={'Label (Grouping)': 'poverty_level (%)'})  #... with this name
                .drop(0)                                                    #drop the NaN value row
                .reset_index(drop=True)
        )
        #Change the population label
        part.at[0, 'poverty_level (%)'] = 'all'
        frames.append(part)
    #Combine all individual dataframes
    census_poverty = pd.concat(frames)
    #Add a year column to the final dataframe
    census_poverty = add_year_column(census_poverty, year)
    #Cleaning the percentage columns and column data types
    type_cleaning(census_poverty)
    #Fill uninsured null values
    fill_uninsured(census_poverty)
    #Return the final dataframe
    return census_poverty
    

## Combine Cleaned Dataframes

In [None]:
#Use each function to create the new corresponding dataframes,
#one per census year, oldest first
raw_by_year = [(c2018, 2018), (c2019, 2019), (c2020, 2020), (c2021, 2021)]

g2018, g2019, g2020, g2021 = [general_census(raw, yr) for raw, yr in raw_by_year]

a2018, a2019, a2020, a2021 = [age_census(raw, yr) for raw, yr in raw_by_year]

e2018, e2019, e2020, e2021 = [employment_census(raw, yr) for raw, yr in raw_by_year]

d2018, d2019, d2020, d2021 = [disability_census(raw, yr) for raw, yr in raw_by_year]

p2018, p2019, p2020, p2021 = [poverty_census(raw, yr) for raw, yr in raw_by_year]

In [None]:
#Function to create list of dataframes
def create_dataframe_list(*dataframes):
    """Collect the positional arguments into a new list, keeping their order."""
    return [*dataframes]

In [None]:
#Function to concat list of dataframes
def df_combine(dataframes):
    """Stack the given frames on top of each other and renumber the rows from 0."""
    return pd.concat(dataframes, ignore_index=True)

In [None]:
#Group the yearly dataframes of each category into a list, oldest year first
gen_lst = [g2018, g2019, g2020, g2021]
age_lst = [a2018, a2019, a2020, a2021]
emp_lst = [e2018, e2019, e2020, e2021]
dis_lst = [d2018, d2019, d2020, d2021]
pov_lst = [p2018, p2019, p2020, p2021]

In [None]:
#Concats the list of dataframes
gen_census = df_combine(gen_lst)
age_census = df_combine(age_lst)
emp_census = df_combine(emp_lst)
dis_census = df_combine(dis_lst)
pov_census = df_combine(pov_lst)

In [None]:
#Saves the individual dataframes to new csv files
gen_census.to_csv('01general_census.csv')
age_census.to_csv('01age_census.csv')
emp_census.to_csv('01employment_census.csv')
dis_census.to_csv('01disability_census.csv')
pov_census.to_csv('01poverty_census.csv')