In [4]:
# Import the Pandas package
import pandas as pd
import numpy as np
import re 
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

In [68]:
def clean_text(string,replace='_'):
    regex = '[^A-Za-z0-9,]+'
    str1 = re.sub(regex, replace, string)
    #regex = '[^A-Za-z_]+'
    #return re.sub(regex, '', str1)
    return str1

def load_files(year):

    file_str = "data/demographics/poverty_census_{}.csv"
    temp_df = pd.read_csv(file_str.format(year), skiprows = 1)
    temp_df = temp_df.rename(columns = lambda col: col.lower())
    return temp_df
    
def remove_digits(string):
    return re.sub('\d', '', string)


def replace_emdash(string):
    return string.replace('–','-',100)

def replace_all_special(string):
    return re.sub('[^A-Za-z0-9, ]+', '-', string)

def replace_msa(string):
    string= string.replace('Micro Area','')
    string= string.replace('Metro Area','')
    #string= string.replace('M.S.A','')
    return string.strip()

def split_name(string):
    split = string.split(',')
    city= split[0].strip()
    state = split[1].strip()
    
    city_split=city.split('-')
    state_split=state.split('-')
    return city_split[0]+', '+state_split[0]

def clean_string(col):
    return col.apply(remove_digits).apply(replace_emdash).apply(replace_msa).apply(replace_all_special).apply(split_name)#.apply(replace_all_special)


In [10]:
dfs = []
for  i in range(2010,2018):
    temp = load_files(i)
    cols = temp.columns[~temp.columns.str.contains('margin')]
    temp = temp[cols] 
    dfs.append(temp)
    print(temp.shape)

(525, 140)
(529, 140)
(530, 140)
(515, 140)
(515, 140)
(516, 185)
(518, 185)
(518, 185)


In [25]:
columns = [
    ['geographic area name'],
    ['total', 'estimate', 'sex', 'male'],
    ['total', 'estimate', 'sex', 'female'],
]

In [43]:
all_columns = []
count_ocurrences = {}
for ddf in dfs:
    for i in ddf.columns:
        clean_column_name_list = i.split('!!')
        clean_2 = []
        for j in range(len(clean_column_name_list)):
            clean_2.append(clean_column_name_list[j].strip().replace(' ','_'))
        col_name = '_'.join(clean_2)
        all_columns.append(col_name)
        count_ocurrences[col_name]=count_ocurrences.get(col_name,0)+1
        
unique_cols = list(set(all_columns))
print(len(unique_cols))
for i in unique_cols:
    print(i)
    

272
total_estimate_race_and_hispanic_or_latino_origin_asian_alone
percent_below_poverty_level_estimate_age_65_years_and_over
below_poverty_level_estimate_55_to_64_years
percent_below_poverty_level_estimate_all_individuals_with_income_below_the_following_poverty_ratios_300_percent_of_poverty_level
total_estimate_educational_attainment_population_25_years_and_over_high_school_graduate_(includes_equivalency)
below_poverty_level_estimate_employment_status_civilian_labor_force_16_years_and_over_unemployed_male
percent_below_poverty_level_estimate_age_under_18_years
total_estimate_educational_attainment_population_25_years_and_over_less_than_high_school_graduate
total_estimate_percent_imputed_poverty_status_for_individuals
below_poverty_level_estimate_educational_attainment_bachelor's_degree_or_higher
below_poverty_level_estimate_educational_attainment_high_school_graduate_(includes_equivalency)
percent_below_poverty_level_estimate_white_alone,_not_hispanic_or_latino
below_poverty_level_esti

In [48]:
def clean_cols(col):
    clean_column_name_list = col.split('!!')
    clean_2 = []
    for j in range(len(clean_column_name_list)):
        clean_2.append(clean_column_name_list[j].strip().replace(' ','_'))
    col_name = '_'.join(clean_2)
    return col_name
    

In [93]:
def convert_to_int(item):
    try:
        return int(item)
    except:
        return None

In [99]:
test = []

for key in count_ocurrences.keys():
    if(count_ocurrences[key]==8):
        #print(key)
        test.append(key)
        

        
# relevant cols
cols = [
  'geographic_area_name',
  'total_estimate_population_for_whom_poverty_status_is_determined',
  'total_estimate_age_under_18_years',
  'total_estimate_age_18_to_64_years',
  'total_estimate_age_65_years_and_over',
  'total_estimate_educational_attainment_population_25_years_and_over',
  'total_estimate_sex_male',
  'total_estimate_sex_female',
   'percent_below_poverty_level_estimate_population_for_whom_poverty_status_is_determined',
   'percent_below_poverty_level_estimate_age_under_18_years',
   'percent_below_poverty_level_estimate_age_18_to_64_years',
   'percent_below_poverty_level_estimate_age_65_years_and_over',
   'percent_below_poverty_level_estimate_worked_full-time,_year-round_in_the_past_12_months',
   "total_estimate_employment_status_civilian_labor_force_16_years_and_over",
   "total_estimate_work_experience_population_16_years_and_over",
   'total_estimate_mean_income_deficit_for_unrelated_individuals_(dollars)', 
]
        
        
clean_dfs = []
for i,ddf in enumerate(dfs):
    year = i+2010
    please = ddf.rename(columns = lambda col: clean_cols(col))
    temp  = please[test]
    temp =  temp[list(set(cols))]
    temp['Year'] = str(year)
    clean_dfs.append(temp)

    
clean_df = pd.concat(clean_dfs)
clean_df['geographic_area_name'] = clean_df['geographic_area_name'].apply(remove_digits).apply(replace_emdash).apply(replace_msa).apply(split_name)
clean_df


clean_df['total_estimate_mean_income_deficit_for_unrelated_individuals_(dollars)'] = clean_df['total_estimate_mean_income_deficit_for_unrelated_individuals_(dollars)'].apply(convert_to_int)
clean_df['city_name']= clean_df['geographic_area_name']
clean_df = clean_df.drop(columns='geographic_area_name')
for i in clean_df.columns:
    if(clean_df[i].dtype=='object'):
        print(i,clean_df[i].dtype)

Year object
city_name object


In [101]:
clean_df.to_csv('CleanData/Demographics.csv',index=False)