In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
#1 Load the raw census CSV into a DataFrame and preview its first five rows
census_data = pd.read_csv(filepath_or_buffer="../00_input/census_data.csv")
census_data.head(5)

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610


## Invalid/Null Values in the dataset 
The Median Household Income, Median Age and Per Capita Income columns all contain the same extreme placeholder value, -666,666,666.0. Rows where Median Age holds this value also tend to have seemingly unreasonable values in the other columns (for example, a Population of 0 alongside Poverty Count and Unemployment Count of 0). As such, we have made the decision to drop the rows where Median Age is -666,666,666.0, due to concern that the other values in those rows are inaccurate. Rows where only Median Household Income holds the placeholder are kept, and the placeholder is converted to NaN further down.

In [3]:
#2 Find the smallest value in each column to expose placeholder / invalid values
(
    min_income,
    min_pop,
    min_age,
    min_per_capita,
    min_poverty,
    min_unemp,
) = (
    census_data[column_name].min()
    for column_name in (
        'Median Household Income',
        'Population',
        'Median Age',
        'Per Capita Income',
        'Poverty Count',
        'Unemployment Count',
    )
)

# A single print call with a newline separator emits the six report lines in order
print(
    f"The min Median Household Income is {min_income}",
    f"The min Population is {min_pop}",
    f"The min Median Age is {min_age}",
    f"The min Per Capita Income is {min_per_capita}",
    f"The min Poverty Count is {min_poverty}",
    f"The min Unemployment Count is {min_unemp}",
    sep="\n",
)

The min Median Household Income is -666666666.0
The min Population is 0.0
The min Median Age is -666666666.0
The min Per Capita Income is -666666666.0
The min Poverty Count is 0.0
The min Unemployment Count is 0.0


In [19]:
# Inspect a sample of the rows whose Median Age holds the -666,666,666.0 placeholder
df_age = census_data.loc[census_data['Median Age'].eq(-666666666.0)]
df_age.head()

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
110,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,950
111,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,951
167,-666666666.0,9.0,-666666666.0,,0.0,0.0,1066
206,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,1199
514,-666666666.0,25.0,-666666666.0,,13.0,0.0,2203


In [5]:
#3 Keep only the rows whose Median Age is not -666666666.0 (564 rows dropped --> 32,556 rows left)
clean_census = census_data.loc[census_data['Median Age'].ne(-666666666.0)]
clean_census

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610
...,...,...,...,...,...,...,...
33114,,759.0,45.7,,,,87523
33115,,363.0,44.2,,,,87515
33117,,2896.0,36.0,,,,87511
33118,,245.0,48.0,,,,87578


In [6]:
#4 Renumber the rows from 0 after the filter above; drop=True discards the old index
#  instead of keeping it as a new column
clean_census = clean_census.reset_index(drop=True)
clean_census

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610
...,...,...,...,...,...,...,...
32551,,759.0,45.7,,,,87523
32552,,363.0,44.2,,,,87515
32553,,2896.0,36.0,,,,87511
32554,,245.0,48.0,,,,87578


In [9]:
#5 Tally missing values per column, then list only the columns that have any (largest first)
null_counts = clean_census.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

Per Capita Income          100
Unemployment Count          31
Poverty Count               31
Median Household Income     31
dtype: int64

In [11]:
# Look at the last 31 rows to locate the 31 NaN values reported for the
# Unemployment Count, Poverty Count and Median Household Income columns
clean_census.iloc[-31:]

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
32525,,133.0,25.8,,,,87533
32526,,255.0,47.5,,,,87012
32527,,84.0,57.7,,,,87064
32528,,162.0,38.1,,,,87046
32529,,1341.0,37.8,,,,87548
32530,,775.0,48.6,,,,87575
32531,,1110.0,41.5,,,,87530
32532,,385.0,57.0,,,,87577
32533,,761.0,30.6,,,,87527
32534,,518.0,41.9,,,,87549


In [16]:
#6 Keep only rows that have a Median Household Income; the 31 rows removed are also NaN in
#  Per Capita Income, Poverty Count and Unemployment Count
# We have no use for this data
clean_census = clean_census.loc[clean_census['Median Household Income'].notna()]
clean_census

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610
...,...,...,...,...,...,...,...
32520,34028.0,330.0,39.5,18213.0,129.0,11.0,99922
32521,57375.0,927.0,43.6,25840.0,172.0,89.0,99925
32522,53409.0,1635.0,34.5,22453.0,235.0,138.0,99926
32523,-666666666.0,38.0,55.5,13658.0,28.0,0.0,99927


In [17]:
# Check that the right number of rows have been dropped:
# 564 (placeholder Median Age) + 31 (NaN Median Household Income) = 595
print(len(census_data))
print(len(clean_census))

# Fail loudly if the cleaning steps removed a different number of rows than expected
rows_dropped = len(census_data) - len(clean_census)
assert rows_dropped == 595, f"expected 595 dropped rows, got {rows_dropped}"

33120
32525


In [18]:
# Re-tally missing values per column now that the NaN income rows are gone,
# showing only the columns that still have any (largest first)
null_counts = clean_census.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

Per Capita Income    69
dtype: int64

In [25]:
# Pull out the remaining rows whose Median Household Income is the -666666666.0 placeholder
df_HH_income = clean_census.loc[clean_census['Median Household Income'].eq(-666666666.0)]
df_HH_income

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
42,-666666666.0,69.0,73.6,6880.0,24.0,0.0,694
86,-666666666.0,348.0,64.0,14278.0,46.0,0.0,786
107,-666666666.0,79.0,32.8,16956.0,0.0,26.0,934
108,-666666666.0,812.0,77.3,7132.0,0.0,0.0,936
115,-666666666.0,2221.0,32.8,3069.0,23.0,6.0,960
...,...,...,...,...,...,...,...
32469,-666666666.0,14.0,60.0,25536.0,3.0,0.0,99757
32479,-666666666.0,60.0,17.5,18652.0,14.0,3.0,99767
32486,-666666666.0,11.0,25.8,6964.0,9.0,0.0,99774
32500,-666666666.0,10.0,50.5,68700.0,0.0,2.0,99790


In [35]:
#7 Turn the remaining -666666666.0 placeholders into NaN.
#  The replacement is applied to the whole frame, not just Median Household Income.
clean_census = clean_census.replace(to_replace=-666666666.0, value=np.nan)

In [36]:
# Re-tally missing values per column after the -666666666.0 placeholders became NaN,
# showing only the columns that have any (largest first)
null_counts = clean_census.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

Median Household Income    1634
Per Capita Income            69
dtype: int64

In [39]:
# Format Standardization: store the three head-count columns as integers instead of floats
for count_column in ('Population', 'Poverty Count', 'Unemployment Count'):
    clean_census[count_column] = clean_census[count_column].astype(int)

In [40]:
# Summarize the cleaned frame: number of entries, per-column non-null counts and dtypes
clean_census.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32525 entries, 0 to 32524
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Median Household Income  30891 non-null  float64
 1   Population               32525 non-null  int32  
 2   Median Age               32525 non-null  float64
 3   Per Capita Income        32456 non-null  float64
 4   Poverty Count            32525 non-null  int32  
 5   Unemployment Count       32525 non-null  int32  
 6   ZCTA                     32525 non-null  int64  
dtypes: float64(3), int32(3), int64(1)
memory usage: 1.6 MB


In [43]:
# 595 rows have been dropped in total --> 32,525 rows remain
clean_census.shape[0]

32525

In [None]:
#EXTRA CODE TO CHANGE BLANK VALUES: empty or whitespace-only strings become NaN
# NOTE(review): this cell has no execution count and rebinds the raw census_data frame --
# confirm whether it is still needed
census_data = census_data.replace(to_replace=r'^\s*$', value=np.nan, regex=True)