In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# 1 Read in census csv
census_data = pd.read_csv("../00_input/census_data.csv", dtype={'ZCTA': 'str'})
census_data.head()

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610


In [3]:
# Column Reordering
census_data = census_data[["ZCTA", "population", "median_age", "median_household_income", "per_capita_income",
                           "poverty_count", "unemployment_count"]]
census_data.head()

KeyError: "['per_capita_income', 'median_household_income', 'population', 'unemployment_count', 'poverty_count', 'median_age'] not in index"

In [None]:
# Overview of the census dataframe
census_data.info()

### Invalid/Null Values in the dataset 
The Median Household Income, Median Age and Per Capita Income columns contain similar extreme values of -666,666,666.0. Some of the rows that contain these values happen to have seemingly unreasonable values in the other columns: Population, Poverty Count and Unemployment Count. As such, we have made the decision to drop some of the rows that contain the values -666,666,666.0 due to concern that the other values in those rows are inaccurate.

In [None]:
# 2 Get min values for each column
min_income = census_data['Median Household Income'].min()
min_pop = census_data['Population'].min()
min_age = census_data['Median Age'].min()
min_per_capita = census_data['Per Capita Income'].min()
min_poverty = census_data['Poverty Count'].min()
min_unemp = census_data['Unemployment Count'].min()

print(f"The min Median Household Income is {min_income}")
print(f"The min Population is {min_pop}")
print(f"The min Median Age is {min_age}")
print(f"The min Per Capita Income is {min_per_capita}")
print(f"The min Poverty Count is {min_poverty}")
print(f"The min Unemployment Count is {min_unemp}")

In [None]:
# Check the rows for Median Age that contain -666,666,666.0
df_age = census_data[census_data['Median Age'] == min_age]
df_age

In [None]:
# Check unique values in Median Household Income when Median Age = -666,666,666.0
df_age["Median Household Income"].unique()

In [None]:
# Check unique values in Per Capita Income when Median Age = -666,666,666.0
df_age["Per Capita Income"].unique()

As can be seen above, when Median Age equals to -666,666,666.0, the values in Median Household Income and Per Capita Income are either -666,666,666.0 or null. Hence, we'll drop all of those rows.

In [None]:
# 3 Drop rows where Median Age = -666666666.0 (564 rows dropped --> 32,556 rows left)
clean_census = census_data[census_data['Median Age'] != min_age]
clean_census

In [None]:
# 4 Reset the index
clean_census = clean_census.reset_index(drop=True)
clean_census

In [None]:
# 5 Counting all null values per column
null_counts = clean_census.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
# Identify the last 31 rows to find the 31 NaN values found in Unemployment, Poverty and Median HH Income columns
clean_census.tail(31)

In [None]:
# 6 Drop the 31 rows as there are NaN values in: Median HH Income, Per Capita Income, Poverty and Unemployment Count
# We have no use for this data
clean_census = clean_census.dropna(subset=['Median Household Income'])
clean_census

In [None]:
# Check if the right number of rows have been dropped (should be 594)
print(len(census_data))
print(len(clean_census))

In [None]:
# Counting all null values per column after Null values are removed
null_counts = clean_census.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
# Checking the rows that have -666666666.0 as Median HH income 
df_HH_income = clean_census[clean_census['Median Household Income'] == min_income]
df_HH_income

The 1634 rows with the values in "Median Household Income" column of -666666666.0 seem to have the other columns with normal-looking values. Hence, we'll only change these -666666666.0 values into NaN to keep the data for further analysis.

In [None]:
#7 Change remaining -666666666.0 values in Median HH Income to NaN 
clean_census = clean_census.replace(min_income, np.nan)

In [None]:
# Counting all null values per column after -666666666.0 is changed to NaN 
null_counts = clean_census.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
# Format Standardization: Change Population, Poverty and Unemployment Count to int 
clean_census['Population'] = clean_census['Population'].astype(int)
clean_census['Poverty Count'] = clean_census['Poverty Count'].astype(int)
clean_census['Unemployment Count'] = clean_census['Unemployment Count'].astype(int)

In [None]:
# Reset the index
clean_census = clean_census.reset_index(drop=True)
clean_census

In [None]:
# Overview of the census dataframe after cleaning
clean_census.info()

In [None]:
# Use a .describe() to get summary stats of the clean dataset 
summary_stat = round(clean_census.describe(),2)
summary_stat

In [None]:
# Export the census data to csv to load to database
clean_census.to_csv('clean_census.csv', index=False)

## Plots 
We are going to look at different plots that map out relationships between different socio-eocnomic measures and outliers in the dataset 

In [None]:
# Store columns in variables 
med_age = clean_census['Median Age']
med_inc = clean_census['Median Household Income']
pop = clean_census['Population']
per_inc = clean_census['Per Capita Income']
pov_count = clean_census['Poverty Count']
unemp_count = clean_census['Unemployment Count']

## Scatter Plots 
The scatter plots below map the relationships between various variables in the census dataset


In [None]:
# Generate scatter plot - Population V Median Age
plt.scatter(med_age, pop, marker="x", facecolors="blue", alpha=0.75)
plt.xlabel("Median Age")
plt.ylabel("Population")
plt.title("Median Age V. Population", weight="bold")
plt.tight_layout()

In [None]:
# Generate scatter plot - Median Age V Median HH Income 
plt.scatter(med_age, per_inc, marker="x", facecolors="red", alpha=0.75)
plt.xlabel("Median Age")
plt.ylabel("Per Capita")
plt.title("Median Age V. Per Capita Income", weight="bold")
plt.tight_layout()

In [None]:
# Generate scatter plot - Median Age V Poverty Count 
plt.scatter(med_age, pov_count, marker="x", facecolors="green", alpha=0.75)
plt.xlabel("Median Age")
plt.ylabel("Poverty Count")
plt.title("Median Age V. Poverty Count", weight="bold")
plt.tight_layout()

In [None]:
# Generate scatter plot - Median Age V Unemployment Count
plt.scatter(med_age, unemp_count, marker="x", facecolors="grey", alpha=0.75)
plt.xlabel("Median Age")
plt.ylabel("Unemployment Count")
plt.title("Median Age V. Unemployment Count", weight="bold")
plt.tight_layout()

In [None]:
#scatter plot comparing 2 variables together/histograms to see distribution/box plots
#e.g.