In [1]:
# Install a missing package from inside the notebook (uncomment one line):
# %pip install [package-name]
# %conda install [package-name]
# pandas ships with Anaconda, so no install is needed for it

from pathlib import Path

import pandas as pd

In [3]:
# Load the 'census' worksheet of the workbook and preview its first five rows
census_workbook = 'datasets/census.xlsx'
census = pd.read_excel(census_workbook, sheet_name='census')
census.head(5)

Unnamed: 0,state,year,population,land_area
0,Alabama,2010,4785437,52420
1,Alabama,2011,4799069,52420
2,Alabama,2012,4815588,52420
3,Alabama,2013,4830081,52420
4,Alabama,2014,4841799,52420


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = census.describe()
summary_stats

Unnamed: 0,year,population,land_area
count,306.0,306.0,306.0
mean,2012.5,6175292.0,74445.960784
std,1.710623,6945165.0,96135.285447
min,2010.0,564487.0,68.0
25%,2011.0,1636099.0,35380.0
50%,2012.5,4395502.0,56273.0
75%,2014.0,6828914.0,84897.0
max,2015.0,38918040.0,665384.0


In [5]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   state       306 non-null    object
 1   year        306 non-null    int64 
 2   population  306 non-null    int64 
 3   land_area   306 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 9.7+ KB


In [6]:
# Derive a new column: population divided by land area, computed row by row

census['density'] = census['population'].div(census['land_area'])
census.head(5)

Unnamed: 0,state,year,population,land_area,density
0,Alabama,2010,4785437,52420,91.29029
1,Alabama,2011,4799069,52420,91.550343
2,Alabama,2012,4815588,52420,91.865471
3,Alabama,2013,4830081,52420,92.14195
4,Alabama,2014,4841799,52420,92.36549


In [7]:
# The same workbook also holds a 'divisions' worksheet; load it and preview five rows
divisions_workbook = 'datasets/census.xlsx'
divisions = pd.read_excel(divisions_workbook, sheet_name='divisions')
divisions.head(5)

Unnamed: 0,state,postal_code,region,division
0,Connecticut,CT,Northeast,New England
1,Maine,ME,Northeast,New England
2,Massachusetts,MA,Northeast,New England
3,New Hampshire,NH,Northeast,New England
4,Rhode Island,RI,Northeast,New England


In [8]:
# Merge datasets: left outer join of census onto divisions, keyed on state.
#
# - `on='state'` makes the join key explicit instead of relying on whichever
#   column names the two frames happen to share.
# - `validate='many_to_one'` raises MergeError if a state appears more than once
#   in `divisions`; without it, duplicates would silently multiply census rows
#   and inflate every downstream population total.
# - Columns contributed by `divisions` are dropped first (a no-op on the first
#   run) so re-running this cell does not produce `_x`/`_y` suffixed duplicates.

division_columns = divisions.columns.difference(['state'])
census = (
    census
    .drop(columns=division_columns, errors='ignore')
    .merge(divisions, how='left', on='state', validate='many_to_one')
)
census.head()

Unnamed: 0,state,year,population,land_area,density,postal_code,region,division
0,Alabama,2010,4785437,52420,91.29029,AL,South,East South Central
1,Alabama,2011,4799069,52420,91.550343,AL,South,East South Central
2,Alabama,2012,4815588,52420,91.865471,AL,South,East South Central
3,Alabama,2013,4830081,52420,92.14195,AL,South,East South Central
4,Alabama,2014,4841799,52420,92.36549,AL,South,East South Central


In [10]:
# Goal: total population by region for 2015


# Step 1 -- keep only the rows whose year equals 2015
is_2015 = census['year'].eq(2015)
census_2015 = census.loc[is_2015]

# QA check -- the distinct values left in the year column (expected: just 2015)
census_2015['year'].unique()

array([2015], dtype=int64)

In [13]:
# Step 2 -- group the 2015 rows by region and sum the population within each group;
# the result is a one-column frame indexed by region
census_2015_agg = census_2015.groupby('region')[['population']].sum()
census_2015_agg

Unnamed: 0_level_0,population
region,Unnamed: 1_level_1
Midwest,67860583
Northeast,56034684
South,120997341
West,75742555


In [14]:
# Write the regional totals to the 'analysis' sheet of the report workbook.
# to_excel does not create missing folders, so make sure the output directory
# exists first; otherwise this cell fails on a fresh checkout.
report_path = Path('output/census-report.xlsx')
report_path.parent.mkdir(parents=True, exist_ok=True)
census_2015_agg.to_excel(report_path, sheet_name='analysis')