# US State Populations

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [2]:
!mkdir data

!curl https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls -o ./data/us_state_populations_2000_2010.xls
!curl https://www2.census.gov/programs-surveys/popest/tables/2010-2016/state/totals/nst-est2016-01.xlsx -o ./data/us_state_populations_2010_2016.xls

mkdir: data: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 35328  100 35328    0     0  35328      0  0:00:01 --:--:--  0:00:01 99796
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17944  100 17944    0     0  17944      0  0:00:01 --:--:--  0:00:01 77679


## State Populations 1970-1979

This cleans and tidies the dataset on the populations of all US states from 1970-1979 from the US Census Bureau.

In [3]:
# The 50 US states in alphabetical order (no District of Columbia). The sections below
# assign this list positionally as the 'State' column, so the order matters.
us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Source file holding the 1970-1979 population estimates that state_strip() filters.
_E7079_URL = 'https://www2.census.gov/programs-surveys/popest/tables/1900-1980/counties/totals/e7079co.txt'

# Download cache: state_strip() is called once per state, and every call needs the
# same file, so the body is fetched once and reused.
_e7079_cache = {}


def state_strip(state_name, text=None):
    """Return the lines of the 1970-1979 estimates file that mention ``state_name``.

    Lines containing "Co.", "Beach" or "Par." are skipped. Each kept line is
    stripped of surrounding whitespace and prefixed with a newline, so the result
    is either an empty string or starts with a newline character.

    Matching is a plain substring test, so "Virginia" also matches the
    "West Virginia" lines. This is kept as-is because the tidy-up code that
    consumes the output removes that duplicate row by position.

    Parameters
    ----------
    state_name : str
        Text to look for in each line.
    text : str, optional
        Contents of the estimates file. When omitted, the file is downloaded
        (once per kernel session) from the Census Bureau URL.

    Returns
    -------
    str
        The matching lines joined as described above; '' when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the download is needed and the server answers with an error status.
    """
    if text is None:
        if _E7079_URL not in _e7079_cache:
            r = requests.get(_E7079_URL)
            # Fail loudly instead of filtering the body of an error page.
            r.raise_for_status()
            _e7079_cache[_E7079_URL] = r.text
        text = _e7079_cache[_E7079_URL]

    excluded_markers = ("Co.", "Beach", "Par.")
    return ''.join(
        '\n' + line.strip()
        for line in text.split("\n")
        if state_name in line and not any(marker in line for marker in excluded_markers)
    )

# Applies state_strip to every name in a list and glues the results together.
def state_strip_loop(states):
    """Concatenate the state_strip() output for each entry of ``states``.

    Every per-state result is prefixed with a newline before being appended, in
    the order the names are given. An empty ``states`` yields an empty string.
    """
    pieces = ['\n' + state_strip(name) for name in states]
    return ''.join(pieces)

# Gather the filtered rows for all states into one newline-separated string.
data = state_strip_loop(us_states)

# Write the rows to disk so they can be parsed with pandas. The context manager closes
# the file even if the write fails, and the explicit encoding matches read_csv's default.
path_70_79 = './data/US State Populations (1970-1979).csv'
with open(path_70_79, 'w', encoding='utf-8') as file_70_79:
    file_70_79.write(data)

# Parse the file. There is no consistent separator in this dataset, so fields are split
# on runs of two or more whitespace characters (the python engine is needed for a regex sep).
df_70_79 = pd.read_csv(path_70_79, sep=r'\s\s+', header=None, engine='python')

# The dataset splits each state's entries into two separate rows (1970-74 and 1975-79).
# Take the even-positioned and odd-positioned rows as two frames; reset_index() renumbers
# them from 0 so they line up side by side (the old index becomes an extra column).
df_70_74 = df_70_79.iloc[::2].reset_index()
df_75_79 = df_70_79.iloc[1::2].reset_index()

# Join the two halves column-wise into one row per state.
df_70_79 = pd.concat([df_70_74, df_75_79], ignore_index=True, axis=1)

# Label every column; the 'Index n' / 'State n' names are placeholders for columns that
# are dropped below.
df_70_79.columns = ['Index 1', 'State 1', '1970', '1971', '1972', '1973', '1974', 'Index 2', 'State 2', '1975', '1976', '1977', '1978', '1979']

# Drop the duplicate West Virginia entry at position 46. It exists because the lookup for
# 'Virginia' is a substring match and therefore also picks up the West Virginia rows.
df_70_79 = df_70_79.drop(df_70_79.index[46])

# Drop the placeholder columns.
df_70_79 = df_70_79.drop(columns=['Index 1', 'State 1', 'Index 2', 'State 2'])

# Attach the state names positionally (raises if the row count is not len(us_states)).
df_70_79['State'] = us_states

# Reshape from one column per year to tidy (State, Year, Population) rows.
df_70_79 = pd.melt(frame=df_70_79, id_vars='State', var_name='Year', value_name='Population')

In [4]:
# Preview the last rows of the tidy 1970-1979 frame as a quick sanity check.
df_70_79.tail()

Unnamed: 0,State,Year,Population
495,Virginia,1979,5324000
496,Washington,1979,4013000
497,West Virginia,1979,1939000
498,Wisconsin,1979,4666000
499,Wyoming,1979,452000


## State Populations 1980-1989

This code cleans and tidies the dataset on the populations of all US states from 1980-1989 from the US Census Bureau.

In [5]:
# Download the 1980-1990 estimates file and keep its body as `text`.
url = 'https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/8090com.txt'
r = requests.get(url)
# Fail loudly on an HTTP error instead of parsing the body of an error page.
r.raise_for_status()
text = r.text

# Keep only the lines that contain "POPULATION" but not "CURRENT". Each kept line is
# stripped and prefixed with a newline, and the result is stored in `stripped_data`.
stripped_data = ''.join(
    '\n' + item.strip()
    for item in text.split("\n")
    if "POPULATION" in item and "CURRENT" not in item
)

# Write the rows to disk so they can be parsed with pandas. The context manager closes
# the file even if the write fails, and the explicit encoding matches read_csv's default.
path_80_90 = './data/US State Populations (1980-1990).csv'
with open(path_80_90, 'w', encoding='utf-8') as file_80_90:
    file_80_90.write(stripped_data)

# Read the file into a DataFrame, splitting fields on any run of whitespace.
df_80_90 = pd.read_csv(path_80_90, sep=r'\s\s*', header=None, engine='python')

# Label every column; 'Dummy State' and 'Population' are placeholders that are dropped below.
df_80_90.columns = ['Dummy State', 'Population', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990']

# Drop the placeholder columns and 1990 (the 1990-1999 section supplies that year), then
# drop the rows for the total population (position 0) and Washington, D.C. (position 9).
df_80_90 = df_80_90.drop(columns=['Dummy State', 'Population', '1990'])
df_80_90 = df_80_90.drop(df_80_90.index[[0, 9]])

# Renumber the rows from 0 without keeping the old index as a column.
df_80_90 = df_80_90.reset_index(drop=True)

# Attach the state names positionally as a regular 'State' column.
df_80_90['State'] = us_states

# Reshape from one column per year to tidy (State, Year, Population) rows.
df_80_90 = pd.melt(frame=df_80_90, id_vars='State', var_name='Year', value_name='Population')

In [6]:
# Preview the last rows of the tidy 1980-1989 frame as a quick sanity check.
df_80_90.tail()

Unnamed: 0,State,Year,Population
495,Virginia,1989,6120246
496,Washington,1989,4746316
497,West Virginia,1989,1806568
498,Wisconsin,1989,4856574
499,Wyoming,1989,458374


## State Populations 1990-1999

This code cleans and tidies the dataset on the populations of all US states from 1990-1999 from the US Census Bureau.

In [7]:
# Load the pre-converted CSV and strip every comma from its cells (regex replace, so
# commas inside values such as thousands separators are removed).
# NOTE(review): nothing in this notebook creates this file; it must already exist in ./data.
data_90_00_path = './data/US State Populations (1990-2000).csv'
df_90_00 = pd.read_csv(data_90_00_path, header=None).replace(',', '', regex=True)

# Positions of the rows to discard, leaving 50 rows (presumably one per state in the
# file's own order; confirm against the source file).
junk_rows_90_00 = [0, 1, 2, 3, 4, 5, 14, 46, 47, 48, 49, 50, 51, 63, 64, 65, 66, 67]
df_90_00 = df_90_00.drop(df_90_00.index[junk_rows_90_00]).reset_index(drop=True)

# Name the columns, then discard the two that are not part of the 1990-1999 range.
df_90_00.columns = ['State', '1990 Estimate'] + [str(year) for year in range(1990, 2001)]
df_90_00 = df_90_00.drop(columns=['1990 Estimate', '2000'])

# Reshape from one column per year to tidy (State, Year, Population) rows.
df_90_00 = df_90_00.melt(id_vars='State', var_name='Year', value_name='Population')

In [8]:
# Preview the last rows of the tidy 1990-1999 frame as a quick sanity check.
df_90_00.tail()

Unnamed: 0,State,Year,Population
495,Virginia,1999,7000174
496,Washington,1999,5842564
497,West Virginia,1999,1811799
498,Wisconsin,1999,5332666
499,Wyoming,1999,491780


## State Populations 2000-2009

This code cleans and tidies the dataset on the populations of all US states from 2000-2009 from the US Census Bureau.

In [9]:
# Load the 2000-2010 workbook without treating any row as a header.
df_00_10 = pd.read_excel('./data/us_state_populations_2000_2010.xls', header=None)

# Positions of the rows to discard, leaving the 50 rows that are labelled with
# us_states further down.
junk_rows_00_10 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
df_00_10 = df_00_10.drop(df_00_10.index[junk_rows_00_10])

# Name the columns so the ones outside the 2000-2009 range can be dropped by name.
df_00_10.columns = ['State', 'April 2000'] + [str(year) for year in range(2000, 2010)] + ['April 2010', '2010']

# Renumber the rows, keep only the yearly columns, and cast the values to int
# (the workbook yields floats).
df_00_10 = (
    df_00_10
    .reset_index(drop=True)
    .drop(columns=['State', 'April 2000', 'April 2010', '2010'])
    .astype(int)
)

# Attach the state names positionally as the 'State' column.
df_00_10['State'] = us_states

# Reshape from one column per year to tidy (State, Year, Population) rows.
df_00_10 = df_00_10.melt(id_vars='State', var_name='Year', value_name='Population')

df_00_10.tail()

Unnamed: 0,State,Year,Population
495,Virginia,2009,7925937
496,Washington,2009,6667426
497,West Virginia,2009,1847775
498,Wisconsin,2009,5669264
499,Wyoming,2009,559851


## State Populations 2010-2016

This code cleans and tidies the dataset on the populations of all US states from 2010-2016 from the US Census Bureau.

In [10]:
# Load the 2010-2016 workbook without treating any row as a header.
df_10_16 = pd.read_excel('./data/us_state_populations_2010_2016.xls', header=None)

# Discard the unneeded rows (by index label) and the first three columns, then cast the
# remaining values from float to int. The row index is intentionally not renumbered;
# the steps below do not depend on it.
df_10_16 = (
    df_10_16
    .drop(index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 17, 60, 61, 62, 63, 64, 65, 66])
    .drop(columns=[0, 1, 2])
    .astype(int)
)

# Name the seven remaining columns after their years.
df_10_16.columns = [str(year) for year in range(2010, 2017)]

# Attach the state names positionally as the 'State' column.
df_10_16['State'] = us_states

# Reshape from one column per year to tidy (State, Year, Population) rows.
df_10_16 = df_10_16.melt(id_vars='State', var_name='Year', value_name='Population')

In [11]:
# Preview the last rows of the tidy 2010-2016 frame as a quick sanity check.
df_10_16.tail()

Unnamed: 0,State,Year,Population
345,Virginia,2016,8411808
346,Washington,2016,7288000
347,West Virginia,2016,1831102
348,Wisconsin,2016,5778708
349,Wyoming,2016,585501


In [14]:
# Stack the per-decade frames into one long table and write it to disk.
# 'State', 'Year' and 'Population' stay ordinary columns. ignore_index=True gives the
# combined frame a single 0..N-1 index; without it each decade's own 0-based labels
# would repeat in the frame and in the first column of the CSV.
us_state_pop = pd.concat([df_70_79, df_80_90, df_90_00, df_00_10, df_10_16], ignore_index=True)
us_state_pop.to_csv('./data/US State Populations (1970-2016).csv')