In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape the table of United States cities by population from Wikipedia

In [2]:
# Download the Wikipedia page that holds the city population table.
url = 'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail right here on a 4xx/5xx answer instead of parsing an error page later.
response.raise_for_status()
# Display the status code; 200 means success.
response.status_code

200

In [3]:
# Parse the downloaded HTML and grab the first table whose class attribute
# is exactly "wikitable sortable".
page = response.text
soup = BeautifulSoup(page, "lxml")
table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; stop with a clear message
# rather than letting a later cell fail with an AttributeError on None.
if table is None:
    raise ValueError(
        "No <table class='wikitable sortable'> found on the page; "
        "the page layout may have changed."
    )

In [4]:
# List the table's header labels so we can decide which columns to extract.
# For each <th>, keep only the text before the first newline and strip the
# '[c]' footnote marker.
header_cells = table.findAll("th")
column_names = [
    header.text.split('\n')[0].replace('[c]', '')
    for header in header_cells
]
print(column_names)

['2018rank', 'City', 'State', '2018estimate', '2010Census', 'Change', '2016 land area', '2016 population density', 'Location']


In [5]:
# Extract the wanted fields from every table row and load them into a
# pandas DataFrame in one step.

# The last cell read below is cells[8], so a usable row needs at least 9 <td>s.
MIN_CELLS = 9

records = []
for row in table.find_all('tr')[1:]:  # [1:] skips the first (header) row
    cells = row.find_all('td')
    if len(cells) < MIN_CELLS:
        # Rows without the full set of <td> cells would raise on the lookups
        # below, so they are skipped instead of aborting the scrape.
        continue
    records.append({
        'rank': cells[0].text.split('\n')[0],
        # str() converts bs4 NavigableString objects to plain strings so the
        # DataFrame does not keep references into the parse tree.
        'City': str(cells[1].find_all(string=True)[0]),
        # The state name is the second text node of the cell
        # (presumably the first one belongs to the flag icon; confirm).
        'State': str(cells[2].find_all(string=True)[1]),
        'estimate2018': cells[3].find_all(string=True)[0].split('\n')[0],
        # Keep only the part before the '/' of the density text.
        'population_density_sqmi': cells[8].find_all(string=True)[0].split('/')[0],
    })

df = pd.DataFrame(
    records,
    columns=['rank', 'City', 'State', 'estimate2018', 'population_density_sqmi'],
)
df.head()

Unnamed: 0,rank,City,State,estimate2018,population_density_sqmi
0,1,New York,New York,8398748,28317
1,2,Los Angeles,California,3990456,8484
2,3,Chicago,Illinois,2705994,11900
3,4,Houston,Texas,2325502,3613
4,5,Phoenix,Arizona,1660272,3120


In [6]:
# Save the scraped table as CSV. The DataFrame index is written as the first
# (unnamed) column, which is the to_csv default.
output_path = Path('data/most_populous_cities.csv')
# Create the output folder if needed so the write does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)