# Sourcing Data from HTML Tables with Pandas

In [2]:
import pandas as pd

We can use the `read_html` function in Pandas to automatically extract any tabular data from a page.

In [3]:
# Wikipedia page whose HTML tables are parsed by pd.read_html in the next cell.
url = 'https://en.wikipedia.org/wiki/List_of_Australian_capital_cities'

In [4]:
# read_html returns a list with one DataFrame per table it can parse from the page.
tables = pd.read_html(url)

# NOTE(review): echoing the whole list prints every parsed table at once;
# len(tables) or tables[i].head() gives a more readable overview.
tables

[                                                   0
 0  WESTERN AUSTRALIA NORTHERN TERRITORY SOUTH AUS...,
                 State/territory    Capital  City population[2]  \
 0               New South Wales     Sydney             5029768   
 1                      Victoria  Melbourne             4725316   
 2                    Queensland   Brisbane             2360241   
 3             Western Australia      Perth             2022044   
 4               South Australia   Adelaide             1324279   
 5                      Tasmania     Hobart              224462   
 6  Australian Capital Territory   Canberra              403468   
 7            Northern Territory     Darwin              145916   
 
    State/territory population[3]  \
 0                        7759274   
 1                        6179249   
 2                        4848877   
 3                        2558951   
 4                        1713054   
 5                         517588   
 6                         

In [11]:
# First parsed table: a single cell of region names, not tabular data
# (presumably the page's map legend; verify against the page).
tables[0]

Unnamed: 0,0
0,WESTERN AUSTRALIA NORTHERN TERRITORY SOUTH AUS...


In [12]:
# Second parsed table: the capital-city data, one row per state/territory.
tables[1]

Unnamed: 0,State/territory,Capital,City population[2],State/territory population[3],Percentage of state/territory population in capital city,Established,Capital since,Image
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788,
1,Victoria,Melbourne,4725316,6179249,76.47%,1835,1851,
2,Queensland,Brisbane,2360241,4848877,48.68%,1825,1860,
3,Western Australia,Perth,2022044,2558951,79.02%,1829,1829,
4,South Australia,Adelaide,1324279,1713054,77.31%,1836,1836,
5,Tasmania,Hobart,224462,517588,43.37%,1804,1826,
6,Australian Capital Territory,Canberra,403468,403468,100.00%,1913,1913,
7,Northern Territory,Darwin,145916,245740,59.38%,1869,1911,


In [13]:
# Third parsed table: headed "Australia articles" -- it looks like the page's
# navigation box rather than data.
tables[2]

Unnamed: 0,vteAustralia articles,vteAustralia articles.1
0,History,Timeline Bibliography Prehistory Archaeology E...
1,Timeline Bibliography Prehistory Archaeology E...,Timeline Bibliography Prehistory Archaeology E...
2,By topic,Asian Australians Constitutional Diplomatic Ec...
3,Geography,Climate Climate change Continent Deserts Envir...
4,Climate Climate change Continent Deserts Envir...,Climate Climate change Continent Deserts Envir...
5,Subdivisions,States and territories Capitals Cities
6,Politics,Asylum Constitution Courts Elections Donations...
7,Asylum Constitution Courts Elections Donations...,Asylum Constitution Courts Elections Donations...
8,Economy,Agriculture Dollar Energy Gross state product ...
9,Agriculture Dollar Energy Gross state product ...,Agriculture Dollar Energy Gross state product ...


In [14]:
# Fourth parsed table: repeats rows already seen in tables[2]
# (presumably a nested table inside the same navigation box).
tables[3]

Unnamed: 0,0,1
0,Timeline Bibliography Prehistory Archaeology E...,Timeline Bibliography Prehistory Archaeology E...
1,By topic,Asian Australians Constitutional Diplomatic Ec...


In [15]:
# Fifth parsed table: another fragment repeating rows from tables[2].
tables[4]

Unnamed: 0,0,1
0,Climate Climate change Continent Deserts Envir...,Climate Climate change Continent Deserts Envir...
1,Subdivisions,States and territories Capitals Cities


#### What we get in return is a list of DataFrames for any tabular data that Pandas found.

In [17]:
# Only the last expression of a cell is echoed, so the output shown is len(tables);
# the result of type(tables) is discarded unless it is wrapped in print()/display().
type(tables)
len(tables)

10

#### We can pick out any one of those DataFrames using normal list indexing.

In [18]:
# Work on a copy of the capital-cities table so that later edits to `df`
# (e.g. assigning new column names) do not also alter tables[1], which has
# already been displayed above.
df = tables[1].copy()
df.head(5)

Unnamed: 0,State/territory,Capital,City population[2],State/territory population[3],Percentage of state/territory population in capital city,Established,Capital since,Image
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788,
1,Victoria,Melbourne,4725316,6179249,76.47%,1835,1851,
2,Queensland,Brisbane,2360241,4848877,48.68%,1825,1860,
3,Western Australia,Perth,2022044,2558951,79.02%,1829,1829,
4,South Australia,Adelaide,1324279,1713054,77.31%,1836,1836,


#### Fix column names

In [22]:
# Inspect the raw column labels; two of them carry footnote markers ([2], [3]).
df.columns

Index(['State/territory', 'Capital', 'City population[2]',
       'State/territory population[3]',
       'Percentage of state/territory population in capital city',
       'Established', 'Capital since', 'Image'],
      dtype='object')

In [23]:
# Show the whole frame (8 rows) before changing any column names.
df

Unnamed: 0,State/territory,Capital,City population[2],State/territory population[3],Percentage of state/territory population in capital city,Established,Capital since,Image
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788,
1,Victoria,Melbourne,4725316,6179249,76.47%,1835,1851,
2,Queensland,Brisbane,2360241,4848877,48.68%,1825,1860,
3,Western Australia,Perth,2022044,2558951,79.02%,1829,1829,
4,South Australia,Adelaide,1324279,1713054,77.31%,1836,1836,
5,Tasmania,Hobart,224462,517588,43.37%,1804,1826,
6,Australian Capital Territory,Canberra,403468,403468,100.00%,1913,1913,
7,Northern Territory,Darwin,145916,245740,59.38%,1869,1911,


In [24]:
# Rename by label instead of by position: the mapping documents exactly which
# columns change and does not depend on the column order of the scraped table.
# rename() returns a new frame, so re-running this cell is harmless.
df = df.rename(columns={
    "City population[2]": "City Population",
    "State/territory population[3]": "State/territory population",
})

# rename() silently skips labels it cannot find, so fail loudly if the
# footnote numbers on the source page have changed and nothing was renamed.
assert {"City Population", "State/territory population"} <= set(df.columns)

df.head(5)

Unnamed: 0,State/territory,Capital,City Population,State/territory population,Percentage of state/territory population in capital city,Established,Capital since,Image
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788,
1,Victoria,Melbourne,4725316,6179249,76.47%,1835,1851,
2,Queensland,Brisbane,2360241,4848877,48.68%,1825,1860,
3,Western Australia,Perth,2022044,2558951,79.02%,1829,1829,
4,South Australia,Adelaide,1324279,1713054,77.31%,1836,1836,


In [None]:
# Label-based renaming, for reference:
# df.rename(columns={"old name": "new name"})

#### Drop a column

In [26]:
# Remove the "Image" column. errors="ignore" keeps the cell re-runnable:
# without it, a second execution raises KeyError because the column is already gone.
df = df.drop(columns=["Image"], errors="ignore")

#### Reset an index

In [27]:
# drop=True discards the old index instead of inserting it as a new column.
# NOTE(review): the index is already 0..7 at this point, so this cell appears
# to be a demonstration of the call rather than a needed clean-up step.
df = df.reset_index(drop=True)
df

Unnamed: 0,State/territory,Capital,City Population,State/territory population,Percentage of state/territory population in capital city,Established,Capital since
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788
1,Victoria,Melbourne,4725316,6179249,76.47%,1835,1851
2,Queensland,Brisbane,2360241,4848877,48.68%,1825,1860
3,Western Australia,Perth,2022044,2558951,79.02%,1829,1829
4,South Australia,Adelaide,1324279,1713054,77.31%,1836,1836
5,Tasmania,Hobart,224462,517588,43.37%,1804,1826
6,Australian Capital Territory,Canberra,403468,403468,100.00%,1913,1913
7,Northern Territory,Darwin,145916,245740,59.38%,1869,1911


In [28]:
# Position-based lookup: the first row, returned as a Series keyed by column name.
df.iloc[0]

State/territory                                             New South Wales
Capital                                                              Sydney
City Population                                                     5029768
State/territory population                                          7759274
Percentage of state/territory population in capital city             64.82%
Established                                                            1788
Capital since                                                          1788
Name: 0, dtype: object

In [30]:
# Boolean-mask lookup: keep only the rows whose state/territory is New South Wales.
df.loc[df['State/territory'].eq('New South Wales')]

Unnamed: 0,State/territory,Capital,City Population,State/territory population,Percentage of state/territory population in capital city,Established,Capital since
0,New South Wales,Sydney,5029768,7759274,64.82%,1788,1788


## Export DataFrame as CSV

In [31]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv('aussie_wiki.csv', index=False)