# Scraping with Pandas

In [None]:
import pandas as pd

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [None]:
# Wikipedia article whose tables are scraped in the cells below.
url = "https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States"

In [None]:
# Hand the page URL to pandas and let it parse the tabular data it finds there.
tables = pd.read_html(url)
# Bare expression so the notebook displays the result.
tables

#### What we get in return is a list of dataframes for any tabular data that Pandas found.

In [None]:
# Inspect what kind of container read_html handed back.
type(tables)

#### We can slice off any of those dataframes that we want using normal indexing.

In [None]:
# Pick the first parsed table and preview its first few rows.
df = tables[0]
df.head()

#### Flatten the column header and drop stray header rows

In [None]:
# Keep only the top level of the column header so each column can be
# addressed by a single name (presumably the scraped header is multi-level).
df.columns = df.columns.get_level_values(0)

# Keep only the rows whose Ref cell starts with "[".
# na=False makes a missing Ref count as "no match"; without it the mask would
# contain NaN and .loc would raise instead of filtering. Note that rows with a
# missing Ref are therefore dropped along with the other non-matching rows.
has_ref = df["Ref"].str.startswith("[", na=False)
df = df.loc[has_ref]
df.head()

#### Split column values into two separate columns

In [None]:
# Break each City value apart on ", "; expand=True returns the pieces as
# separate, integer-labelled columns.
columnsplit = df["City"].str.split(pat=", ", expand=True)

# Piece 0 replaces City, piece 1 becomes the new State column.
df = df.assign(
    City=columnsplit[0],
    State=columnsplit[1],
)
df.head()

#### Drop a column

In [None]:
# Remove the Ref column; it is not needed in the final table.
df = df.drop(columns=["Ref"])
df.head()

#### Reset the index

In [None]:
# Renumber the rows; drop=True discards the old index instead of keeping it
# as an extra column.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Show only the rows whose State is exactly "New York".
df.loc[df["State"].eq("New York")]

## DataFrames as HTML

#### Pandas also has a `to_html` method that we can use to generate HTML tables from DataFrames.

In [None]:
# Called without a target, to_html hands the markup back as a string.
html_table = df.to_html()
# Bare expression so the notebook displays the raw markup.
html_table

#### You may have to strip unwanted newlines to clean up the table.

In [None]:
# Remove every newline by splitting on it and gluing the pieces back together.
"".join(html_table.split("\n"))

You can also save the table directly to a file.

In [None]:
# Passing a path as the buffer writes the markup to that file instead of
# returning it.
df.to_html(buf="table.html")

In [None]:
# macOS users can run this cell to open the generated file in a browser.
# On other platforms, find table.html manually and open it in a browser.
# The leading "!" makes the notebook run the rest of the line as a shell command.
!open table.html