https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [1]:
import requests

In [2]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [8]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [10]:
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [12]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [17]:
html = list(soup.children)[2]

In [21]:
body = list(html.children)[3]

In [26]:
p = list(body.children)[1]

In [29]:
p.get_text()

'Here is some simple content for this page.'

In [31]:
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [34]:
soup.find('p').get_text()

'Here is some simple content for this page.'

In [37]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content,'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [38]:
soup.find_all('p',class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [39]:
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [43]:
soup.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [44]:
soup.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [95]:
pagec = requests.get('https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.XA70oVX7TIU').content
ss = BeautifulSoup(pagec,'html.parser')


In [52]:
import pandas as pd

In [98]:
forecast = pd.DataFrame()
forecast['desc'] = [x['title'] for x in ss.select('img.forecast-icon')]
forecast['period'] = [x.get_text() for x in ss.select('p.period-name')]
forecast['sdesc'] = [x.get_text() for x in ss.select('p.short-desc')]
forecast['temp'] = [x.get_text() for x in ss.select('p.temp')]

forecast

Unnamed: 0,desc,period,sdesc,temp
0,"Tonight: Mostly clear, with a low around 47. W...",Tonight,Mostly Clear,Low: 47 °F
1,"Tuesday: Sunny, with a high near 59. Light and...",Tuesday,Sunny,High: 59 °F
2,"Tuesday Night: Mostly cloudy, with a low aroun...",TuesdayNight,Mostly Cloudy,Low: 50 °F
3,"Wednesday: Mostly sunny, with a high near 60. ...",Wednesday,Mostly Sunny,High: 60 °F
4,"Wednesday Night: Partly cloudy, with a low aro...",WednesdayNight,Partly Cloudy,Low: 46 °F
5,"Thursday: Mostly sunny, with a high near 58.",Thursday,Mostly Sunny,High: 58 °F
6,"Thursday Night: Partly cloudy, with a low arou...",ThursdayNight,Partly Cloudy,Low: 48 °F
7,"Friday: A chance of rain. Mostly cloudy, with...",Friday,Chance Rain,High: 58 °F
8,Friday Night: A chance of rain. Mostly cloudy...,FridayNight,Chance Rain,Low: 49 °F


In [114]:
temp_num = forecast.temp.str.extract("(\d+)",expand=False)
forecast['temp_num'] = temp_num.astype('int')

In [116]:
forecast.temp_num.mean()

52.77777777777778

In [118]:
forecast[forecast.temp.str.contains('Low')]


Unnamed: 0,desc,period,sdesc,temp,temp_num
0,"Tonight: Mostly clear, with a low around 47. W...",Tonight,Mostly Clear,Low: 47 °F,47
2,"Tuesday Night: Mostly cloudy, with a low aroun...",TuesdayNight,Mostly Cloudy,Low: 50 °F,50
4,"Wednesday Night: Partly cloudy, with a low aro...",WednesdayNight,Partly Cloudy,Low: 46 °F,46
6,"Thursday Night: Partly cloudy, with a low arou...",ThursdayNight,Partly Cloudy,Low: 48 °F,48
8,Friday Night: A chance of rain. Mostly cloudy...,FridayNight,Chance Rain,Low: 49 °F,49


-----

#### Coursera Capstone
##### without bs4

In [139]:
import pandas as pd
import requests
import numpy as np
import re

strres = str(requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').content)

strres = re.sub(r'\\n|<a .+?>|</a>','',strres)
strres = re.sub(r"\\'","\'",strres)

titles = re.findall(r'<th>(.+?)</th>',strres)
matches = re.findall(r'<td>(.+?)</td>',strres)
matches = matches[0:-1]

In [140]:
ttable = pd.DataFrame()
for i in range(3):
    ttable[titles[i]] = [matches[x] for x in np.arange(i,len(matches),3)]

In [143]:
ttable.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
