In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os


# Any results you write to the current directory are saved as output.

The components of a web page
-  HTML — contain the main content of the page.
- CSS — add styling to make the page look nicer.
- JS — Javascript files add interactivity to web pages.
- Images — image formats, such as JPG and PNG allow web pages to show pictures.



In [2]:
# Example
#<html>
#    <head>
#    </head>
#    <body>
#        <p>
#            Here's a paragraph of text!
#            <a href="https://www.dataquest.io">Learn Data Science Online</a>
#        </p>
#        <p>
#            Here's a second paragraph of text!
#            <a href="https://www.python.org">Python</a>
#        </p>
#    </body>
#</html>

- div — indicates a division, or area, of the page.
- b — bolds any text inside.
- i — italicizes any text inside.
- table — creates a table.
- form — creates an input form.

In [5]:
# first thing to do web scraping is to downlode the page
# we can do that with a lib 'requests'
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [6]:
# Status code indicates if the page is downloded successfully
page.status_code

# starting digit 2 means successfull
# 4 & 5 indicates error

200

In [7]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [8]:
# now we use beautifulSoup to parse the data

# creating an instance
data = BeautifulSoup(page.content, 'html.parser')

In [9]:
# prettify - used to print html content in proper format

print(data.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [10]:
 # selecting all the elements at the top level of the page.

list(data.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [11]:
# finding the type of elements 

[type(i) for i in list(data.children)]

# Doctype object, which contains information about the type of the document.
# NavigableString, which represents text found in the HTML document
# tag object, which contains other nested tags.

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [12]:
html = list(data.children)[2]
html
head = list(html.children)[1]
body = list(head.children)
txt = body[1]
txt.get_text()

'A simple example page'

In [13]:
# finding all the tags at once

data.find_all('p')

# find all returns a list

[<p>Here is some simple content for this page.</p>]

In [14]:
data.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [15]:
# finding the first instance 

data.find('p')

<p>Here is some simple content for this page.</p>

In [19]:
# learning how to use class and id 

page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
pg_dt = BeautifulSoup(page.content, 'html.parser')
pg_dt

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [21]:
# find all method using class
pg_dt.find_all('p', class_ = 'outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [22]:
# all the tags with a given tag
pg_dt.find_all(class_ = "outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [23]:
pg_dt.find_all(id = "first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [25]:
"""
p a — finds all a tags inside of a p tag.

body p a — finds all a tags inside of a p tag inside of a body tag.

html body — finds all body tags inside of an html tag.

p.outer-text — finds all p tags with a class of outer-text.

p#first — finds all p tags with an id of first.

body p.outer-text — finds any p tags with a class of outer-text inside of a body tag.
"""

'\np a — finds all a tags inside of a p tag.\n\nbody p a — finds all a tags inside of a p tag inside of a body tag.\n\nhtml body — finds all body tags inside of an html tag.\n\np.outer-text — finds all p tags with a class of outer-text.\n\np#first — finds all p tags with an id of first.\n\nbody p.outer-text — finds any p tags with a class of outer-text inside of a body tag.\n'

In [36]:
# extracting data from "https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.W5mHm-hKiMo"

page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.W5mHm-hKiMo")
soup = BeautifulSoup(page.content, 'html.parser')

days = soup.find(id = 'seven-day-forecast')

forcast_items = days.find_all(class_ = 'tombstone-container')

for i in range(len(forcast_items)):
    print(forcast_items[i].find('p', class_ = 'period-name').get_text())

Tonight
Thursday
ThursdayNight
Friday
FridayNight
Saturday
SaturdayNight
Sunday
SundayNight


In [37]:
forcast_items[0]

<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Mostly cloudy, with a low around 55. Breezy, with a west wind 15 to 23 mph, with gusts as high as 30 mph. " class="forecast-icon" src="DualImage.php?i=nwind_bkn&amp;j=nbkn" title="Tonight: Mostly cloudy, with a low around 55. Breezy, with a west wind 15 to 23 mph, with gusts as high as 30 mph. "/></p><p class="short-desc">Mostly Cloudy<br/>and Breezy<br/>then Mostly<br/>Cloudy</p><p class="temp temp-low">Low: 55 °F</p></div>

In [40]:
# description of toningt is in img tag
img = forcast_items[0].find('img')
img

<img alt="Tonight: Mostly cloudy, with a low around 55. Breezy, with a west wind 15 to 23 mph, with gusts as high as 30 mph. " class="forecast-icon" src="DualImage.php?i=nwind_bkn&amp;j=nbkn" title="Tonight: Mostly cloudy, with a low around 55. Breezy, with a west wind 15 to 23 mph, with gusts as high as 30 mph. "/>

In [41]:
img['title']

'Tonight: Mostly cloudy, with a low around 55. Breezy, with a west wind 15 to 23 mph, with gusts as high as 30 mph. '

In [44]:
# now we will collect all at once

period_tags = days.select('.tombstone-container .period-name')
period_tags

[<p class="period-name">Tonight<br/><br/></p>,
 <p class="period-name">Thursday<br/><br/></p>,
 <p class="period-name">Thursday<br/>Night</p>,
 <p class="period-name">Friday<br/><br/></p>,
 <p class="period-name">Friday<br/>Night</p>,
 <p class="period-name">Saturday<br/><br/></p>,
 <p class="period-name">Saturday<br/>Night</p>,
 <p class="period-name">Sunday<br/><br/></p>,
 <p class="period-name">Sunday<br/>Night</p>]

In [45]:
periods = [x.get_text() for x in period_tags]
periods

['Tonight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight']

In [53]:
description = [x['title'] for x in days.select(".tombstone-container img")]
short_description = [y.get_text() for y in days.select(".tombstone-container .short-desc")]
temp = [z.get_text() for z in days.select(".tombstone-container .temp")]
temp

['Low: 55 °F',
 'High: 63 °F',
 'Low: 54 °F',
 'High: 64 °F',
 'Low: 54 °F',
 'High: 64 °F',
 'Low: 54 °F',
 'High: 65 °F',
 'Low: 54 °F']

In [54]:
# creating a table

weather = pd.DataFrame({"period" : periods,
                       "short_desc" : short_description,
                       "description" : description,
                       "Temperature" : temp})
weather

Unnamed: 0,Temperature,description,period,short_desc
0,Low: 55 °F,"Tonight: Mostly cloudy, with a low around 55. ...",Tonight,Mostly Cloudyand Breezythen MostlyCloudy
1,High: 63 °F,"Thursday: Mostly sunny, with a high near 63. B...",Thursday,Partly Sunnythen MostlySunny andBreezy
2,Low: 54 °F,"Thursday Night: Partly cloudy, with a low arou...",ThursdayNight,Partly Cloudyand Breezythen MostlyCloudy
3,High: 64 °F,"Friday: Mostly cloudy, then gradually becoming...",Friday,DecreasingClouds
4,Low: 54 °F,"Friday Night: Partly cloudy, with a low around...",FridayNight,Partly Cloudy
5,High: 64 °F,"Saturday: Mostly sunny, with a high near 64. B...",Saturday,Mostly Sunnyand Breezy
6,Low: 54 °F,"Saturday Night: Partly cloudy, with a low arou...",SaturdayNight,Partly Cloudy
7,High: 65 °F,"Sunday: Mostly sunny, with a high near 65.",Sunday,Mostly Sunny
8,Low: 54 °F,"Sunday Night: Partly cloudy, with a low around...",SundayNight,Partly Cloudy
