In [1]:
import requests

In [3]:
page = requests.get("https://raw.githubusercontent.com/bernardonugroho/storage/master/first.html")

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>



In [5]:
list(soup.children)

['html', '\n', <html>
 <head>
 <title>
    A simple example page
   </title>
 </head>
 <body>
 <p>
    Here is some simple content for this page.
   </p>
 </body>
 </html>, '\n']

In [6]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [7]:
html = list(soup.children)[2]

In [9]:
print(html)

<html>
<head>
<title>
   A simple example page
  </title>
</head>
<body>
<p>
   Here is some simple content for this page.
  </p>
</body>
</html>


In [10]:
list(html.children)

['\n', <head>
 <title>
    A simple example page
   </title>
 </head>, '\n', <body>
 <p>
    Here is some simple content for this page.
   </p>
 </body>, '\n']

In [13]:
body = list(html.children)[3]
list(body.children)

['\n', <p>
    Here is some simple content for this page.
   </p>, '\n']

# Finding all instances of a tag at once

In [14]:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>
    Here is some simple content for this page.
   </p>]

# Searching for tags by class and id

In [17]:
page = requests.get("https://raw.githubusercontent.com/bernardonugroho/storage/master/second.html")
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [19]:
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [20]:
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

# Downloading weather data

In [21]:
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br>
   <br/>
  </br>
 </p>
 <p>
  <img alt="Tonight: A 20 percent chance of showers after midnight.  Mostly cloudy, with a low around 53. Breezy, with a west wind 20 to 22 mph, with gusts as high as 29 mph.  New precipitation amounts of less than a tenth of an inch possible. " class="forecast-icon" src="DualImage.php?i=nwind_bkn&amp;j=nshra&amp;jp=20" title="Tonight: A 20 percent chance of showers after midnight.  Mostly cloudy, with a low around 53. Breezy, with a west wind 20 to 22 mph, with gusts as high as 29 mph.  New precipitation amounts of less than a tenth of an inch possible. "/>
 </p>
 <p class="short-desc">
  Breezy.
  <br>
   Mostly Cloudy
   <br>
    then Slight
    <br>
     Chance
     <br>
      Showers
     </br>
    </br>
   </br>
  </br>
 </p>
 <p class="temp temp-low">
  Low: 53 °F
 </p>
</div>


In [22]:
img = tonight.find("img")
desc = img['title']
print(desc)

Tonight: A 20 percent chance of showers after midnight.  Mostly cloudy, with a low around 53. Breezy, with a west wind 20 to 22 mph, with gusts as high as 29 mph.  New precipitation amounts of less than a tenth of an inch possible. 


# Extracting all information from the page

In [24]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Wednesday',
 'WednesdayNight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight']

In [37]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]
print(short_descs)
print(temps)
print(descs)

['Breezy.Mostly Cloudythen SlightChanceShowers', 'Slight ChanceShowers thenPartly Sunnyand Breezy', 'Partly Cloudyand Breezythen PatchyFog', 'Patchy Fogthen MostlySunny', 'Mostly Clearand Breezythen PatchyFog', 'Patchy Fogthen MostlySunny', 'Mostly Clearthen PatchyFog', 'Patchy Fogthen MostlySunny', 'Partly Cloudythen PatchyFog']
['Low: 53 °F', 'High: 68 °F', 'Low: 54 °F', 'High: 69 °F', 'Low: 53 °F', 'High: 73 °F', 'Low: 53 °F', 'High: 70 °F', 'Low: 52 °F']
['Tonight: A 20 percent chance of showers after midnight.  Mostly cloudy, with a low around 53. Breezy, with a west wind 20 to 22 mph, with gusts as high as 29 mph.  New precipitation amounts of less than a tenth of an inch possible. ', 'Wednesday: A 20 percent chance of showers before noon.  Partly sunny, with a high near 68. Breezy, with a west wind 17 to 23 mph, with gusts as high as 30 mph.  New precipitation amounts of less than a tenth of an inch possible. ', 'Wednesday Night: Patchy fog after midnight.  Otherwise, mostly clo

# Combinig our data into Pandas DataFrame

In [38]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

weather = pd.DataFrame({"period": periods,"short_desc": short_descs,"temp": temps,"desc": descs})
weather

Unnamed: 0,desc,period,short_desc,temp
0,Tonight: A 20 percent chance of showers after ...,Tonight,Breezy.Mostly Cloudythen SlightChanceShowers,Low: 53 °F
1,Wednesday: A 20 percent chance of showers befo...,Wednesday,Slight ChanceShowers thenPartly Sunnyand Breezy,High: 68 °F
2,Wednesday Night: Patchy fog after midnight. O...,WednesdayNight,Partly Cloudyand Breezythen PatchyFog,Low: 54 °F
3,"Thursday: Patchy fog before noon. Otherwise, ...",Thursday,Patchy Fogthen MostlySunny,High: 69 °F
4,Thursday Night: Patchy fog after midnight. Ot...,ThursdayNight,Mostly Clearand Breezythen PatchyFog,Low: 53 °F
5,"Friday: Patchy fog before noon. Otherwise, mo...",Friday,Patchy Fogthen MostlySunny,High: 73 °F
6,Friday Night: Patchy fog after midnight. Othe...,FridayNight,Mostly Clearthen PatchyFog,Low: 53 °F
7,"Saturday: Patchy fog before noon. Otherwise, ...",Saturday,Patchy Fogthen MostlySunny,High: 70 °F
8,Saturday Night: Patchy fog after midnight. Ot...,SaturdayNight,Partly Cloudythen PatchyFog,Low: 52 °F
