In [1]:
import requests
page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html")

In [2]:
page

<Response [200]>

In [3]:
page.status_code

200

In [4]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content,'html.parser')

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [7]:
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [8]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [9]:
html = list(soup.children)[2]

In [10]:
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [11]:
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [14]:
body = list(html.children)[3]

In [15]:
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [16]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [17]:
p=list(body.children)[1]

In [18]:
p.get_text()

'Here is some simple content for this page.'

In [20]:
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [21]:
page = requests.get('https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html')
soup = BeautifulSoup(page.content,'html.parser')

In [22]:
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [25]:
soup.find_all('p',class_='outer-text')

AttributeError: ResultSet object has no attribute 'get_text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

# Web Scaping

In [29]:
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.Yem54OrMKUk')

In [30]:
soup = BeautifulSoup(page.content,'html.parser')

In [44]:
data=soup.find(id='seven-day-forecast-list')

In [59]:
tempData_today = data.find_all(class_='tombstone-container')

In [63]:
today=tempData_today[0]

In [64]:
print(today.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Sunny, with a high near 62. Northwest wind 7 to 10 mph. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 62. Northwest wind 7 to 10 mph. "/>
 </p>
 <p class="short-desc">
  Sunny
 </p>
 <p class="temp temp-high">
  High: 62 °F
 </p>
</div>


In [65]:
period = today.find(class_='period-name').get_text()
short_desc = today.find(class_='short-desc').get_text()
temperature = today.find(class_='temp').get_text()

print(f'{period}\n{short_desc}\n{temperature}')

Today
Sunny
High: 62 °F


In [66]:
img_data = today.find('img')

In [67]:
desc = img_data['title']

In [68]:
print(desc)

Today: Sunny, with a high near 62. Northwest wind 7 to 10 mph. 


# Extracting all the information from the page

In [72]:
period_tag = data.select('#seven-day-forecast-list .tombstone-container .period-name')

In [75]:
days = [day.get_text() for day in period_tag]

In [76]:
days

['Today',
 'Tonight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight',
 'Monday']

In [77]:
desc_data = data.select('#seven-day-forecast-list .tombstone-container img') 

In [78]:
desc = [i['title'] for i in desc_data]

In [79]:
desc

['Today: Sunny, with a high near 62. Northwest wind 7 to 10 mph. ',
 'Tonight: Mostly clear, with a low around 47. West wind 5 to 8 mph becoming calm  after midnight. ',
 'Friday: Sunny, with a high near 65. North northwest wind 9 to 15 mph, with gusts as high as 20 mph. ',
 'Friday Night: Mostly clear, with a low around 48. Breezy, with a north northeast wind 21 to 26 mph decreasing to 14 to 19 mph after midnight. Winds could gust as high as 33 mph. ',
 'Saturday: Sunny, with a high near 63. Northeast wind 10 to 17 mph, with gusts as high as 22 mph. ',
 'Saturday Night: Mostly clear, with a low around 45.',
 'Sunday: Sunny, with a high near 61.',
 'Sunday Night: Mostly clear, with a low around 46.',
 'Monday: Sunny, with a high near 60.']

In [82]:
weather_data = data.select('#seven-day-forecast-list .tombstone-container .short-desc') 

In [83]:
weather = [waet.get_text() for waet in weather_data]

In [84]:
weather

['Sunny',
 'Mostly Clear',
 'Sunny',
 'Mostly Clearand Breezythen MostlyClear',
 'Sunny',
 'Mostly Clear',
 'Sunny',
 'Mostly Clear',
 'Sunny']

In [85]:
temp_reading = data.select('#seven-day-forecast-list .tombstone-container .temp') 

temp = [t.get_text() for t in temp_reading]

In [86]:
temp

['High: 62 °F',
 'Low: 47 °F',
 'High: 65 °F',
 'Low: 48 °F',
 'High: 63 °F',
 'Low: 45 °F',
 'High: 61 °F',
 'Low: 46 °F',
 'High: 60 °F']

# Combining the data into the Dataframe

In [87]:
import pandas as pd
df_weather = pd.DataFrame({
    "Period":days,
    "description":desc,
    "short_desc":weather,
    "temperature":temp
})

In [88]:
df_weather

Unnamed: 0,Period,description,short_desc,temperature
0,Today,"Today: Sunny, with a high near 62. Northwest w...",Sunny,High: 62 °F
1,Tonight,"Tonight: Mostly clear, with a low around 47. W...",Mostly Clear,Low: 47 °F
2,Friday,"Friday: Sunny, with a high near 65. North nort...",Sunny,High: 65 °F
3,FridayNight,"Friday Night: Mostly clear, with a low around ...",Mostly Clearand Breezythen MostlyClear,Low: 48 °F
4,Saturday,"Saturday: Sunny, with a high near 63. Northeas...",Sunny,High: 63 °F
5,SaturdayNight,"Saturday Night: Mostly clear, with a low aroun...",Mostly Clear,Low: 45 °F
6,Sunday,"Sunday: Sunny, with a high near 61.",Sunny,High: 61 °F
7,SundayNight,"Sunday Night: Mostly clear, with a low around 46.",Mostly Clear,Low: 46 °F
8,Monday,"Monday: Sunny, with a high near 60.",Sunny,High: 60 °F


In [89]:
df_weather.dtypes

Period         object
description    object
short_desc     object
temperature    object
dtype: object

In [91]:
temp_nums = df_weather["temperature"].str.extract("(?P<temp_num>\d+)", expand=False)

In [94]:
df_weather['temperature_value'] = temp_nums.astype('int')

In [95]:
df_weather

Unnamed: 0,Period,description,short_desc,temperature,temperature_value
0,Today,"Today: Sunny, with a high near 62. Northwest w...",Sunny,High: 62 °F,62
1,Tonight,"Tonight: Mostly clear, with a low around 47. W...",Mostly Clear,Low: 47 °F,47
2,Friday,"Friday: Sunny, with a high near 65. North nort...",Sunny,High: 65 °F,65
3,FridayNight,"Friday Night: Mostly clear, with a low around ...",Mostly Clearand Breezythen MostlyClear,Low: 48 °F,48
4,Saturday,"Saturday: Sunny, with a high near 63. Northeas...",Sunny,High: 63 °F,63
5,SaturdayNight,"Saturday Night: Mostly clear, with a low aroun...",Mostly Clear,Low: 45 °F,45
6,Sunday,"Sunday: Sunny, with a high near 61.",Sunny,High: 61 °F,61
7,SundayNight,"Sunday Night: Mostly clear, with a low around 46.",Mostly Clear,Low: 46 °F,46
8,Monday,"Monday: Sunny, with a high near 60.",Sunny,High: 60 °F,60


In [96]:
!git.init()

'git.init' is not recognized as an internal or external command,
operable program or batch file.
