## Extracting data using Web Scraping
We use Beautiful Soup to extract data from a webpage. Additionally, using the pandas library, we show the difference between printing and displaying a DataFrame.

In [1]:
!pip install pandas==1.3.3
!pip install requests==2.26.0
!mamba install bs4==4.10.0 -y
!mamba install html5lib==1.1 -y
!pip install lxml==4.6.4
!pip install plotly==5.3.1

zsh:1: command not found: mamba
zsh:1: command not found: mamba


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Address of the web page to scrape
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/amazon_data_webpage.html"

# Download the page with the requests library.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response (4xx/5xx) into an exception
# instead of silently handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

In [15]:
# The downloaded text is raw HTML markup, which is why BeautifulSoup is needed
# to interpret it. Only the beginning is shown: printing the whole page would
# flood the notebook output.
print(html_data[:1000])

In [5]:
# Parse the downloaded HTML with Beautiful Soup (parsing converts data from one
# representation to another). html5lib is a pure-Python library for parsing HTML.
soup = BeautifulSoup(html_data, 'html5lib')

# Empty DataFrame that will receive the rows of the HTML table.
# All seven columns that get filled in when the rows are scraped are declared
# here, including "Adj Close".
amazon_data = pd.DataFrame(
    columns=["Date", "Open", "High", "Low", "Close", "Volume", "Adj Close"]
)

In [6]:


# First isolate the body of the table, which contains all the data rows,
# then walk through each row and read the text of its seven cells.
# The rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append copies the whole frame on every call and is no longer
# available in pandas 2.0+.
column_names = ["Date", "Open", "High", "Low", "Close", "Volume", "Adj Close"]
records = []

for row in soup.find("tbody").find_all('tr'):
    col = row.find_all("td")
    records.append({
        "Date": col[0].text,
        "Open": col[1].text,
        "High": col[2].text,
        "Low": col[3].text,
        "Close": col[4].text,
        "Adj Close": col[5].text,
        "Volume": col[6].text,
    })

# Build the table in a single step; `columns` fixes the column order.
amazon_data = pd.DataFrame(records, columns=column_names)


In [8]:
# Preview the top five rows of the scraped table
amazon_data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jan 01, 2021",3270.0,3363.89,3086.0,3206.2,71528900,3206.2
1,"Dec 01, 2020",3188.5,3350.65,3072.82,3256.93,77556200,3256.93
2,"Nov 01, 2020",3061.74,3366.8,2950.12,3168.04,90810500,3168.04
3,"Oct 01, 2020",3208.0,3496.24,3019.0,3036.15,116226100,3036.15
4,"Sep 01, 2020",3489.58,3552.25,2871.0,3148.73,115899300,3148.73


In [13]:
# print() renders the DataFrame as plain text; columns that do not fit the
# line width are wrapped into a second block below the first.
print(amazon_data)

            Date      Open      High       Low     Close       Volume  \
0   Jan 01, 2021  3,270.00  3,363.89  3,086.00  3,206.20   71,528,900   
1   Dec 01, 2020  3,188.50  3,350.65  3,072.82  3,256.93   77,556,200   
2   Nov 01, 2020  3,061.74  3,366.80  2,950.12  3,168.04   90,810,500   
3   Oct 01, 2020  3,208.00  3,496.24  3,019.00  3,036.15  116,226,100   
4   Sep 01, 2020  3,489.58  3,552.25  2,871.00  3,148.73  115,899,300   
..           ...       ...       ...       ...       ...          ...   
56  May 01, 2016    663.92    724.23    656.00    722.79   90,614,500   
57  Apr 01, 2016    590.49    669.98    585.25    659.59   78,464,200   
58  Mar 01, 2016    556.29    603.24    538.58    593.64   94,009,500   
59  Feb 01, 2016    578.15    581.80    474.00    552.52  124,144,800   
60  Jan 01, 2016    656.29    657.72    547.18    587.00  130,200,900   

   Adj Close  
0   3,206.20  
1   3,256.93  
2   3,168.04  
3   3,036.15  
4   3,148.73  
..       ...  
56    722.79  
57 

In [14]:
# display() uses the notebook's rich representation of the DataFrame
# (a formatted table) instead of the plain-text form produced by print().
display(amazon_data)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jan 01, 2021",3270.00,3363.89,3086.00,3206.20,71528900,3206.20
1,"Dec 01, 2020",3188.50,3350.65,3072.82,3256.93,77556200,3256.93
2,"Nov 01, 2020",3061.74,3366.80,2950.12,3168.04,90810500,3168.04
3,"Oct 01, 2020",3208.00,3496.24,3019.00,3036.15,116226100,3036.15
4,"Sep 01, 2020",3489.58,3552.25,2871.00,3148.73,115899300,3148.73
...,...,...,...,...,...,...,...
56,"May 01, 2016",663.92,724.23,656.00,722.79,90614500,722.79
57,"Apr 01, 2016",590.49,669.98,585.25,659.59,78464200,659.59
58,"Mar 01, 2016",556.29,603.24,538.58,593.64,94009500,593.64
59,"Feb 01, 2016",578.15,581.80,474.00,552.52,124144800,552.52
