<a href="https://colab.research.google.com/github/rasyandazn/Learning/blob/main/Web_Scraping_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install yfinance
!pip install bs4
!pip install nbformat
!pip install --upgrade plotly

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting plotly
  Downloading plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Downloading plotly-6.1.2-py3-none-any.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.24.1
    Uninstalling plotly-5.24.1:
      Successfully uninstalled plotly-5.24.1
Successfully installed plotly-6.1.2


In [2]:
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.io as pio
pio.renderers.default = "iframe"

import warnings
# Ignore all warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Tesla Stock

In [3]:
# Use yfinance to extract stock data

tesla = yf.Ticker("TSLA")

In [4]:
# Retrieve historical stock data and set the period to 'max' to get data from the beginning until today

tesla_data = tesla.history(period="max")

In [5]:
# By default, the DataFrame returned by .history() uses Date as the index.
# Using reset_index(inplace=True) changes that index into a regular column named "Date".
tesla_data.reset_index(inplace=True)

# Show the first 5 columns
print(tesla_data.head())

                       Date      Open      High       Low     Close  \
0 2010-06-29 00:00:00-04:00  1.266667  1.666667  1.169333  1.592667   
1 2010-06-30 00:00:00-04:00  1.719333  2.028000  1.553333  1.588667   
2 2010-07-01 00:00:00-04:00  1.666667  1.728000  1.351333  1.464000   
3 2010-07-02 00:00:00-04:00  1.533333  1.540000  1.247333  1.280000   
4 2010-07-06 00:00:00-04:00  1.333333  1.333333  1.055333  1.074000   

      Volume  Dividends  Stock Splits  
0  281494500        0.0           0.0  
1  257806500        0.0           0.0  
2  123282000        0.0           0.0  
3   77097000        0.0           0.0  
4  103003500        0.0           0.0  


In [48]:
# Use webscraping to extract revenue data

# Step 1
import requests
url_tesla = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/revenue.htm"
response_tesla = requests.get(url_tesla)

html_data = response_tesla.text

In [25]:
# Step 2: Parsing the data

soup = BeautifulSoup(html_data,'html.parser')

In [26]:
# Step 3: Identify the HTML tags

# Create an empty dataframe with columns Date, Open, High, Low, Close, Volume
tesla_revenue = pd.DataFrame(columns=["Date","Revenue"])

In [27]:
# Step 4: Use BeautifulSoup to extract data

# First we isolate the table
targeted_table = soup.find_all("tbody")[1]

# Then we loop through each row and find all the column values for each row
for row in targeted_table.find_all("tr"):
    col = row.find_all("td")
    date = col[0].text
    revenue = col[1].text

    # Finally we append the data of each row to the table
    tesla_revenue = pd.concat([tesla_revenue, pd.DataFrame({"Date":[date], "Revenue":[revenue]})], ignore_index=True)

In [49]:
tesla_revenue.tail()

Unnamed: 0,Date,Revenue
48,2010-09-30,31.0
49,2010-06-30,28.0
50,2010-03-31,21.0
52,2009-09-30,46.0
53,2009-06-30,27.0


In [29]:
# Remove null/empty strings in the Revenue column

tesla_revenue.dropna(inplace=True)
tesla_revenue = tesla_revenue[tesla_revenue['Revenue'] != ""]

In [30]:
# Remove the comma and $ from the Revenue column
tesla_revenue["Revenue"] = tesla_revenue['Revenue'].str.replace(',|\$',"", regex=True).astype(float)

In [31]:
# Display the last 5 rows of the tesla_revenue
tesla_revenue.tail()

Unnamed: 0,Date,Revenue
48,2010-09-30,31.0
49,2010-06-30,28.0
50,2010-03-31,21.0
52,2009-09-30,46.0
53,2009-06-30,27.0


In [32]:
# Plot tesla stock graph

import plotly.io as pio
pio.renderers.default = 'colab'

def make_graph(tesla_data, tesla_revenue, stock):
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                        subplot_titles=("Historical Share Price", "Historical Revenue"),
                        vertical_spacing=0.3)

    stock_data_specific = tesla_data[tesla_data.Date <= '2021-06-14']
    revenue_data_specific = tesla_revenue[tesla_revenue.Date <= '2021-04-30']

    fig.add_trace(go.Scatter(
        x=pd.to_datetime(stock_data_specific.Date, infer_datetime_format=True),
        y=stock_data_specific.Close.astype("float"),
        name="Share Price"), row=1, col=1)

    fig.add_trace(go.Scatter(
        x=pd.to_datetime(revenue_data_specific.Date, infer_datetime_format=True),
        y=revenue_data_specific.Revenue.astype("float"),
        name="Revenue"), row=2, col=1)

    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Price ($US)", row=1, col=1)
    fig.update_yaxes(title_text="Revenue ($US Millions)", row=2, col=1)

    fig.update_layout(showlegend=False, height=900, title=stock, xaxis_rangeslider_visible=True)
    fig.show()


In [33]:
make_graph(tesla_data, tesla_revenue, "Tesla")


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.



# GameStop Stock

In [47]:
# Use yfinance to extract stock data

GameStop = yf.Ticker("GME")

print(GameStop)

yfinance.Ticker object <GME>


In [22]:
# Retrieve historical stock data and set the period to 'max' to get data from the beginning until today

gme_data = GameStop.history(period="max")

In [23]:
# By default, the DataFrame returned by .history() uses Date as the index.
# Using reset_index(inplace=True) changes that index into a regular column named "Date".
gme_data.reset_index(inplace=True)

# Show the first 5 columns
print(gme_data.head())

                       Date      Open      High       Low     Close    Volume  \
0 2002-02-13 00:00:00-05:00  1.620128  1.693350  1.603296  1.691666  76216000   
1 2002-02-14 00:00:00-05:00  1.712708  1.716074  1.670626  1.683251  11021600   
2 2002-02-15 00:00:00-05:00  1.683251  1.687459  1.658002  1.674834   8389600   
3 2002-02-19 00:00:00-05:00  1.666418  1.666418  1.578047  1.607504   7410400   
4 2002-02-20 00:00:00-05:00  1.615920  1.662210  1.603296  1.662210   6892800   

   Dividends  Stock Splits  
0        0.0           0.0  
1        0.0           0.0  
2        0.0           0.0  
3        0.0           0.0  
4        0.0           0.0  


In [50]:
# Use webscraping to extract revenue data

# Step 1
import requests
url_game = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/stock.html"
response_game = requests.get(url_game)

html_data_2 = response_game.text

In [51]:
# Step 2: Parsing the data

soup_2 = BeautifulSoup(html_data_2,'html.parser')

In [52]:
# Step 3: Identify the HTML tags

# Create an empty dataframe with columns Date, Open, High, Low, Close, Volume
gme_revenue = pd.DataFrame(columns=["Date","Revenue"])

In [53]:
# Step 4: Use BeautifulSoup to extract data

# First we isolate the table
targeted_table = soup_2.find_all("tbody")[1]

# Then we loop through each row and find all the column values for each row
for row in targeted_table.find_all("tr"):
    col = row.find_all("td")
    date = col[0].text
    revenue = col[1].text

    # Finally we append the data of each row to the table
    gme_revenue = pd.concat([gme_revenue, pd.DataFrame({"Date":[date], "Revenue":[revenue]})], ignore_index=True)

In [54]:
gme_revenue.tail()

Unnamed: 0,Date,Revenue
57,2006-01-31,"$1,667"
58,2005-10-31,$534
59,2005-07-31,$416
60,2005-04-30,$475
61,2005-01-31,$709


In [43]:
# Remove null/empty strings in the Revenue column

gme_revenue.dropna(inplace=True)
gme_revenue = gme_revenue[gme_revenue['Revenue'] != ""]

In [44]:
# Remove the comma and $ from the Revenue column
gme_revenue["Revenue"] = gme_revenue['Revenue'].str.replace(',|\$',"", regex=True).astype(float)

In [45]:
make_graph(gme_data, gme_revenue, "GameStop")


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.

