In [None]:
# BeautifulSoup Documentation - https://beautiful-soup-4.readthedocs.io/en/latest/
# BeautifulSoup @ pypi.org - https://pypi.org/project/beautifulsoup4/

In [None]:
# !pip install datefinder
# !pip install yfinance

In [1]:
import re
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
import requests
import datefinder
from bs4 import BeautifulSoup
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import plotly.graph_objects as go
import ipywidgets as widgets
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Initialize constants.

ARTICLES_TO_EXTRACT_PER_DAY = 4

In [3]:
# Get start and end date to fetch articles.
current_time = datetime.now()
time_two_weeks_back = current_time - timedelta(days=13)
print(f"\nParsing data from " + time_two_weeks_back.strftime("%m/%d/%Y %H:00") + " to " + current_time.strftime("%m/%d/%Y %H:00") + "\n")

# Generate a list of links that fetches news release of past two weeks.
filtered_link_list = []
for single_date in pd.date_range(time_two_weeks_back, current_time):
    filtered_link_list.append(f"https://www.prnewswire.com/news-releases/news-releases-list/"
                              f"?page=1&pagesize={ARTICLES_TO_EXTRACT_PER_DAY}&month={single_date.month:02}"
                              f"&day={single_date.day:02}&year={single_date.year:04}&hour={single_date.hour:02}")

for itr in filtered_link_list:
    print(itr)


Parsing data from 11/17/2021 20:00 to 11/30/2021 20:00

https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=17&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=18&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=19&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=20&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=21&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=22&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=23&year=2021&hour=20
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=24&year=2021&hour=20
https://www.prnewswire.com/news-release

In [4]:
# Generate a list of required blog links.

print("Fetching links to articles of past two weeks..\n")
blogs_link_list = []
for i in tqdm(filtered_link_list):
    response = requests.get(i)
    home_page = BeautifulSoup(response.text, 'html.parser')
    news_release_list = list(home_page.find_all("a", attrs={'class': 'newsreleaseconsolidatelink display-outline'}))
    # Generate links to each articles.
    blogs_link_list.extend([f"https://www.prnewswire.com{i.attrs.get('href')}" for i in news_release_list if i.attrs.get("href","")])
print("\nFetching complete.\n\n")

for itr in blogs_link_list:
    print(itr)

print(f"\n\nNumber of articles: {len(blogs_link_list)}")

  0%|          | 0/14 [00:00<?, ?it/s]

Fetching links to articles of past two weeks..



100%|██████████| 14/14 [00:12<00:00,  1.15it/s]


Fetching complete.


https://www.prnewswire.com/news-releases/tripcom-group-and-wyndham-hotels--resorts-sign-strategic-global-agreement-301427392.html
https://www.prnewswire.com/news-releases/southwest-gas-holdings-declares-first-quarter-2022-dividend-301427399.html
https://www.prnewswire.com/news-releases/system-integration-services-market-to-record-6-25-cagr-by-2025--accenture-plc-and-capgemini-se-among-key-vendors--technavio-301425724.html
https://www.prnewswire.com/news-releases/quattro-development-announces-settlement-of-lawsuit-with-mattress-firm-301427419.html
https://www.prnewswire.com/news-releases/dairy-free-market-size-to-grow-by-usd-11-12-bn-from-2020-to-2025--blue-diamond-growers-and-campbell-soup-co-among-key-market-contributors--technavio-301426859.html
https://www.prnewswire.com/news-releases/tennecos-cvsae-suspension-technology-to-be-featured-on-new-hybrid-suv-model-for-china-301428478.html
https://www.prnewswire.com/news-releases/glen-gery-celebrates-launch-of-its-20




In [5]:
# Loop through article links, parse article body, date and add to dataframe.

print("Extracting data from scrapped content..\n\n")

data = pd.DataFrame(columns=["url", "article_date", "article_content"])

for i in blogs_link_list:
    blog = requests.get(i)
    blog_soup = BeautifulSoup(blog.text, 'html.parser')
    blog_body = blog_soup.find("section", attrs={'class': 'release-body container'})
    if not blog_body:
        blog_body = blog_soup.find("section", attrs={'class': 'release-body container '})
    blog_body = blog_body.text if blog_body else ""

    # Fetch article date.
    blog_date = blog_soup.find("meta", attrs={'name': 'date'}).attrs.get("content")
    matches = list(datefinder.find_dates(blog_date))
    blog_date = str(matches[0]) if matches else ""

    # Append to dataframe.
    data = data.append({
        "url": i,
        "article_date": blog_date,
        "article_content": blog_body},
        ignore_index=True)



Extracting data from scrapped content..




In [6]:
data.head()

Unnamed: 0,url,article_date,article_content
0,https://www.prnewswire.com/news-releases/tripc...,2021-11-17 20:00:00-05:00,"\n\n\nSHANGHAI, Nov. 17, 2021 /PRNewswire/ -- ..."
1,https://www.prnewswire.com/news-releases/south...,2021-11-17 20:00:00-05:00,"\n\n\nLAS VEGAS, Nov. 17, 2021 /PRNewswire/ --..."
2,https://www.prnewswire.com/news-releases/syste...,2021-11-17 20:00:00-05:00,"\n\n\nNEW YORK, Nov. 17, 2021 /PRNewswire/ -- ..."
3,https://www.prnewswire.com/news-releases/quatt...,2021-11-17 20:00:00-05:00,"\n\n\nOAK BROOK, Ill., Nov. 17, 2021 /PRNewswi..."
4,https://www.prnewswire.com/news-releases/dairy...,2021-11-18 20:00:00-05:00,"\n\n\nNEW YORK, Nov. 18, 2021 /PRNewswire/ -- ..."


In [7]:
#Dropping duplicates if any
data.drop_duplicates(subset = "article_content",inplace = True,ignore_index = True)

In [8]:
tickers = set()
for i in range(len(data)):
    temp = re.findall(r':\s[A-Z]{1,5}[)]',data.iloc[i]["article_content"])
    for tick in temp: 
        tickers.add(tick[-(len(tick) - 2):-1])

In [9]:
stocks = {}
for tick in tickers:
    stocks[tick] = yf.Ticker(tick).history(period = "YTD")

- KHRN: No data found, symbol may be delisted
- DCRB: No data found, symbol may be delisted


In [12]:
def candlestick(ticker,increasing_line,decreasing_line):
        fig = go.Figure(data=[go.Candlestick(x=stocks[ticker].index,
                open=stocks[ticker]['Open'],
                high=stocks[ticker]['High'],
                low=stocks[ticker]['Low'],
                close=stocks[ticker]['Close'],increasing_line_color= increasing_line, decreasing_line_color= decreasing_line)])
        fig.update_layout(autosize=False,
        width=1000,
        height=800,)
        fig.show()
        
def plot_tick(ticker):
    fig = plt.figure(figsize=(30,21))
    ax1 = plt.subplot(2, 2, 1)
    plt.xticks(rotation=45)
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    ax2 = plt.subplot(2, 2, 2)
    plt.xticks(rotation=45)
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    ax2.yaxis.offsetText.set_fontsize(20)
    ax1.plot(stocks[ticker]["Close"])
    ax2.plot(stocks[ticker]["Volume"])
i_cl = ["gold","green","cyan"]
d_cl = ["gray","red","black"]

In [13]:
widgets.interact(candlestick, ticker = stocks.keys(),increasing_line = i_cl,decreasing_line = d_cl)

interactive(children=(Dropdown(description='ticker', options=('DM', 'HYZN', 'EVGDF', 'FB', 'HEPS', 'ZME', 'MAR…

<function __main__.candlestick(ticker, increasing_line, decreasing_line)>

In [14]:
widgets.interact(plot_tick, ticker = stocks.keys())

interactive(children=(Dropdown(description='ticker', options=('DM', 'HYZN', 'EVGDF', 'FB', 'HEPS', 'ZME', 'MAR…

<function __main__.plot_tick(ticker)>

In [15]:
data.to_excel("Scraping.xlsx",index = False)