In [None]:
# BeautifulSoup Documentation - https://beautiful-soup-4.readthedocs.io/en/latest/
# BeautifulSoup @ pypi.org - https://pypi.org/project/beautifulsoup4/

In [None]:
# !pip install datefinder
# !pip install yfinance

In [1]:
import re
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
import requests
import datefinder
from bs4 import BeautifulSoup
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import plotly.graph_objects as go
import ipywidgets as widgets
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'datefinder'

In [2]:
# Initialize constants.

# Number of press releases requested per daily listing page; the value is
# interpolated into the `pagesize` query parameter when the listing URLs are built.
ARTICLES_TO_EXTRACT_PER_DAY = 4

In [46]:
# Fetch window: the current moment and the 13 days before it.
current_time = datetime.now()
time_two_weeks_back = current_time - timedelta(days=13)

window_start_label = time_two_weeks_back.strftime("%m/%d/%Y %H:00")
window_end_label = current_time.strftime("%m/%d/%Y %H:00")
print(f"\nParsing data from {window_start_label} to {window_end_label}\n")

# One listing URL per day in the window. Each URL asks for page 1 of that day's
# releases, capped at ARTICLES_TO_EXTRACT_PER_DAY entries.
filtered_link_list = [
    "https://www.prnewswire.com/news-releases/news-releases-list/"
    f"?page=1&pagesize={ARTICLES_TO_EXTRACT_PER_DAY}&month={day.month:02}"
    f"&day={day.day:02}&year={day.year:04}&hour={day.hour:02}"
    for day in pd.date_range(time_two_weeks_back, current_time)
]

# Echo the generated URLs so the request targets can be checked by eye.
for listing_url in filtered_link_list:
    print(listing_url)

100%|██████████| 14/14 [01:27<00:00,  6.22s/it]


In [48]:
# Generate a list of required blog links.
#
# Each daily listing page is downloaded and every anchor carrying the
# news-release link class is turned into an absolute article URL.

print("Fetching links to articles of past two weeks..\n")
blogs_link_list = []
for listing_url in tqdm(filtered_link_list):
    try:
        # A timeout keeps a single stalled connection from hanging the whole run.
        response = requests.get(listing_url, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them as listings.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure (without breaking the progress bar) and carry on
        # with the remaining days rather than aborting the scrape.
        tqdm.write(f"Skipping {listing_url}: {exc}")
        continue

    home_page = BeautifulSoup(response.text, 'html.parser')
    news_release_list = home_page.find_all("a", attrs={'class': 'newsreleaseconsolidatelink display-outline'})
    # Generate links to each article; anchors without an href are ignored.
    blogs_link_list.extend(
        f"https://www.prnewswire.com{anchor.attrs.get('href')}"
        for anchor in news_release_list
        if anchor.attrs.get("href", "")
    )
print("\nFetching complete.\n\n")

for itr in blogs_link_list:
    print(itr)

print(f"\n\nNumber of articles: {len(blogs_link_list)}")

100%|██████████| 204/204 [02:12<00:00,  1.53it/s]
100%|██████████| 204/204 [02:05<00:00,  1.63it/s]
100%|██████████| 204/204 [02:08<00:00,  1.59it/s]
100%|██████████| 204/204 [01:44<00:00,  1.94it/s]
100%|██████████| 204/204 [01:27<00:00,  2.32it/s]
100%|██████████| 204/204 [01:22<00:00,  2.46it/s]
100%|██████████| 204/204 [01:42<00:00,  1.99it/s]
100%|██████████| 204/204 [01:23<00:00,  2.44it/s]
100%|██████████| 204/204 [01:18<00:00,  2.59it/s]
100%|██████████| 204/204 [01:17<00:00,  2.62it/s]
100%|██████████| 204/204 [01:18<00:00,  2.59it/s]
100%|██████████| 204/204 [01:24<00:00,  2.40it/s]
100%|██████████| 204/204 [01:21<00:00,  2.51it/s]
100%|██████████| 204/204 [01:32<00:00,  2.20it/s]


In [49]:
# Loop through article links, parse article body, date and add to dataframe.

print("Extracting data from scrapped content..\n\n")

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every call.
article_records = []

for article_url in blogs_link_list:
    try:
        # A timeout keeps a single stalled connection from hanging the whole run.
        blog = requests.get(article_url, timeout=30)
        blog.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip articles that cannot be downloaded.
        print(f"Skipping {article_url}: {exc}")
        continue

    blog_soup = BeautifulSoup(blog.text, 'html.parser')

    # Look up the body section by class string, first without and then with a
    # trailing space; fall back to an empty body when neither is present.
    blog_body = blog_soup.find("section", attrs={'class': 'release-body container'})
    if not blog_body:
        blog_body = blog_soup.find("section", attrs={'class': 'release-body container '})
    blog_body = blog_body.text if blog_body else ""

    # Fetch article date. The <meta name="date"> tag (or its content attribute)
    # may be absent, in which case the date is recorded as an empty string.
    date_meta = blog_soup.find("meta", attrs={'name': 'date'})
    raw_date = date_meta.attrs.get("content") if date_meta else None
    matches = list(datefinder.find_dates(raw_date)) if raw_date else []
    blog_date = str(matches[0]) if matches else ""

    article_records.append({
        "url": article_url,
        "article_date": blog_date,
        "article_content": blog_body})

# Passing `columns` keeps the expected schema even when no article was fetched.
data = pd.DataFrame(article_records, columns=["url", "article_date", "article_content"])



100%|██████████| 2673/2673 [00:07<00:00, 346.91it/s]


In [123]:
# Preview the first five rows of the scraped articles.
data.head(n=5)

Unnamed: 0,Article_Title,Article_id,Keyword,Article_Text,Publish Date,Url
0,Internet of Things News and Press Releases fro...,<newspaper.article.Article object at 0x0000021...,"[press, subject, sources, request, submit, pro...","Journalists and Bloggers\n\nThe news you need,...",,https://www.prnewswire.com/news-releases/consu...
1,All Heavy Industry & Manufacturing News and Pr...,<newspaper.article.Article object at 0x0000021...,"[press, manufacturing, subject, sources, reque...","Journalists and Bloggers\n\nThe news you need,...",,https://www.prnewswire.com/news-releases/heavy...
2,"All Machine Tools, Metalworking and Metallurgy...",<newspaper.article.Article object at 0x0000021...,"[press, subject, sources, request, machine, su...","Journalists and Bloggers\n\nThe news you need,...",,https://www.prnewswire.com/news-releases/heavy...
3,All Consumer Products & Retail News and Press ...,<newspaper.article.Article object at 0x0000021...,"[press, retail, subject, sources, request, sub...","Journalists and Bloggers\n\nThe news you need,...",,https://www.prnewswire.com/news-releases/consu...
4,All Amusement Parks and Tourist Attractions Ne...,<newspaper.article.Article object at 0x0000021...,"[press, subject, sources, request, parks, subm...","Journalists and Bloggers\n\nThe news you need,...",,https://www.prnewswire.com/news-releases/trave...
...,...,...,...,...,...,...
2668,What's the new normal for PR pros and press re...,<newspaper.article.Article object at 0x0000021...,"[press, release, cisions, normal, pr, releases...",What's the new normal for PR pros and press re...,2021-09-13 00:00:00,https://prnewswire.mediaroom.com/2021-09-13-Wh...
2669,"PR Newswire, Cision Communications Cloud rank ...",<newspaper.article.Article object at 0x0000021...,"[g2, ranked, reviews, pr, cloud, best, accordi...","PR Newswire, Cision Communications Cloud rank ...",2021-07-07 00:00:00,https://prnewswire.mediaroom.com/2021-07-07-PR...
2670,"PR Newswire, Cision Communications Cloud rank ...",<newspaper.article.Article object at 0x0000021...,"[g2, ranked, reviews, pr, cloud, best, accordi...","PR Newswire, Cision Communications Cloud rank ...",2021-07-07 00:00:00,https://prnewswire.mediaroom.com/2021-07-07-PR...
2671,Cision Announces Premium Sponsorship of the NI...,<newspaper.article.Article object at 0x0000021...,"[virtual, social, technology, solutions, commu...",Cision Announces Premium Sponsorship of the NI...,2021-06-21 00:00:00,https://prnewswire.mediaroom.com/2021-06-21-Ci...


In [124]:
# Drop rows whose article text repeats an earlier row, keeping the first
# occurrence and renumbering the index. The result is rebound to `data` rather
# than using `inplace=True`, so the cell does not mutate a frame that other
# cells may still reference.
data = data.drop_duplicates(subset="article_content", ignore_index=True)

In [285]:
# Collect candidate ticker symbols from the article text. The pattern matches a
# colon, one whitespace character, 1-5 capital letters and a closing
# parenthesis; slicing off the two leading characters and the trailing ")"
# leaves just the symbol.
tickers = {
    match[2:-1]
    for article_text in data["article_content"]
    for match in re.findall(r':\s[A-Z]{1,5}[)]', article_text)
}

In [297]:
# Download year-to-date price history for every candidate ticker.
# Symbols are visited in sorted order so the resulting mapping (and the ticker
# drop-downs built from it) has a stable order between runs.
stocks = {}
for tick in sorted(tickers):
    history = yf.Ticker(tick).history(period = "YTD")
    # The ticker regex also matches strings that are not real symbols; for those
    # an empty frame comes back. Leave them out so the plotting widgets only
    # offer tickers that actually have data.
    if history.empty:
        continue
    stocks[tick] = history

- RI: No data found for this date range, symbol may be delisted


In [None]:
def candlestick(ticker, increasing_line, decreasing_line):
    """Display a candlestick chart for one ticker.

    Args:
        ticker: Key into the notebook-level ``stocks`` mapping.
        increasing_line: Value passed to plotly as ``increasing_line_color``.
        decreasing_line: Value passed to plotly as ``decreasing_line_color``.
    """
    history = stocks[ticker]
    candles = go.Candlestick(
        x=history.index,
        open=history['Open'],
        high=history['High'],
        low=history['Low'],
        close=history['Close'],
        increasing_line_color=increasing_line,
        decreasing_line_color=decreasing_line,
    )
    fig = go.Figure(data=[candles])
    # Fixed canvas size; autosizing is switched off so width/height are honoured.
    fig.update_layout(autosize=False, width=1000, height=800)
    fig.show()
        
def plot_tick(ticker):
    """Plot the "Close" and "Volume" columns of one ticker side by side.

    Args:
        ticker: Key into the notebook-level ``stocks`` mapping.
    """
    def _style_current_axes_ticks():
        # Acts on whichever axes pyplot currently treats as active, so it must
        # be called right after the corresponding plt.subplot(...).
        plt.xticks(rotation=45, fontsize=20)
        plt.yticks(fontsize=20)

    plt.figure(figsize=(30, 21))

    close_ax = plt.subplot(2, 2, 1)
    _style_current_axes_ticks()

    volume_ax = plt.subplot(2, 2, 2)
    _style_current_axes_ticks()
    # Match the y-axis offset text to the enlarged tick labels.
    volume_ax.yaxis.offsetText.set_fontsize(20)

    history = stocks[ticker]
    close_ax.plot(history["Close"])
    volume_ax.plot(history["Volume"])

In [None]:
# Interactive candlestick chart: a ticker drop-down plus two colour pickers.
# `i_cl` and `d_cl` are not defined by any earlier cell, so without the two
# assignments below this cell raises NameError on a fresh kernel.
# NOTE(review): colour pickers defaulting to green/red are an assumption about
# the originally intended controls - confirm.
i_cl = widgets.ColorPicker(description="Increasing", value="green")
d_cl = widgets.ColorPicker(description="Decreasing", value="red")
widgets.interact(candlestick, ticker = stocks.keys(),increasing_line = i_cl,decreasing_line = d_cl)

In [None]:
# Interactive close/volume plots, with the ticker chosen from the keys of `stocks`.
widgets.interact(
    plot_tick,
    ticker=stocks.keys(),
)

In [3]:
# Write the scraped articles to an Excel workbook, omitting the index column.
data.to_excel(excel_writer="Scraping.xlsx", index=False)

NameError: name 'data' is not defined