In [19]:
# Mount Google Drive inside the Colab runtime; the stock prediction model is
# loaded from a path under this mount point later in the notebook.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# AIDI_1100_02 Group - B

## Ram Raghu Sankar 
## Shrijit Rajesh Pendse
## Varun Kadian
## Vrushabh Shah
## Oscar Alan Lozada Villa

## December 2, 2021

<p> This program scans and parses a news website, saving the articles published over a period of time. It tracks and stores the saved articles and searches them for content related to an industry of your choice. It also retrieves data from Yahoo! Finance for specific stock symbols that appear in the stored articles, and it includes functions that create visualisations of a selected stock's prices for the last 30 days. </p>

In [None]:
# BeautifulSoup Documentation - https://beautiful-soup-4.readthedocs.io/en/latest/
# BeautifulSoup @ pypi.org - https://pypi.org/project/beautifulsoup4/

In [2]:
# These packages are not available in Colab by default. Uncomment and run the lines below the first time you run this notebook.


# !pip install datefinder
# !pip install yfinance

In [1]:
import re
import warnings
from collections import deque
from datetime import datetime, timedelta

import datefinder
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
import tensorflow as tf
import yfinance as yf
from bs4 import BeautifulSoup
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [2]:
# Number of news releases requested from each daily listing page (currently 4).
# The value is sent as the `pagesize` query parameter when the listing URLs are built.
ARTICLES_TO_EXTRACT_PER_DAY = 4

## Scan/Parse

<p> Scan and parse the news releases published on the PR Newswire website (prnewswire.com) over the past two weeks. </p>

In [9]:
class Parser:
    """
    Scraper for PR Newswire news releases.

    Builds one listing-page URL per day for the past two weeks, collects the
    article links found on those pages, and downloads every article's body
    text and publication date into a pandas DataFrame.
    """

    # Site root; the hrefs found on listing pages are appended to it.
    BASE_URL = "https://www.prnewswire.com"
    # Days to go back from "now"; 13 days back plus today gives 14 daily links.
    DAYS_BACK = 13
    # Seconds to wait for the server before a request is abandoned, so that a
    # stalled connection cannot hang the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        pass

    def generate_links_for_date_wise_news_releases(self, articles_per_day=None) -> list:
        """
        Method to generate a list of links that fetches news release of past two weeks.

        :param articles_per_day: value of the ``pagesize`` query parameter, i.e. how many
            releases each daily listing page should show. Defaults to the notebook-level
            constant ARTICLES_TO_EXTRACT_PER_DAY when omitted.
        :return: list of links to get date wise news releases (one per day, oldest first).
        """

        # Resolve the default lazily so the notebook constant is only needed when used.
        if articles_per_day is None:
            articles_per_day = ARTICLES_TO_EXTRACT_PER_DAY

        # Get start and end date to fetch articles.
        current_time = datetime.now()
        time_two_weeks_back = current_time - timedelta(days=self.DAYS_BACK)
        print("\nParsing news from " + time_two_weeks_back.strftime("%m/%d/%Y %H:00") + " to "
              + current_time.strftime("%m/%d/%Y %H:00") + "\n")

        # Generate a list of links that shows news release of past two weeks.
        # Each generated timestamp keeps the current hour, which is passed on as `hour`.
        filtered_link_list = [
            f"{self.BASE_URL}/news-releases/news-releases-list/"
            f"?page=1&pagesize={articles_per_day}&month={single_date.month:02}"
            f"&day={single_date.day:02}&year={single_date.year:04}&hour={single_date.hour:02}"
            for single_date in pd.date_range(time_two_weeks_back, current_time)
        ]

        # Display links.
        for link in filtered_link_list:
            print(link)

        return filtered_link_list

    def generate_links_to_articles(self, filtered_news_links_list) -> list:
        """
        Method to generate a list of links to articles.

        :param filtered_news_links_list: list of links to news releases.
        :return articles_link_list: list of article links.
        :raises requests.exceptions.RequestException: if a listing page cannot be
            fetched (including when REQUEST_TIMEOUT is exceeded).
        """

        print("\n\nFetching links to articles from past two weeks..\n")

        # Looping through each date wise filtered news release links and fetching links to that days articles.
        articles_link_list = []
        for listing_url in tqdm(filtered_news_links_list):
            response = requests.get(listing_url, timeout=self.REQUEST_TIMEOUT)
            listing_page = BeautifulSoup(response.text, 'html.parser')
            # Links to articles will be in the "a" tag with class "newsreleaseconsolidatelink display-outline".
            anchors = listing_page.find_all("a", attrs={'class': 'newsreleaseconsolidatelink display-outline'})
            # Adding all article links to one list; anchors without an href are skipped.
            articles_link_list.extend(
                f"{self.BASE_URL}{anchor.attrs.get('href')}"
                for anchor in anchors
                if anchor.attrs.get("href", "")
            )
        print("\n\nFetching complete.\n\n")

        # Display all article links.
        for link in articles_link_list:
            print(link)

        print(f"\n\nNumber of articles: {len(articles_link_list)}")

        return articles_link_list

    def parse_data_from_web(self, articles_link_list) -> pd.DataFrame:
        """
        Method to loop through article links, parse article body, date and add to dataframe.

        :param articles_link_list: list of links to articles.
        :return: Parsed data in dataframe with the columns "url", "article_date" and
            "article_content". A missing body or date is stored as an empty string.
        :raises requests.exceptions.RequestException: if an article cannot be
            fetched (including when REQUEST_TIMEOUT is exceeded).
        """

        print("\n\nExtracting information from scrapped content..\n")

        # Rows are collected in a plain list and turned into a dataframe once at the
        # end: DataFrame.append no longer exists in pandas 2.x and copied the whole
        # frame on every call.
        records = []

        # Looping through article links and parsing, retrieving, storing information.
        for article_url in tqdm(articles_link_list):
            response = requests.get(article_url, timeout=self.REQUEST_TIMEOUT)
            article_soup = BeautifulSoup(response.text, 'html.parser')

            # Article body is present in "section tag" with class "release-body container";
            # the same class string with a trailing space is tried as a fallback.
            body_tag = article_soup.find("section", attrs={'class': 'release-body container'})
            if body_tag is None:
                body_tag = article_soup.find("section", attrs={'class': 'release-body container '})
            article_content = body_tag.text if body_tag is not None else ""

            # Fetch article date which is present in "meta" tag. A page without that
            # tag yields an empty date instead of raising AttributeError.
            date_tag = article_soup.find("meta", attrs={'name': 'date'})
            raw_date = date_tag.attrs.get("content", "") if date_tag is not None else ""
            matches = list(datefinder.find_dates(raw_date))
            article_date = str(matches[0]) if matches else ""

            records.append({
                "url": article_url,
                "article_date": article_date,
                "article_content": article_content})

        # Passing `columns` keeps the expected schema even when no article was parsed.
        data = pd.DataFrame(records, columns=["url", "article_date", "article_content"])

        print("\n\nExtraction complete. All information added to dataframe.")

        return data

## Track/Store/Search

<p> Keep track of the news by storing the parsed news (CSV, DF, DB, XLSX). For all the parsed news, search the content to find at least 2 or 3 stock symbols in a specific industry. </p>

In [4]:
class Tracker:
    """
    Keeps the scraped articles, saves them to disk and extracts stock tickers
    mentioned in the article text.
    """

    def __init__(self, data):
        self.data = data

    def store_data_as_excel(self) -> None:
        """
        Write the held dataframe to an .xlsx file in the working directory.

        The file name starts with the current timestamp (YYYYMMDD-HHMMSS).
        :return None:
        """

        timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
        self.data.to_excel(f"{timestamp}_scrapped_data.xlsx", index=False)

    def preprocess_data(self) -> None:
        """
        Clean the held dataframe before tickers are extracted.

        Rows whose "article_content" repeats an earlier row are removed from the
        dataframe itself (in place) and the index is renumbered.
        :return:
        """

        dedup_options = {"subset": "article_content", "ignore_index": True, "inplace": True}
        self.data.drop_duplicates(**dedup_options)

    def fetch_tickers(self) -> set:
        """
        Collect the tickers mentioned in every article's content.

        A ticker is 1 to 5 capital letters that follow a colon and one whitespace
        character and are directly followed by a closing parenthesis.
        :return: Set of tickers.
        """

        found = set()
        for _, row in self.data.iterrows():
            hits = re.findall(r':\s[A-Z]{1,5}[)]', row["article_content"])
            # Each hit looks like ": ABC)"; strip the two leading characters and the ")".
            found.update(hit[2:-1] for hit in hits)
        return found

## Retrieve Data Web(API)

<p> Scan Yahoo! Finance page for the stock symbols selected. Retrieve the stock price and volume for the last 30 days. </p>

In [5]:
class Retriever:
    """
    Downloads price history for a collection of tickers through yfinance.
    """

    def __init__(self, tickers):
        self.tickers = tickers

    def retrieve(self) -> dict:
        """
        Fetch the history of every held ticker from Yahoo Finance.

        Each ticker is queried with ``period="YTD"``.
        :return: dict mapping each ticker to whatever ``history`` returned for it.
        """
        return {
            symbol: yf.Ticker(symbol).history(period="YTD")
            for symbol in self.tickers
        }

## Visualization

<p> Prepare colorful visualizations (graphs) of the stock prices for the last 30 days. They should be time-series plots of: </p>

1.   Volume
2.   Daily Close Price



In [6]:
class Visualizer:
    """
    Draws charts from the per-ticker price history held in ``stocks``.
    """

    def __init__(self, stocks):
        self.stocks = stocks

    def generate_candle_stick_visualization(self, ticker, increasing_line, decreasing_line):
        """
        Show a 1000x800 plotly candlestick chart for one ticker.

        :param ticker: key into ``self.stocks``.
        :param increasing_line: line colour passed to plotly for increasing candles.
        :param decreasing_line: line colour passed to plotly for decreasing candles.
        """
        history = self.stocks[ticker]
        candles = go.Candlestick(
            x=history.index,
            open=history['Open'],
            high=history['High'],
            low=history['Low'],
            close=history['Close'],
            increasing_line_color=increasing_line,
            decreasing_line_color=decreasing_line,
        )
        figure = go.Figure(data=[candles])
        figure.update_layout(autosize=False, width=1000, height=800)
        figure.show()

    def plot_tickers(self, ticker):
        """
        Plot the "Close" and "Volume" columns of one ticker side by side.

        :param ticker: key into ``self.stocks``.
        """

        def _style_ticks_of_current_axes():
            # Acts on whichever axes pyplot currently considers active.
            plt.xticks(rotation=45)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)

        plt.figure(figsize=(30, 21))

        # Two cells of a 2x2 grid are used: close on the left, volume on the right.
        close_axes = plt.subplot(2, 2, 1)
        _style_ticks_of_current_axes()
        volume_axes = plt.subplot(2, 2, 2)
        _style_ticks_of_current_axes()
        # Match the size of the axis offset text to the tick labels.
        volume_axes.yaxis.offsetText.set_fontsize(20)

        history = self.stocks[ticker]
        close_axes.plot(history["Close"])
        volume_axes.plot(history["Volume"])

## Recommending Stock's Purchase

<p> Is the stock worth purchasing or not? </p>

In [24]:
class StockRecommender:
    """
    Buy / not-buy recommender driven by a saved Keras model.

    The closing prices of a ticker are turned into day-over-day percentage
    changes, cut into overlapping windows of WINDOW_SIZE consecutive values and
    passed to the model; the prediction for the last (most recent) window
    decides which recommendation is printed.
    """

    # Number of consecutive percentage changes in one model input sequence.
    WINDOW_SIZE = 100

    def __init__(self, stocks, model_path="/content/drive/MyDrive/LSTM_model"):
        """
        :param stocks: dict mapping a ticker to a dataframe that has a "Close" column.
        :param model_path: location of the saved model; defaults to the path on the
            mounted Google Drive that the notebook uses.
        """
        self.stocks = stocks

        # Loading stock recommendation model.
        self.model = tf.keras.models.load_model(model_path)

    def model_predict(self, X):
        """
        Print the recommendation for the last sequence in ``X``.

        :param X: model input; only the prediction for its last entry is used.
        :return: None. "Not Buy." is printed when the highest score of that
            prediction is at index 0, "Buy" otherwise.
        """
        if np.argmax(self.model.predict(X)[-1]) == 0:
            print("Not Buy.")
        else:
            print("Buy")

    def preprocess(self, ticker):
        """
        Build the model input for ``ticker`` and print the recommendation.

        :param ticker: key into ``self.stocks``.
        :return: None. When fewer than WINDOW_SIZE percentage changes are available
            (for example because no price data was retrieved for the ticker) a
            message is printed and the model is not called.
        """
        returns = self.stocks[ticker]["Close"].pct_change().dropna().to_numpy()

        # One window per position at which WINDOW_SIZE consecutive values exist.
        window_count = len(returns) - self.WINDOW_SIZE + 1
        if window_count < 1:
            print(f"Not enough price history for {ticker} to make a recommendation "
                  f"({len(returns)} daily changes available, {self.WINDOW_SIZE} required).")
            return None

        # Build all windows once, after the data is known to be sufficient.
        windows = [returns[start:start + self.WINDOW_SIZE] for start in range(window_count)]
        # Shape (number of windows, WINDOW_SIZE, 1): one feature per time step.
        X = np.array(windows).reshape(-1, self.WINDOW_SIZE, 1)
        return self.model_predict(X)

## Main

<p> Run all functions </p>

In [8]:
# Web scraping.
# Three steps: build the per-day listing links, collect the article links found
# on those listing pages, then download and parse every article into a dataframe.
parser_obj = Parser()

date_wise_news_releases_links = parser_obj.generate_links_for_date_wise_news_releases()
articles_links = parser_obj.generate_links_to_articles(date_wise_news_releases_links)
data = parser_obj.parse_data_from_web(articles_links)

# Last expression of the cell: the notebook renders the dataframe.
data


Parsing news from 11/19/2021 04:00 to 12/02/2021 04:00

https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=19&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=20&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=21&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=22&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=23&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=24&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=25&year=2021&hour=04
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=26&year=2021&hour=04
https://www.prnewswire.com/news-release

100%|██████████| 14/14 [00:12<00:00,  1.15it/s]




Fetching complete.


https://www.prnewswire.com/news-releases/galvanized-steel-tubes-market-in-india-to-grow-at-a-cagr-of-above-6--apl-apollo-and-bhushan-power--steel-among-key-vendors--technavio-301428043.html
https://www.prnewswire.com/news-releases/zhangmen-education-inc-announces-change-of-auditor-301428908.html
https://www.prnewswire.com/news-releases/jamie-kay-helps-parents-navigate-the-black-friday-sales-with-pre-styled-looks-and-a-sneak-peek-for-vips-301421410.html
https://www.prnewswire.com/news-releases/the-first-phase-of-heco-demo-day-has-begun-301428921.html
https://www.prnewswire.com/news-releases/shareholder-investigation-halper-sadeh-llp-investigates-cor-drna-stxb-bcml-shareholders-are-encouraged-to-contact-the-firm-301429494.html
https://www.prnewswire.com/news-releases/shareholder-investigation-halper-sadeh-llp-investigates-ftsi-rrd-mnr-levl-shareholders-are-encouraged-to-contact-the-firm-301429493.html
https://www.prnewswire.com/news-releases/aspentech-investor-aler

100%|██████████| 56/56 [00:32<00:00,  1.71it/s]


Extraction complete. All information added to dataframe.





Unnamed: 0,url,article_date,article_content
0,https://www.prnewswire.com/news-releases/galva...,2021-11-19 04:00:00-05:00,"\n\n\nNEW YORK, Nov. 19, 2021 /PRNewswire/ -- ..."
1,https://www.prnewswire.com/news-releases/zhang...,2021-11-19 04:00:00-05:00,"\n\n\nSHANGHAI, Nov. 19, 2021 /PRNewswire/ -- ..."
2,https://www.prnewswire.com/news-releases/jamie...,2021-11-19 04:00:00-05:00,"\n\n\nCHRISTCHURCH, New Zealand, Nov. 19, 2021..."
3,https://www.prnewswire.com/news-releases/the-f...,2021-11-19 03:54:00-05:00,"\n\n\nLONDON, Nov. 19, 2021 /PRNewswire/ -- HE..."
4,https://www.prnewswire.com/news-releases/share...,2021-11-20 03:05:00-05:00,"\n\n\nNEW YORK, Nov. 20, 2021 /PRNewswire/ --H..."
5,https://www.prnewswire.com/news-releases/share...,2021-11-20 03:00:00-05:00,"\n\n\nNEW YORK, Nov. 20, 2021 /PRNewswire/ -- ..."
6,https://www.prnewswire.com/news-releases/aspen...,2021-11-20 02:14:00-05:00,"\n\n\nNEW ORLEANS, Nov. 20, 2021 /PRNewswire/ ..."
7,https://www.prnewswire.com/news-releases/fts-i...,2021-11-20 02:08:00-05:00,"\n\n\nNEW ORLEANS, Nov. 20, 2021 /PRNewswire/ ..."
8,https://www.prnewswire.com/news-releases/hisen...,2021-11-21 03:00:00-05:00,"\n\n\nQINGDAO, Chine, 21 novembre 2021 /PRNews..."
9,https://www.prnewswire.com/news-releases/hisen...,2021-11-21 03:00:00-05:00,"\n\n\nQINGDAO, China, 21. November 2021 /PRNew..."


In [11]:
# Tracking: storing data and generating tickers.
tracker_obj = Tracker(data)

# The data is written to Excel first, so the saved file still contains any
# duplicate articles that the preprocessing step removes afterwards.
tracker_obj.store_data_as_excel()
tracker_obj.preprocess_data()
tickers = tracker_obj.fetch_tickers()

# Last expression of the cell: the notebook shows the set of tickers found.
tickers

{'AKABY',
 'AKBLF',
 'AZPN',
 'BCML',
 'COR',
 'DC',
 'DRNA',
 'EMR',
 'FTSI',
 'LALAB',
 'LEVL',
 'MNR',
 'PEGA',
 'PLTK',
 'RRD',
 'SPGI',
 'STXB',
 'ZME'}

In [15]:
# Retrieving stock info.
retriever_obj = Retriever(tickers)

# `stocks` maps every ticker to the history returned for it.
stocks = retriever_obj.retrieve()

print("\nStock Information:\n", stocks)

- LALAB: No data found, symbol may be delisted
- DC: No data found for this date range, symbol may be delisted

Stock Information:
 {'AKABY':                   Open        High         Low  ...  Volume  Dividends  Stock Splits
Date                                            ...                                 
2021-01-04   65.389999   65.389999   65.389999  ...       0          0             0
2021-01-05   65.389999   65.389999   65.389999  ...       0          0             0
2021-01-06   65.389999   65.389999   65.389999  ...       0          0             0
2021-01-07   65.389999   65.389999   65.389999  ...       0          0             0
2021-01-08   65.389999   65.389999   65.389999  ...       0          0             0
...                ...         ...         ...  ...     ...        ...           ...
2021-11-24  101.250000  101.250000  101.250000  ...       0          0             0
2021-11-26   92.699997   92.699997   92.699997  ...     200          0             0
2021-11-

In [16]:
# Visualization

visualizer_obj = Visualizer(stocks)

# Colour choices offered in the dropdowns for increasing / decreasing candles.
i_cl = ["gold", "green", "cyan"]
d_cl = ["gray", "red", "black"]

# Each interact call renders dropdowns and re-runs the plotting method whenever a
# selection changes. The trailing semicolon keeps the notebook from printing the
# repr of the function object that interact returns.
widgets.interact(visualizer_obj.generate_candle_stick_visualization, ticker=stocks.keys(), increasing_line=i_cl, decreasing_line=d_cl)
widgets.interact(visualizer_obj.plot_tickers, ticker=stocks.keys());

interactive(children=(Dropdown(description='ticker', options=('AKABY', 'RRD', 'COR', 'DRNA', 'SPGI', 'LEVL', '…

interactive(children=(Dropdown(description='ticker', options=('AKABY', 'RRD', 'COR', 'DRNA', 'SPGI', 'LEVL', '…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>>

In [23]:
# Stock recommendation
# Creating the recommender loads the saved model.
stock_recommender_obj = StockRecommender(stocks)

# A dropdown selects the ticker; the recommendation is printed for the selection.
# The trailing semicolon keeps the notebook from printing the repr of the
# function object that interact returns.
widgets.interact(stock_recommender_obj.preprocess, ticker=stocks.keys());



interactive(children=(Dropdown(description='ticker', options=('AKABY', 'RRD', 'COR', 'DRNA', 'SPGI', 'LEVL', '…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>>