In [None]:
# BeautifulSoup Documentation - https://beautiful-soup-4.readthedocs.io/en/latest/
# BeautifulSoup @ pypi.org - https://pypi.org/project/beautifulsoup4/

In [None]:
# !pip install datefinder
# !pip install yfinance

In [70]:
import re
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import requests
from sklearn import preprocessing
import datefinder
from bs4 import BeautifulSoup
from collections import deque
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import plotly.graph_objects as go
import ipywidgets as widgets
import warnings

warnings.filterwarnings("ignore")
model = tf.keras.models.load_model("LSTM_stock")

In [2]:
# Initialize constants.

ARTICLES_TO_EXTRACT_PER_DAY = 4

In [3]:
# Get start and end date to fetch articles.
current_time = datetime.now()
time_two_weeks_back = current_time - timedelta(days=13)
print(f"\nParsing data from " + time_two_weeks_back.strftime("%m/%d/%Y %H:00") + " to " + current_time.strftime("%m/%d/%Y %H:00") + "\n")

# Generate a list of links that fetches news release of past two weeks.
filtered_link_list = []
for single_date in pd.date_range(time_two_weeks_back, current_time):
    filtered_link_list.append(f"https://www.prnewswire.com/news-releases/news-releases-list/"
                              f"?page=1&pagesize={ARTICLES_TO_EXTRACT_PER_DAY}&month={single_date.month:02}"
                              f"&day={single_date.day:02}&year={single_date.year:04}&hour={single_date.hour:02}")

for itr in filtered_link_list:
    print(itr)


Parsing data from 11/18/2021 03:00 to 12/01/2021 03:00

https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=18&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=19&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=20&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=21&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=22&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=23&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=24&year=2021&hour=03
https://www.prnewswire.com/news-releases/news-releases-list/?page=1&pagesize=4&month=11&day=25&year=2021&hour=03
https://www.prnewswire.com/news-release

In [4]:
# Generate a list of required blog links.

print("Fetching links to articles of past two weeks..\n")
blogs_link_list = []
for i in tqdm(filtered_link_list):
    response = requests.get(i)
    home_page = BeautifulSoup(response.text, 'html.parser')
    news_release_list = list(home_page.find_all("a", attrs={'class': 'newsreleaseconsolidatelink display-outline'}))
    # Generate links to each articles.
    blogs_link_list.extend([f"https://www.prnewswire.com{i.attrs.get('href')}" for i in news_release_list if i.attrs.get("href","")])
print("\nFetching complete.\n\n")

for itr in blogs_link_list:
    print(itr)

print(f"\n\nNumber of articles: {len(blogs_link_list)}")

  0%|          | 0/14 [00:00<?, ?it/s]

Fetching links to articles of past two weeks..



100%|██████████| 14/14 [00:19<00:00,  1.40s/it]


Fetching complete.


https://www.prnewswire.com/news-releases/doosan-robotics-remporte-le-prix-de-l-innovation-ces-r-2022-pour-son-systeme-de-robot-camera-882142369.html
https://www.prnewswire.com/news-releases/common-criteria-eal4-zertifizierung-fur-sdot-security-gateway-cross-domain-solution-810383109.html
https://www.prnewswire.com/news-releases/the9-limited-announces-two-time-nba-mvp-giannis-antetokounmpo-and-three-brothers-as-global-ambassadors-and-shareholders-of-new-nftstar-platform-301427701.html
https://www.prnewswire.com/news-releases/neolith-materializes-its-commitment-to-the-us-market-by-multiplying-its-openings-301427378.html
https://www.prnewswire.com/news-releases/savory--partners-st-kitts--nevis-offshore-banking-and-trusts-your-optimal-asset-protection-tools-301426728.html
https://www.prnewswire.com/news-releases/squaretalk-is-now-accepting-crypto-as-a-form-of-payment-301428285.html
https://www.prnewswire.com/news-releases/modulaire-group-strengthens-spanish-business-w




In [5]:
# Loop through article links, parse article body, date and add to dataframe.

print("Extracting data from scrapped content..\n\n")

data = pd.DataFrame(columns=["url", "article_date", "article_content"])

for i in blogs_link_list:
    blog = requests.get(i)
    blog_soup = BeautifulSoup(blog.text, 'html.parser')
    blog_body = blog_soup.find("section", attrs={'class': 'release-body container'})
    if not blog_body:
        blog_body = blog_soup.find("section", attrs={'class': 'release-body container '})
    blog_body = blog_body.text if blog_body else ""

    # Fetch article date.
    blog_date = blog_soup.find("meta", attrs={'name': 'date'}).attrs.get("content")
    matches = list(datefinder.find_dates(blog_date))
    blog_date = str(matches[0]) if matches else ""

    # Append to dataframe.
    data = data.append({
        "url": i,
        "article_date": blog_date,
        "article_content": blog_body},
        ignore_index=True)



Extracting data from scrapped content..




In [6]:
data.head()

Unnamed: 0,url,article_date,article_content
0,https://www.prnewswire.com/news-releases/doosa...,2021-11-18 03:00:00-05:00,"\n\n\nSÉOUL, Corée du Sud et LAS VEGAS, 18 nov..."
1,https://www.prnewswire.com/news-releases/commo...,2021-11-18 03:00:00-05:00,"\n\n\nKÖLN, 18. November 2021 /PRNewswire/ -- ..."
2,https://www.prnewswire.com/news-releases/the9-...,2021-11-18 03:00:00-05:00,"\n\n\nSINGAPORE, Nov. 18, 2021 /PRNewswire/ --..."
3,https://www.prnewswire.com/news-releases/neoli...,2021-11-18 03:00:00-05:00,"\n\n\nCASTELLÓN, Spain, Nov. 18, 2021 /PRNewsw..."
4,https://www.prnewswire.com/news-releases/savor...,2021-11-19 03:00:00-05:00,"\n\n\nDUBAI, UAE, Nov. 19, 2021 /PRNewswire/ -..."


In [7]:
#Dropping duplicates if any
data.drop_duplicates(subset = "article_content",inplace = True,ignore_index = True)

In [8]:
tickers = set()
for i in range(len(data)):
    temp = re.findall(r':\s[A-Z]{1,5}[)]',data.iloc[i]["article_content"])
    for tick in temp: 
        tickers.add(tick[-(len(tick) - 2):-1])

In [9]:
stocks = {}
for tick in tickers:
    stocks[tick] = yf.Ticker(tick).history(period = "YTD")

- LALAB: No data found, symbol may be delisted


In [41]:
stocks

{'FTSI':                  Open       High        Low      Close  Volume  Dividends  \
 Date                                                                        
 2021-01-04  19.879999  20.000000  18.620001  18.700001  138900          0   
 2021-01-05  18.629999  19.520000  18.500000  18.559999  251700          0   
 2021-01-06  18.830000  19.639999  18.730000  18.900000  328400          0   
 2021-01-07  18.850000  19.516001  18.440001  18.440001  172100          0   
 2021-01-08  18.420000  19.530001  17.780001  19.270000  105600          0   
 ...               ...        ...        ...        ...     ...        ...   
 2021-11-23  26.459999  26.600000  26.160000  26.500000  125700          0   
 2021-11-24  26.520000  26.799999  26.500000  26.620001   47200          0   
 2021-11-26  26.299999  26.750000  26.260000  26.520000  117600          0   
 2021-11-29  26.520000  26.570000  26.389999  26.500000  187500          0   
 2021-11-30  26.379999  26.520000  26.209999  26.340000 

In [145]:
def candlestick(ticker,increasing_line,decreasing_line):
        fig = go.Figure(data=[go.Candlestick(x=stocks[ticker].index,
                open=stocks[ticker]['Open'],
                high=stocks[ticker]['High'],
                low=stocks[ticker]['Low'],
                close=stocks[ticker]['Close'],increasing_line_color= increasing_line, decreasing_line_color= decreasing_line)])
        fig.update_layout(autosize=False,
        width=1000,
        height=800,)
        fig.show()
        
def plot_tick(ticker):
    fig = plt.figure(figsize=(30,21))
    ax1 = plt.subplot(2, 2, 1)
    plt.xticks(rotation=45)
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    ax2 = plt.subplot(2, 2, 2)
    plt.xticks(rotation=45)
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    ax2.yaxis.offsetText.set_fontsize(20)
    ax1.plot(stocks[ticker]["Close"])
    ax2.plot(stocks[ticker]["Volume"])
i_cl = ["gold","green","cyan"]
d_cl = ["gray","red","black"]


def preprocess(ticker):
    series = stocks[ticker]["Close"]
    series = series.pct_change()
    series = series.dropna()
    seq_data = []
    prev_days = deque(maxlen = 100)
    for i in series:
        prev_days.append(i)
        if len(prev_days) == 100:
            seq_data.append([np.array(prev_days)])
        X =[]
        for seq in seq_data:
            X.append(seq)
        X = np.array(X)
    return model_predict(X.reshape(-1,X.shape[2],X.shape[1]))

def model_predict(X):
        if np.argmax(model.predict(X)[-1]) == 0:
            print("Sell")
        else:
            print("buy")
    

In [11]:
widgets.interact(candlestick, ticker = stocks.keys(),increasing_line = i_cl,decreasing_line = d_cl)

interactive(children=(Dropdown(description='ticker', options=('FTSI', 'EMR', 'RLX', 'NCTY', 'ZME', 'LEVL', 'AZ…

<function __main__.candlestick(ticker, increasing_line, decreasing_line)>

In [12]:
widgets.interact(plot_tick, ticker = stocks.keys())

interactive(children=(Dropdown(description='ticker', options=('FTSI', 'EMR', 'RLX', 'NCTY', 'ZME', 'LEVL', 'AZ…

<function __main__.plot_tick(ticker)>

In [146]:
widgets.interact(preprocess, ticker = stocks.keys())

interactive(children=(Dropdown(description='ticker', options=('FTSI', 'EMR', 'RLX', 'NCTY', 'ZME', 'LEVL', 'AZ…

<function __main__.preprocess(ticker)>