### 1. Import Dependencies

In [13]:
# !pip install transformers

# to scrape websites
import requests
from bs4 import BeautifulSoup

# for text operation
import tensorflow as tf
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

### 2. Setup Model (summarization)

In [2]:
# Pegasus - huggingface library model trained and built for financial texts.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

### 3. Test a Single Article

In [8]:
url = "https://finance.yahoo.com/news/india-stocks-indian-shares-drop-103315194.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraph = soup.find_all('p')

In [9]:
paragraph

[<p>(Updates closing levels in paragraph 2, adds fresh analysts' comments)</p>,
 <p>By Bharath Rajeswaran</p>,
 <p>BENGALURU, May 18 (Reuters) - Indian shares reversed course to finish lower on Thursday, as declines in the stocks of State Bank of India Ltd and ITC Ltd added to the pressure of investors booking profits after an upbeat start to the session.</p>,
 <p>The Nifty 50 closed 0.28% down at 18,129.95, falling for the third session in a row, while the 30-member S&amp;P BSE Sensex fell 0.21% to 61,431.74.</p>,
 <p>The benchmarks opened higher and gained over 0.6% during the session on optimism that the United States was close to a deal to raise its debt ceiling.</p>,
 <p>However, that soon faded as traders booked profits, much like the previous two sessions following a gain of nearly 4% since mid-April when the earnings season started. The Nifty 50 hit a five-month high on Monday.</p>,
 <p>"It's a healthy consolidation in markets," said Ajit Mishra, vice president of technical res

In [10]:
# Clean text
text = [para.text for para in paragraph]
words = ' '.join(text).split(' ')[:400]
art = ' '.join(words)

In [11]:
# removed tags, end lines, etc.
art

'(Updates closing levels in paragraph 2, adds fresh analysts\' comments) By Bharath Rajeswaran BENGALURU, May 18 (Reuters) - Indian shares reversed course to finish lower on Thursday, as declines in the stocks of State Bank of India Ltd and ITC Ltd added to the pressure of investors booking profits after an upbeat start to the session. The Nifty 50 closed 0.28% down at 18,129.95, falling for the third session in a row, while the 30-member S&P BSE Sensex fell 0.21% to 61,431.74. The benchmarks opened higher and gained over 0.6% during the session on optimism that the United States was close to a deal to raise its debt ceiling. However, that soon faded as traders booked profits, much like the previous two sessions following a gain of nearly 4% since mid-April when the earnings season started. The Nifty 50 hit a five-month high on Monday. "It\'s a healthy consolidation in markets," said Ajit Mishra, vice president of technical research at Religare Broking, referring to the slide in the la

In [16]:
# Creating summary
input_text = tokenizer.encode(art, return_tensors='pt')
output = model.generate(input_text, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [17]:
# summarized text
summary

'Shares of State Bank of India and ITC fall after earnings reports'

### 4. Building Pipeline (scrape news &rarr; summarize text &rarr; sentiment)

In [18]:
# creating a list of topics to be scraped
topics = ['SBI', 'HDFC']

##### 4.1 Define a function to search for news using google and yahoo-finance

In [20]:
# search for "a" tags which contains hrefs i.e. urls
def search_urls(topic):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(topic)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [21]:
raw_urls = {topic:search_urls(topic) for topic in topics}

In [24]:
# Every item under a tag.
raw_urls

{'SBI': ['/?sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8QOwgC',
  '/search?q=yahoo+finance+SBI&tbm=nws&ie=UTF-8&gbv=1&sei=35qiZMmMLcnL5OUPtYqE-A4',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUIBSgA',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUIBygC',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICCgD',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+SBI&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICigF',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8QpwUIDQ',
  '/search?q=yahoo+finance+SBI&ie=U

In [25]:
# every item in 'SBI'
raw_urls['SBI']

['/?sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8QOwgC',
 '/search?q=yahoo+finance+SBI&tbm=nws&ie=UTF-8&gbv=1&sei=35qiZMmMLcnL5OUPtYqE-A4',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUIBSgA',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUIBygC',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICCgD',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICSgE',
 'https://maps.google.com/maps?q=yahoo+finance+SBI&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICigF',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8Q_AUICygG',
 '/advanced_search',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwiJ-MG9ovL_AhXJJbkGHTUFAe8QpwUIDQ',
 '/search?q=yahoo+finance+SBI&ie=UTF-8&tbm=nws&sourc

##### 4.2 Define a function to remove unwanted urls 

In [26]:
# We need regex library for this operation
import re

In [27]:
# create an exclude list of unwanted urls
# similar to stopwords in nltk library
excludewords = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [28]:
def remove_urls(urls, excludewords):
    valid = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in excludewords):
            result = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            valid.append(result)
    return list(set(valid)) # this line of code removes duplicates from the final list

In [29]:
cleaned_urls = {topic:remove_urls(raw_urls[topic], excludewords) for topic in topics}

In [30]:
# removed all unwanted urls
cleaned_urls

{'SBI': ['https://finance.yahoo.com/news/cellusion-raises-2-83-billion-020000523.html',
  'https://finance.yahoo.com/news/singapore-outlines-design-framework-interoperable-083244040.html',
  'https://finance.yahoo.com/news/japans-sbi-holdings-raises-stakes-035049208.html',
  'https://finance.yahoo.com/news/sepsis-market-size-share-grow-170000301.html',
  'https://finance.yahoo.com/news/sbi-forms-joint-venture-set-092620906.html',
  'https://finance.yahoo.com/news/condom-maker-draws-rush-buy-013000156.html',
  'https://finance.yahoo.com/news/blockfills-integrates-zodia-custody-fulfill-140000019.html',
  'https://finance.yahoo.com/news/bioprocess-technology-global-market-report-100400496.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BSBI%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/toyota-braces-investor-admonition-over-150100479.html',
  'https://finance.yahoo.com/news/qcp-capital-sbi-alpha-execute-113000676.html'],
 'HDFC': ['https://www.google.com/search?q

##### 4.3 A function to search and scrape through 'cleaned_urls'

In [36]:
def search_scrape(links):
    contents = []
    for link in links: 
        r = requests.get(link)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [parah.text for parah in paragraphs]
        words = ' '.join(text).split(' ')[:300]
        content = ' '.join(words)
        contents.append(content)
    return contents

In [37]:
article = {topic:search_scrape(cleaned_urls[topic]) for topic in topics}

In [38]:
# All the required content from the web is here
article

{'SBI': ['TOKYO, June 07, 2023--(BUSINESS WIRE)--Cellusion Inc. (Head Office: Tokyo; CEO: Shin Hatou; hereinafter referred to as "Cellusion") announced that it has raised 2.83 billion yen (21 million USD) in Series C Round financing. New investors of the third-party allocation of shares in this Round are the investment limited partnerships operated by JIC Venture Growth Investments Co., Ltd., NISSAY CAPITAL Co., Ltd., SPARX Asset Management Co., Ltd. (Mirai Creation Fund III), Axil Capital Partners II LLP, Nikon-SBI Innovation Fund, and AIS CAPITAL LIMITED, while existing investors are the investment limited partnerships operated by The University of Tokyo Edge Capital Partners Co., Ltd., DBJ Capital Co., Ltd., SMBC Venture Capital Co., Ltd., and Gemseki Inc. Cellusion has raised 4.5 billion yen (33 million USD) in total to date. [Purpose and background of financing] In line with its mission of "Regenerating Human Potential" and vision of "More Freedom and More Smiles to the World," Ce

##### 4.4 A function to summarize texts generated from each url.

In [39]:
def summarize(article):
    summaries = []
    for item in article:
        input_item = tokenizer.encode(item, return_tensors='pt')
        output = model.generate(input_item, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [40]:
summaries = {topic:summarize(article[topic]) for topic in topics}

In [41]:
# summary of each webpage is here
summaries

{'SBI': ['Series C round of financing was led by JIC Venture Growth Investments. Cellusion is promoting as its lead program the development of iPS cell-derived corneal cell substitute',
  'City state’s central bank enlists 11 financial institutions. JFSA becomes first overseas regulator to join MAS’ Project Guardian',
  'Online banking giant to take Shinsei private. Shares of smaller lenders are also being sold to raise cash',
  'The sepsis market in 7MM is expected to show good positive growth, during the forecast period.',
  'Carbon EX to start over-the-counter trading this month. Exchange to offer services in English and Japanese',
  'Shares have surged more than 60% since listing in May. Most analysts covering the company recommend buying it',
  'BlockFills, pioneer in the digital asset sector for liquidity provision, announces partnership with Zodia Custody.',
  'Bioprocess technology market is expected to grow from $18.70 billion in 2022 to $21.90 billion in 2023.',
  'All images

##### 4.5 Sentiment Analysis

In [42]:
# easy method : transformers and pipeline
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [43]:
# creating a scores dictionary to store sentiment scores
scores = {topic:sentiment(summaries[topic]) for topic in topics}

In [44]:
scores

{'SBI': [{'label': 'NEGATIVE', 'score': 0.951328694820404},
  {'label': 'POSITIVE', 'score': 0.9832780957221985},
  {'label': 'NEGATIVE', 'score': 0.9826846718788147},
  {'label': 'POSITIVE', 'score': 0.9992499947547913},
  {'label': 'NEGATIVE', 'score': 0.977104902267456},
  {'label': 'POSITIVE', 'score': 0.997575581073761},
  {'label': 'POSITIVE', 'score': 0.9941131472587585},
  {'label': 'POSITIVE', 'score': 0.9876918196678162},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.7681273221969604},
  {'label': 'NEGATIVE', 'score': 0.9764251112937927}],
 'HDFC': [{'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.5510118007659912},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9924783706665039},
  {'label': 'NEGATIVE', 'score': 0.9915587902069092},
  {'label': 'NEGATIVE', 'score': 0.5101869702339172},
  {'label': 'POSITIVE', 'score': 0.9471595883369446},
  {'label': 'NE