# 1. Install and import base line dependencies

In [16]:
! pip install transformers



In [17]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization model

In [18]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a single article

In [28]:
url = "https://finance.yahoo.com/news/lithium-extraction-tech-could-tesla-000000328.html"
r = requests.get(url)
soup = BeautifulSoup(r.text,'html.parser')
paragraphs = soup.find_all('p')

In [29]:
paragraphs

[<p class="xray-tooltip-text wafer-tooltip-text"></p>,
 <p>In Austin, Texas, Elon Musk is building his <a class="link rapid-noclick-resp" data-ylk="slk:Tesla Cybertruck plant" href="https://www.benchmarkminerals.com/membership/exclusive-tesla-becomes-first-automaker-to-enter-lithium-supply-chain-reshapes-foundations-of-china-dominated-spodumene-2/" rel="nofollow noopener" target="_blank">Tesla Cybertruck plant</a>…</p>,
 <p>And is developing a <a class="link rapid-noclick-resp" data-ylk="slk:new technology" href="https://chargedevs.com/newswire/tesla-develops-new-cost-slashing-lithium-extraction-process/" rel="nofollow noopener" target="_blank">new technology</a> to supply their huge lithium requirements.</p>,
 <p>Even Samsung is moving production from <a class="link rapid-noclick-resp" data-ylk="slk:overseas to Texas" href="https://www.wsj.com/articles/samsung-picks-texas-for-17-billion-chip-making-factory-11637710466#:~:text=Samsung%2C%20the%20world's%20largest%20semiconductor,30%20m

In [30]:
paragraphs[5].text

'American lithium is expected to be in high-demand for the next century…'

In [31]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
atricle = ' '.join(words)

In [32]:
words

['',
 'In',
 'Austin,',
 'Texas,',
 'Elon',
 'Musk',
 'is',
 'building',
 'his',
 'Tesla',
 'Cybertruck',
 'plant…',
 'And',
 'is',
 'developing',
 'a',
 'new',
 'technology',
 'to',
 'supply',
 'their',
 'huge',
 'lithium',
 'requirements.',
 'Even',
 'Samsung',
 'is',
 'moving',
 'production',
 'from',
 'overseas',
 'to',
 'Texas',
 'Why?',
 'American',
 'lithium',
 'is',
 'expected',
 'to',
 'be',
 'in',
 'high-demand',
 'for',
 'the',
 'next',
 'century…',
 'And',
 'early',
 'investors',
 'in',
 'the',
 'companies',
 'set',
 'to',
 'profit',
 'could',
 'receive',
 'significant',
 'returns',
 'on',
 'their',
 'investment',
 'in',
 '2022.',
 'Few',
 'investors',
 'may',
 'know',
 'that',
 'the',
 'US',
 'has',
 'some',
 'of',
 'the',
 'world’s',
 'largest',
 'lithium',
 'reserves…',
 'And',
 'with',
 'supply-chain',
 'issues',
 'and',
 'China’s',
 'emergence',
 'as',
 'a',
 'lithium',
 'power-house…',
 'Lithium',
 'production',
 'is',
 'now',
 'flooding',
 'back',
 'to',
 'America.',

In [33]:
len(words)

400

In [34]:
atricle

" In Austin, Texas, Elon Musk is building his Tesla Cybertruck plant… And is developing a new technology to supply their huge lithium requirements. Even Samsung is moving production from overseas to Texas Why? American lithium is expected to be in high-demand for the next century… And early investors in the companies set to profit could receive significant returns on their investment in 2022. Few investors may know that the US has some of the world’s largest lithium reserves… And with supply-chain issues and China’s emergence as a lithium power-house… Lithium production is now flooding back to America. We think one little-known stock could benefit the most because it is the leader in what could become a unique, new lithium extraction technology… Details below… LITHIUM DEMAND ESTIMATED TO INCREASE 40X: 25,000 TONNE DEFICIT PER YEAR FORECASTED! Not only do supply chain issues have American companies scrambling to bring business back to America… But current levels and types of lithium ext

In [35]:
input_ids = tokenizer.encode(atricle, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [36]:
summary

'Demand for lithium is expected to increase by 40x over next 20 years. U.S. companies are developing new technology to meet demand'

In [37]:
input_ids

tensor([[  222,  5098,   108,  1824,   108, 32981, 20248,   117,   563,   169,
         11997,  9826, 25863,  1306,   401,   325,   117,  1690,   114,   177,
           552,   112,  1376,   153,  1124, 17935,  1096,   107,  1513,  4122,
           117,  1218,   889,   135,  5685,   112,  1824,  1807,   152,   655,
         17935,   117,  1214,   112,   129,   115,   281,   121, 11965,   118,
           109,   352,  1902,   401,   325,   616,  2714,   115,   109,   524,
           323,   112,  3508,   256,   719,  1225,  3115,   124,   153,  1237,
           115, 38019, 16102,  2714,   218,   235,   120,   109,   787,   148,
           181,   113,   109,   278,   123,   116,  1368, 17935,  7106,   401,
           325,   122,  1376,   121, 18894,   618,   111,  1224,   123,   116,
         15922,   130,   114, 17935,   484,   121,  2907,   401, 31331,   889,
           117,   239, 10233,   247,   112,  1086,   107,   184,   311,   156,
           332,   121,  4338,  1279,   256,  1280,  

# 4. Building a news and sentiment pipeline

In [49]:
monitored_tickers = ['reliance', 'grasim', 'hindalco']

## 4.1 Search for stock news using google and yahoo finance

In [50]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=mint+{}+&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [51]:
raw_urls = {ticker:search_for_stock_news_urls(monitored_tickers) for ticker in monitored_tickers}
raw_urls

{'reliance': ['/?sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQPAgE',
  '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&tbm=nws&ie=UTF-8&gbv=1&sei=C8SwYYkgiaOx4w-p7I-YBQ',
  '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUIBygA',
  'https://maps.google.com/maps?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICSgC',
  '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICigD',
  '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICygE',
  '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0

In [52]:
raw_urls.keys()

dict_keys(['reliance', 'grasim', 'hindalco'])

In [53]:
raw_urls.values()

dict_values([['/?sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQOwgC', '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQPAgE', '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&tbm=nws&ie=UTF-8&gbv=1&sei=C8SwYYkgiaOx4w-p7I-YBQ', '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUIBygA', 'https://maps.google.com/maps?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICSgC', '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICigD', '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1MQ_AUICygE', '/search?q=mint+%5B%27reliance%27,+%27grasim%27,+%27hindalco%27%5D&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjJ0YPbt9T0AhWJUWwGHSn2A1

## 4.2 Strip out unwanted URLs


In [54]:
import re

In [55]:
exclude_list = ['maps' ,'policies' ,'preferences', 'accounts', 'support']

In [56]:
def strip_unwanted_urls(urls,exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)',url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [57]:
strip_unwanted_urls(raw_urls['reliance'],exclude_list)

['https://www.livemint.com/companies/news/birla-may-infuse-at-least-150-mn-into-vodafone-idea-this-month-11635879044507.html',
 'https://www.livemint.com/market/stock-market-news/stocks-to-watch-ril-dr-reddy-s-cadila-tata-motors-jet-airways-eicher-motors-11592188160640.html',
 'https://www.livemint.com/market/stock-market-news/the-ongoing-churn-in-indian-capitalism-and-the-sensex-1555999278980.html',
 'https://www.livemint.com/market/stock-market-news/three-of-10-top-nifty-performers-in-samvat-2077-from-tata-group-11636180793638.html',
 'https://www.livemint.com/companies/news/aditya-birla-group-s-stake-rises-to-27-18-post-vodafone-idea-rights-issue-1557249529794.html',
 'https://www.livemint.com/market/live-blog/share-market-live-updates-sensex-nifty-bse-nse-stock-market-today-02-11-2021-11635816878699.html',
 'https://www.livemint.com/Companies/pdrr5qntcmigCAvE0CXSUN/In-Birlas-20year-transformation-a-mix-of-caution-and-ambi.html',
 'https://www.livemint.com/market/live-blog/share-mar

In [58]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'reliance': ['https://www.livemint.com/companies/news/birla-may-infuse-at-least-150-mn-into-vodafone-idea-this-month-11635879044507.html',
  'https://www.livemint.com/market/stock-market-news/stocks-to-watch-ril-dr-reddy-s-cadila-tata-motors-jet-airways-eicher-motors-11592188160640.html',
  'https://www.livemint.com/market/stock-market-news/the-ongoing-churn-in-indian-capitalism-and-the-sensex-1555999278980.html',
  'https://www.livemint.com/market/stock-market-news/three-of-10-top-nifty-performers-in-samvat-2077-from-tata-group-11636180793638.html',
  'https://www.livemint.com/companies/news/aditya-birla-group-s-stake-rises-to-27-18-post-vodafone-idea-rights-issue-1557249529794.html',
  'https://www.livemint.com/market/live-blog/share-market-live-updates-sensex-nifty-bse-nse-stock-market-today-02-11-2021-11635816878699.html',
  'https://www.livemint.com/Companies/pdrr5qntcmigCAvE0CXSUN/In-Birlas-20year-transformation-a-mix-of-caution-and-ambi.html',
  'https://www.livemint.com/market

## 4.3 Search and scrape cleaned URLs

In [66]:
def scrape_and_process(URLs):
    articles = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        article = ' '.join(words)
        articles.append(article)
    return articles

In [67]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'reliance': ['Vodafone Group may invest in Vodafone Idea by using the proceeds from a planned sale of a stake in Indus Towers \n\n\tAditya Birla Group chairman Kumar Mangalam Birla is close to investing at least $150 million in Vodafone Idea Ltd in his personal capacity, two people aware of the matter said, as an immediate measure to keep the group’s cash-strapped telecom business afloat. \n\n\t“An issue of fresh convertibles or equities is being considered. Birla is likely to invest from his personal assets as early as this month," one of the two people cited above said on condition of anonymity. \n\n\t“This capital infusion by Birla is crucial. Post Birla’s infusion in a personal capacity, Vodafone Group Plc (the company’s UK-based promoter), too, is planning to infuse a similar amount into Vodafone Idea either as a single investor or as part of a consortium of investors, who may together invest close to $1 billion into Vodafone Idea in March 2022," this person added. \n\n\tBirla’s 

In [68]:
articles['reliance']

['Vodafone Group may invest in Vodafone Idea by using the proceeds from a planned sale of a stake in Indus Towers \n\n\tAditya Birla Group chairman Kumar Mangalam Birla is close to investing at least $150 million in Vodafone Idea Ltd in his personal capacity, two people aware of the matter said, as an immediate measure to keep the group’s cash-strapped telecom business afloat. \n\n\t“An issue of fresh convertibles or equities is being considered. Birla is likely to invest from his personal assets as early as this month," one of the two people cited above said on condition of anonymity. \n\n\t“This capital infusion by Birla is crucial. Post Birla’s infusion in a personal capacity, Vodafone Group Plc (the company’s UK-based promoter), too, is planning to infuse a similar amount into Vodafone Idea either as a single investor or as part of a consortium of investors, who may together invest close to $1 billion into Vodafone Idea in March 2022," this person added. \n\n\tBirla’s capital infus

In [73]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article,return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [74]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

KeyboardInterrupt: 