## 1. Install and Import Baseline Dependencies

In [7]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline
import csv

## 2. Setup Model

In [2]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 ## 3. Setup Pipeline

In [None]:
# monitored_tickers = ['ETH']

In [4]:
# Take input from the user for the monitored tickers
monitored_tickers = ["AUDUSD"]

## 4.1. Search for Stock News using Google and Yahoo Finance

In [5]:
print('Searching For Stock News For', monitored_tickers)
def search_for_stock_news_links(ticker):
    search_url = 'https://www.google.com/search?q=yahoo+finance+{}&tbm=nws'.format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    print(hrefs)
    return hrefs

Searching For Stock News For ['AUDUSD']


In [8]:
raw_urls = {ticker:search_for_stock_news_links(ticker) for ticker in monitored_tickers}

['/?sa=X&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQOwgC', '/search?q=yahoo+finance+AUDUSD&sca_esv=fb7aeb5dce633c51&ie=UTF-8&tbm=nws&gbv=1&sei=0zapZ8SEKuG-kPIP39_juAw', '/search?q=yahoo+finance+AUDUSD&sca_esv=fb7aeb5dce633c51&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQ_AUIBSgA', '/search?q=yahoo+finance+AUDUSD&sca_esv=fb7aeb5dce633c51&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQ_AUIBygC', '/search?q=yahoo+finance+AUDUSD&sca_esv=fb7aeb5dce633c51&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQ_AUICCgD', '/search?q=yahoo+finance+AUDUSD&sca_esv=fb7aeb5dce633c51&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQ_AUICSgE', '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BAUDUSD%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiEut6G3LeLAxVhH0QIHd_vGMcQiaAMCAooBQ&usg=AOvVaw2N5sIt9jy3AhAHekjcxFxv', '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BAUDUSD%26sca_esv%3Dfb7aeb5

## 4.2. Strip - Out Unwanted URLs

In [9]:
print('Cleaning URLs...')
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exc in url for exc in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker] , exclude_list) for ticker in monitored_tickers}

Cleaning URLs...


In [10]:
for url in cleaned_urls.values():
  print(url)

['https://finance.yahoo.com/news/aud-usd-forecast-aussie-dollar-121022904.html', 'https://finance.yahoo.com/news/aud-usd-weekly-price-forecast-141124717.html', 'https://finance.yahoo.com/news/aud-usd-declines-ahead-rba-163800442.html', 'https://finance.yahoo.com/news/aud-usd-weekly-price-forecast-130811765.html', 'https://finance.yahoo.com/news/aud-usd-forecast-australian-dollar-132830205.html', 'https://finance.yahoo.com/news/aud-usd-weekly-price-forecast-132935240.html', 'https://finance.yahoo.com/news/aud-usd-forecast-australian-dollar-130812973.html', 'https://finance.yahoo.com/news/aud-usd-forecast-aussie-dollar-122342692.html', 'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BAUDUSD%26tbm%3Dnws%26pccc%3D1', 'https://finance.yahoo.com/news/aud-usd-forecast-aussie-dollar-132834054.html', 'https://finance.yahoo.com/news/aud-usd-forecast-aussie-continues-161221802.html']


## 4.3. Search and Scrape Cleaned URLs

In [10]:
print('Scraping News Links...')
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        results = soup.find_all('p')
        text = [res.text for res in results]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

Scraping News Links...


In [11]:
for article in articles.values():
  print(article)

['Thank you for your patience. Our engineers are working quickly to resolve the issue.', '© 2025 - Privacy - Terms', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.', 'Thank you for your patience. Our engineers are working quickly to resolve the issue.']


## 4.4. Summarise all Articles

In [12]:
print('Summarizing Articles...')
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors="pt")
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}

Summarizing Articles...


In [13]:
for summary in summaries.values():
  print(summary)

['We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.', 'Your information may be shared with third parties.', 'We are aware of the issue and are working to resolve it.', 'We are aware of the issue and are working to resolve it.']


## 5. Adding Sentiment Analysis

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}

## 6. Exporting Results

In [None]:
import datetime

print('Exporting Results...')

def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                            ticker,
                            summaries[ticker][counter],
                            scores[ticker][counter]['label'],
                            scores[ticker][counter]['score'],
                            urls[ticker][counter]
                          ]
            output.append(output_this)
    return output

final_output = create_output_array(summaries, scores, cleaned_urls)
final_output.insert(0, ['Ticker','Summary', 'Sentiment', 'Sentiment Score', 'URL'])

# Get the current date and time in the format DD_MM_YYYY_HH_MM_SS
current_date_time = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')

# Construct the file name using the current date and time
file_name = f'summaries_{current_date_time}.csv'

# Open the file for writing
with open(file_name, mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

Exporting Results...


In [None]:
print('Done!')

Done!


In [None]:
## Umang Laad
## 20100BTCSDSI07300

## Anaconda3 Prompt Commands

In [None]:
## cd "C:\Users\laad_"
## python Stock-and-Crypto-News-ScrapingSummarizationSentiment_1.1.py