In [2]:
import os
import requests
from bs4 import BeautifulSoup
from scipy.special import softmax
from transformers import AutoTokenizer
from urllib.parse import unquote
from transformers import AutoModelForSequenceClassification
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import BartTokenizer, BartForConditionalGeneration

In [3]:
def scraping_article(url):
    headers = {
    'User-Agent': 'Your User Agent String',
    }
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'html.parser')
    paragraphs=soup.find_all('p')
    text= [paragraph.text for paragraph in paragraphs]
    words=' '.join(text).split(' ')
    article = ' '.join(words)
    return article

In [4]:
def find_news_url(keyword, start_date, end_date):
    root = "https://www.google.com/"
    search_query = keyword.replace(" ", "+")
    link = f"{root}search?q={search_query}&tbm=nws&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    news_links = []

    for article in soup.select('div.SoaBEf'):
        link = article.select_one('a')
        if link and 'href' in link.attrs:
            url = link['href']
            if url.startswith('/url?q='):
                url = unquote(url.split('/url?q=')[1].split('&sa=')[0])
            news_links.append(url)

    return news_links

In [5]:
def to_chunks(data):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=50
    )
    docs=text_splitter.split_text(data)
    return docs

In [6]:
model_name="facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [7]:
def summarize_text(tokenizer, model, text, max_chunk_length, summary_max_length):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_chunk_length, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=summary_max_length, min_length=200, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [8]:
def summarize_article(tokenizer,model,url):
    data = scraping_article(url)
    chunks = to_chunks(data)
    # tokenizer, model=load_pegasus_model("google/pegasus-xsum")
    # tokenizer, model = load_bart_model(model_name)
    summaries = []
    for chunk in chunks:
        chunk_text = chunk
        summary = summarize_text(tokenizer, model, chunk_text,3000,800)
        summaries.append(summary)
    concatenated_summaries = " ".join(summaries)
    #  Second summarization pass: Summarize the concatenated summaries
    intermediate_chunks = [concatenated_summaries[i:i+3000] for i in range(0, len(concatenated_summaries), 3000)]
    final_summaries = []
    for intermediate_chunk in intermediate_chunks:
        final_summary = summarize_text(tokenizer, model, intermediate_chunk,3000,800)
        final_summaries.append(final_summary)
    final_summary_text = " ".join(final_summaries)
    return final_summary_text

In [44]:
for i in range(9,10):
    url=find_news_url('tesla',"04/03/2024","04/03/2024")[i]
    summary = summarize_article(tokenizer,model,url)
    print(summary)
    print()


Tesla sales for the first quarter of the year have dropped significantly. The electric vehicle giant has reported 3,86,810 global deliveries, down 8.5 per cent. It is the first annual drop in sales since the first year of the COVID-19 pandemic. Some analysts believe that itâs competition and the current supply chain issues hampering sales. But others say that it might be the company's chief who is putting off potential customers. Elon Musk has courted several controversies, especially on matters of anti-Semitism and transphobia. He has also endorsed several right-wing conspiracy theories on his platform X. The billionaire has also been accused of promoting a long-debunked conspiracy theory which alleged high-profile Democrats ran a paedophile abuse ring from a Washington pizza restaurant. 42 per cent of respondents had an unfavourable view of Musk in February, up from 34 per cent in April 2022. But there are still many who remain loyal to Tesla and Elon Musk.

