In [1]:
import os
import requests
from bs4 import BeautifulSoup
from scipy.special import softmax
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import BartTokenizer, BartForConditionalGeneration

In [2]:
def scraping_article(url):
    headers = {
    'User-Agent': 'Your User Agent String',
    }
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'html.parser')
    paragraphs=soup.find_all('p')
    text= [paragraph.text for paragraph in paragraphs]
    words=' '.join(text).split(' ')
    article = ' '.join(words)
    return article

In [3]:
def find_news_url(keyword, start_date, end_date):
    root = "https://www.google.com/"
    search_query = keyword.replace(" ", "+")
    link = f"{root}search?q={search_query}&tbm=nws&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    news_links = []

    for article in soup.select('div.SoaBEf'):
        link = article.select_one('a')
        if link and 'href' in link.attrs:
            url = link['href']
            if url.startswith('/url?q='):
                url = unquote(url.split('/url?q=')[1].split('&sa=')[0])
            news_links.append(url)

    return news_links

In [4]:
def to_chunks(data):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=50
    )
    docs=text_splitter.split_text(data)
    return docs

In [10]:
model_name="facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [6]:
def summarize_text(tokenizer, model, text, max_chunk_length, summary_max_length):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_chunk_length, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=summary_max_length, min_length=200, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [11]:
def summarize_article(tokenizer,model,url):
    data = scraping_article(url)
    chunks = to_chunks(data)
    # tokenizer, model=load_pegasus_model("google/pegasus-xsum")
    # tokenizer, model = load_bart_model(model_name)
    summaries = []
    for chunk in chunks:
        chunk_text = chunk
        summary = summarize_text(tokenizer, model, chunk_text,3000,800)
        summaries.append(summary)
    concatenated_summaries = " ".join(summaries)
    #  Second summarization pass: Summarize the concatenated summaries
    intermediate_chunks = [concatenated_summaries[i:i+3000] for i in range(0, len(concatenated_summaries), 3000)]
    final_summaries = []
    for intermediate_chunk in intermediate_chunks:
        final_summary = summarize_text(tokenizer, model, intermediate_chunk,3000,800)
        final_summaries.append(final_summary)
    final_summary_text = " ".join(final_summaries)
    return final_summary_text

In [41]:
url=find_news_url('tesla',"01/11/2024","01/11/2024")[8]
print(url)

https://techcrunch.com/2024/01/11/hertz-sell-evs-tesla-fleet-gm-polestar-gas/


In [42]:
summary = summarize_article(tokenizer,model,url)
print(summary)

Hertz is selling off a third of its electric vehicle fleet, which is predominantly made up of Teslas. The company cited lower demand for EVs and higher-than-expected repair costs as reasons for the decision. Hertz’s move to slash its EV fleet comes as electric vehicle sales growth has cooled from record highs. The U.K. government announced sanctions against 12 executives and senior leaders of the Russia-based cybersecurity giant Kaspersky. Google said today that it globally paused its experiment that aimed to allow new kinds of real-money games on the Play Store, citing the chasm between Google and other major publishers. Spotify is introducing a new “Basic” streaming plan in the United States. Squarespace is selling Tock, its restaurant reservation service, to American Express in a deal worth $400 million. The February ransomware attack on UHG-owned Change Healthcare stands as one of the largest-ever known digital thefts of U.S. medical records. Nearly one in three girls have seriousl