In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
def to_chunks(data):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=50
    )
    docs=text_splitter.split_text(data)
    return docs

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [4]:
def load_distilbert_model(model_name="sshleifer/distilbart-cnn-12-6"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

In [5]:
def summarize_text(tokenizer, model, text, max_chunk_length, summary_max_length):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_chunk_length, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=summary_max_length, min_length=200, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [6]:
def summarize_article(article, model_name="sshleifer/distilbart-cnn-12-6"):
    data = article
    chunks = to_chunks(data)
    tokenizer, model = load_distilbert_model(model_name)
    summaries = []
    for chunk in chunks:
        chunk_text = chunk
        summary = summarize_text(tokenizer, model, chunk_text,3000,800)
        summaries.append(summary)
    concatenated_summaries = " ".join(summaries)
    #  Second summarization pass: Summarize the concatenated summaries
    intermediate_chunks = [concatenated_summaries[i:i+3000] for i in range(0, len(concatenated_summaries), 3000)]
    final_summaries = []
    for intermediate_chunk in intermediate_chunks:
        final_summary = summarize_text(tokenizer, model, intermediate_chunk,3000,800)
        final_summaries.append(final_summary)
    
    final_summary_text = " ".join(final_summaries)
    
    return final_summary_text

In [7]:
import requests
from bs4 import BeautifulSoup


In [33]:
url = "https://www.hindustantimes.com/business/elon-musk-s-x-wants-laid-off-employees-to-give-money-back-they-were-overpaid-101718355497670.html"
headers = {
    'User-Agent': 'Your User Agent String',
}
r = requests.get(url, headers=headers)

soup=BeautifulSoup(r.text,'html.parser')
paragraphs=soup.find_all('p')

In [34]:
paragraphs

[<p>Subscribe Now! Get features like</p>,
 <p>Elon Musk's company X (formerly Twitter) is asking at least six former Australian employees to repay the money which they have been accidentally given. The company is threatening legal action to recover these overpayments, Sydney Morning Herald reported. The issue come after an error in currency conversion from US dollars to Australian dollars. Emails from the company's Asia Pacific HR department show that overpayments occurred in January 2023, ranging from $1,500 to $70,000 per employee.</p>,
 <p>As per the report, these payments were part of a 'deferred cash compensation' tied to employee shares which were originally valued at $54.20 USD each which is the price Elon Musk paid when he acquired Twitter in 2022. None of the former employees have returned the money, the report claimed. The overpayments happened because Elon Musk's company used an incorrect conversion rate which was 2.5 times the actual value.</p>,
 <p>As per Australian law, o

In [10]:
paragraphs[2].text

'The social media platform, formerly known as Twitter, acknowledged its error in currency conversion from the US to Australian dollars on the payments and asked its former employees to repay amounts of up to $70,000 in some cases, the report said on Wednesday.\xa0'

In [22]:
text= [paragraph.text for paragraph in paragraphs]
words=' '.join(text).split(' ')
article = ' '.join(words)

In [23]:
print(article)

File photo Billionaire Elon Musk's X has asked its sacked employees in Australia to return the money it claims was accidentally overpaid to them, the Sydney Morning Herald reported. The social media platform, formerly known as Twitter, acknowledged its error in currency conversion from the US to Australian dollars on the payments and asked its former employees to repay amounts of up to $70,000 in some cases, the report said on Wednesday.  The currency conversion errors made by X when employees were paid their entitlements after they were fired had reportedly led to overpayments of between $1,500 and $70,000. According to the report, at least six former X staff have received legal notices.  "It has come to our attention that you received a significant overpayment in error in January 2023," the Sydney Morning Herald quoted X's Asia Pacific human resources department as saying, citing an email to several former employees this year. "We would be grateful if you could arrange the repayment 

In [24]:
summary = summarize_article(article)


In [14]:
print(summary)

 The social media platform, formerly known as Twitter, acknowledged its error in currency conversion from the US to Australian dollars on the payments. X has been accused in multiple suits of numerous labor and workplace violations. Thousands of former Twitter employees said they were cheated of severance pay when the billionaire laid them off after acquiring the social media site. X also said that the overpayment was related to "deferred cash compensation" in the form of employee shares issued to the staff when they joined Twitter. At least six former X staff have received legal notices, according to the Sydney Morning Herald report, the report said. The billionaire's $44 billion acquisition of Twitter in 2022 is due to take place in 2022. Back to Mail Online home to go to the page you came from: http://www.dailymailonline.com/news/sundestribune/dailymail/dailydailymail/. Back to the original version of this article. Back to The Newsquiz.com: Please contact us on


In [52]:
import os
from dotenv import load_dotenv

In [56]:
load_dotenv()
api_key_news=os.getenv('NEWSAPI_ORG')


In [59]:
# import requests

# # Replace with your API key
# api_key = api_key_news
# base_url = 'https://newsapi.org/v2/everything'

# # Example query parameters
# params = {
#     'q': 'apple',  # Example keyword (replace with your specific keyword)
#     'domains': 'techcrunch.com,theverge.com',  # Domains related to technology
#     'sortBy': 'publishedAt',  # Sort by published date
#     'pageSize': 5,  # Number of results per page (max 100)
#     'apiKey': api_key
# }

# response = requests.get(base_url, params=params)

# if response.status_code == 200:
#     data = response.json()
#     articles = data['articles']
#     if articles:
#         for idx, article in enumerate(articles):
#             print(f"News {idx + 1}: {article['title']}")
#             print(f"URL: {article['url']}")
#             print()  # Add a blank line for readability
#     else:
#         print("No articles found.")
# else:
#     print("Error fetching data:", response.status_code)


In [129]:
from pygooglenews import GoogleNews

In [130]:
gn = GoogleNews()
search_results = gn.search('technology')


In [131]:
urls = [entry['link'] for entry in search_results['entries'][:5]]

In [132]:
for url in urls:
    print(url)
    print()

https://news.google.com/rss/articles/CBMiYmh0dHBzOi8vd3d3LnlhaG9vLmNvbS90ZWNoL3NjaWVudGlzdHMtc2lnbmlmaWNhbnQtYnJlYWt0aHJvdWdoLW1pY3JvY2hpcC10ZWNobm9sb2d5LTA5MDAwMDg0OC5odG1s0gEA?oc=5

https://news.google.com/rss/articles/CBMijAFodHRwczovL3d3dy5lc3BuLmNvbS91bml0ZWQtZm9vdGJhbGwtbGVhZ3VlL3N0b3J5L18vaWQvNDAzMzAwNzgvdWZsLXRlY2hub2xvZ3ktaW5ub3ZhdGlvbnMtbmZsLWJhbGwtc3BvdHRpbmctaGVsbWV0LXRyYW5zbWl0dGVycy12aWRlby1pcGFkc9IBAA?oc=5

https://news.google.com/rss/articles/CBMiemh0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvZGF2aWRiYWxhYmFuLzIwMjQvMDYvMTUvc2VjdXJpbmctZWR1Y2F0aW9uYWwtaW5zdGl0dXRpb25zLWluLXRoZS1lcmEtb2YtaW1tZXJzaXZlLXRlY2hub2xvZ3kv0gEA?oc=5

https://news.google.com/rss/articles/CBMiT2h0dHBzOi8vbmV3cy5taXQuZWR1LzIwMjQvY3JlYXRpb24tc3RvcnktdG9sZC10aHJvdWdoLWltbWVyc2l2ZS10ZWNobm9sb2d5LTA2MTTSAQA?oc=5

https://news.google.com/rss/articles/CBMiVmh0dHBzOi8vZm9ydHVuZS5jb20vMjAyNC8wNi8xMy9jbGltYXRlLXRlY2hub2xvZ3ktY29uc3VtZXIta2luZy1ldi1yZXRhaWwtZW52aXJvbm1lbnQv0gEA?oc=5



In [197]:
import requests
from bs4 import BeautifulSoup
import time

def scraping_article(url):
    headers = {
        'User-Agent': 'Your User Agent String',
    }
    
    # Perform the initial request without allowing redirects
    r = requests.get(url, headers=headers, allow_redirects=False)
    
    # Check if there was a redirection
    if r.status_code == 302:  # Status code 302 indicates a redirection
        redirected_url = r.headers['Location']
        print(f"Redirected to: {redirected_url}")
        
        # Now fetch the redirected URL
        r = requests.get(redirected_url, headers=headers)
    else:
        redirected_url = url  # No redirection, use the original URL
    
    # Delay to ensure the page is fully loaded (adjust as needed)
    time.sleep(5)
    
    soup = BeautifulSoup(r.text, 'html.parser')
    paragraphs = soup.find_all('p')
    text = [paragraph.text for paragraph in paragraphs]
    article = ' '.join(text)
    
    return article, redirected_url


In [171]:
def find_url(keyword):    
    gn = GoogleNews(country='IN')
    search_results = gn.search(keyword)
    urls = [entry['link'] for entry in search_results['entries'][:5]]
    return urls[0]

In [207]:
from newsapi import NewsApiClient

def find_news_urls(keyword):
    # Initialize News API client with your API key
    newsapi = NewsApiClient(api_key='e2fbdb35e6ac4796ab195b29fd82bd93')

    # Search for news articles based on keyword
    try:
        # Specify the language and sources if needed
        articles = newsapi.get_everything(q=keyword, language='en', page_size=5)

        # Extract URLs from the articles
        urls = [article['url'] for article in articles['articles']]
        return urls

    except Exception as e:
        print(f"Error fetching news articles: {e}")
        return []

# Example usage:
keyword = 'apple'
urls = find_news_urls(keyword)
print(urls)


['https://removed.com', 'https://removed.com', 'https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_8823786c-90f4-4b30-8198-1d295ab9a66c', 'https://removed.com', 'https://removed.com']


In [204]:
url=find_urls('apple')
print(url)

['https://www.apple.com/in/', 'https://www.apple.com/in/', 'https://www.instagram.com/apple/?hl=en', 'https://www.icloud.com/', 'https://www.icloud.com/']


In [198]:
a,b=scraping_article("https://news.google.com/rss/articles/CBMidWh0dHBzOi8vaW5kaWFuZXhwcmVzcy5jb20vYXJ0aWNsZS90ZWNobm9sb2d5L2FydGlmaWNpYWwtaW50ZWxsaWdlbmNlL2FwcGxlLWFpLXd3ZGMtYXBwbGUtaW50ZWxsaWdlbmNlLWdlbi1haS05MzkxMTYzL9IBemh0dHBzOi8vaW5kaWFuZXhwcmVzcy5jb20vYXJ0aWNsZS90ZWNobm9sb2d5L2FydGlmaWNpYWwtaW50ZWxsaWdlbmNlL2FwcGxlLWFpLXd3ZGMtYXBwbGUtaW50ZWxsaWdlbmNlLWdlbi1haS05MzkxMTYzL2xpdGUv?oc=5")

NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [196]:
print(a)




In [208]:
from bs4 import BeautifulSoup
import requests
from urllib.request import Request,urlopen

In [210]:
    root="https://www.google.com/"

    link="https://www.google.com/search?q=elon+musk&sca_esv=98cf8f6fdb8bd6f4&biw=1280&bih=551&tbm=nws&sxsrf=ADLYWIIZRUl5d0YuJvdGYn_GeaiCwyf2LA:1718530241064&story=GkEIpQEaGgoTc3RvcnlfbGFiZWxfcGFydGlhbBIDRVZNGiAKE3N0b3J5X2xhYmVsX3BhcnRpYWwSCUVsb24gTXVzazIuCiSI8oTgz4WLj1Owp6yThJCO7QSLjPP_wJTe9WiO1Nn76oSX1gMQt6eg6QsYBXICEAI&fcs=ACgqzedYp_suCJzWAjo7rGmw3tYrNkRaug&sa=X&ved=2ahUKEwjXqreE6N-GAxUUTGwGHRZMDCUQ7IUHegQIKhAF"

In [211]:
req = Request(link, headers={'User-Agent':'Mozilla/5.0'})
webpage= urlopen(req).read()

In [242]:
with requests.Session() as c:
    soup=BeautifulSoup(webpage,'html5lib')
    # print(soup)tHmfQe
    for items in soup.find_all('div',attrs={'class':'Gx5Zad'}):
        a_tag = items.find('a')
        if a_tag:
            # Extract the value of the 'href' attribute
            link = a_tag.get('href')
            print(link)        
        # print(items)

/search?q=elon+musk&sca_esv=98cf8f6fdb8bd6f4&biw=1280&bih=551&tbm=nws&story=GkEIpQEaGgoTc3RvcnlfbGFiZWxfcGFydGlhbBIDRVZNGiAKE3N0b3J5X2xhYmVsX3BhcnRpYWwSCUVsb24gTXVzazIuCiSI8oTgz4WLj1Owp6yThJCO7QSLjPP_wJTe9WiO1Nn76oSX1gMQt6eg6QsYBXICEAI&fcs=ACgqzedYp_suCJzWAjo7rGmw3tYrNkRaug&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwiWzcfF6N-GAxUVr5UCHS0UDswQpwUIDQ
/url?q=https://www.indiatoday.in/india/story/elon-musk-evm-rajeev-chandrasekhar-bjp-congress-rahul-gandhi-controversy-2553867-2024-06-16&sa=U&ved=2ahUKEwiWzcfF6N-GAxUVr5UCHS0UDswQxfQBegQICBAC&usg=AOvVaw3NDIjptDHIMrua_dgw1V3I
/url?q=https://www.ndtv.com/world-news/we-should-eliminate-evms-elon-musk-flags-risk-of-poll-rigging-5900100&sa=U&ved=2ahUKEwiWzcfF6N-GAxUVr5UCHS0UDswQxfQBegQICRAC&usg=AOvVaw2nXGLwuAD0SA3Sg-0IZZ6K
/url?q=https://www.thehindu.com/sci-tech/technology/rajeev-chandrasekhar-counters-elon-musks-evm-tampering-remark-calls-it-a-sweeping-generalisation/article68296179.ece&sa=U&ved=2ahUKEwiWzcfF6N-GAxUVr5UCHS0UDswQxfQBegQIBxAC&usg=AOvVa

In [246]:
from bs4 import BeautifulSoup
import requests

root = "https://www.google.com/"
search_query = "modi"
link = f"https://www.google.com/search?q={search_query}&tbm=nws"

# Send request with headers to mimic a browser
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(link, headers=headers)
webpage = response.content

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(webpage, 'html5lib')

# Initialize an empty list to store the links
links = []

# Find all div elements with class 'Gx5Zad'
for div_tag in soup.find_all('div', class_='Gx5Zad'):
    # Find the <a> tag inside each div
    a_tag = div_tag.find('a')
    if a_tag:
        # Extract the actual URL from the 'href' attribute
        if 'href' in a_tag.attrs:
            href = a_tag['href']
            # Extract the actual URL from the Google redirect URL
            if href.startswith('/url?q='):
                url = href.split('/url?q=')[1].split('&sa=')[0]
                links.append(url)

# Now 'links' contains all the extracted URLs in list format
print("List of extracted links:")
print(links)


List of extracted links:
['https://www.thehindu.com/news/international/g7-summit-2024-highlights-of-pm-modis-meetings-with-world-leaders-watch-video/article68292605.ece', 'https://www.livemint.com/politics/news/thank-you-prime-minister-sharad-pawar-credits-pm-modi-for-lok-sabha-election-2024-outcome-in-maharashtra-heres-why-11718443807338.html', 'https://indianexpress.com/article/opinion/columns/inside-track-modified-modi-3-0-9395030/', 'https://timesofindia.indiatimes.com/india/palestine-pm-writes-to-pm-modi-seeks-india-help-for-gaza-truce/articleshow/111026952.cms', 'https://www.india.com/business/pm-modi-to-flag-off-chennai-nagercoil-bengaluru-madurai-vande-bharat-express-check-route-timetable-7018009/', 'https://m.economictimes.com/news/india/modi-seizes-center-stage-at-g-7-to-ambush-biden-trudeau/articleshow/111030821.cms', 'https://www.ndtv.com/india-news/asked-about-inviting-pm-modi-to-g7-next-year-justin-trudeau-said-this-5901546', 'https://www.wionews.com/entertainment/kangana

In [253]:
a="elon musk"
a = "elon musk"
a = a.replace(" ", "+")

In [254]:
a

'elon+musk'

In [257]:
def scraping_article(url):
    headers = {
    'User-Agent': 'Your User Agent String',
    }
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'html.parser')
    paragraphs=soup.find_all('p')
    text= [paragraph.text for paragraph in paragraphs]
    words=' '.join(text).split(' ')
    article = ' '.join(words)
    return paragraphs

In [258]:
scraping_article("https://www.dnaindia.com/business/report-mukesh-ambani-beats-elon-musk-jeff-bezos-gets-approval-to-launch-satellite-internet-in-india-via-jio-3093364")

[<p><a href="/education/report-meet-man-who-once-worked-at-construction-sites-had-no-money-for-food-then-took-oath-to-clear-mppsc-exam-now-3093613">Meet man who once worked at construction sites, had no money for food, then took oath to clear MPPSC exam, now..</a></p>,
 <p><a href="/india/report-elon-musk-vs-ex-union-minister-rajeev-chandrasekhar-as-evm-debate-heats-up-rahul-gandhi-jumps-in-as-well-3093617">Elon Musk vs ex-Union Minister Rajeev Chandrasekhar as EVM debate heats up, Rahul Gandhi jumps in as well</a></p>,
 <p><a href="/bollywood/report-anushka-sharma-wishes-happy-father-s-day-to-virat-kohli-on-vamika-akaay-behalf-3093615">Anushka Sharma wishes Happy Father's Day to Virat Kohli on Vamika, Akaay's behalf: 'How can one person be...'</a></p>,
 <p><a href="/health/report-rare-flesh-eating-bacteria-that-can-kill-in-48-hours-spreads-in-japan-all-you-need-to-know-3093614">Rare flesh-eating bacteria that can kill in 48 hours spreads in Japan: All you need to know</a></p>,
 <p><a 