In [1]:
import textwrap
from urllib.parse import urljoin

import requests
import torch
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
# Load the pretrained Pegasus checkpoint (XSum variant) together with its tokenizer.
model_name = 'google/pegasus-xsum'
# Model and tokenizer are fetched from the same checkpoint name so they stay in sync.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Listing page to scrape; base_url and relative_url are reused by the headline cell below.
base_url = 'https://techcrunch.com'
relative_url = '/category/artificial-intelligence'
url = urljoin(base_url, relative_url)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as the listing.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [12]:
def get_headlines(base_url, relative_url, timeout=30):
    """Scrape headline texts and article links from a listing page.

    Args:
        base_url: Site root that ``relative_url`` is resolved against.
        relative_url: Path of the listing page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(headline_text, article_url)`` tuples, one per
        ``h2.post-block__title a`` anchor that carries a link, or ``None`` when
        the request or parsing raised (the error is reported via ``print``).
    """
    # Fallback so the error message below stays printable even if urljoin itself fails.
    absolute_url = relative_url
    try:
        absolute_url = urljoin(base_url, relative_url)
        # The context manager closes the session and its connections on every exit path.
        with HTMLSession() as session:
            response = session.get(absolute_url, timeout=timeout)
            response.raise_for_status()
            # Anchors inside the headline elements hold both the title text and the URL.
            anchors = response.html.find('h2.post-block__title a')
            headlines = []
            for anchor in anchors:
                # An anchor without an href has no links; skip it instead of
                # letting one bad element discard every other headline.
                link = next(iter(anchor.absolute_links), None)
                if link is None:
                    continue
                headlines.append((anchor.text, link))
            return headlines
    except Exception as e:
        print(f"Error fetching headlines from URL '{absolute_url}': {e}")
        return None

def get_article_content(article_url, timeout=30):
    """Download an article page and return the text of its content container.

    Args:
        article_url: Absolute URL of the article.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first ``div.article-content`` element, or ``None`` when
        that element is missing or the request raised (both cases are reported
        via ``print``).
    """
    try:
        # The context manager closes the session and its connections on every exit path.
        with HTMLSession() as session:
            response = session.get(article_url, timeout=timeout)
            response.raise_for_status()
            content_element = response.html.find('div.article-content', first=True)
            if content_element:
                return content_element.text
            print(f"Content element not found in URL '{article_url}'")
            return None
    except Exception as e:
        print(f"Error fetching content from URL '{article_url}': {e}")
        return None

def get_paraphrased_content(original_content, chunk_chars=512):
    """Rewrite a text paragraph by paragraph with the notebook's Pegasus model.

    The text is split into paragraphs on line breaks. Each paragraph is cut into
    chunks of at most ``chunk_chars`` characters on word boundaries, every chunk
    is run through the notebook-level ``tokenizer`` and ``model``, and the
    outputs are stitched back together (chunks joined by a space, paragraphs by
    a blank line).

    Args:
        original_content: Text to paraphrase.
        chunk_chars: Maximum number of characters per chunk sent to the model.

    Returns:
        The paraphrased text; ``original_content`` itself when nothing was
        generated; ``None`` if any step raised (the error is reported via
        ``print``).
    """
    try:
        print("Original Content Length:", len(original_content))
        # Scraped text may separate paragraphs with a single newline rather than
        # a blank line, so split on every line break and drop blank lines.
        paragraphs = [line for line in original_content.splitlines() if line.strip()]
        paraphrased_paragraphs = []

        for paragraph in paragraphs:
            # Chunk by characters but only break at whitespace, so the model never
            # receives a word cut in half; the tokenizer still truncates to 512 tokens.
            chunks = textwrap.wrap(paragraph, width=chunk_chars, break_on_hyphens=False)
            paraphrased_chunks = []

            for chunk in chunks:
                inputs = tokenizer([chunk], return_tensors='pt', max_length=512, truncation=True)
                # Unpacking forwards everything the tokenizer produced (input ids
                # plus any attention mask) instead of the input ids alone.
                summary_ids = model.generate(
                    **inputs,
                    num_beams=4,
                    min_length=30,
                    max_length=512,
                    temperature=0.7,
                    do_sample=True,
                    early_stopping=True,
                )
                paraphrased_chunks.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

            paraphrased_paragraphs.append(' '.join(paraphrased_chunks))

        paraphrased_content = '\n\n'.join(paraphrased_paragraphs)

        # Fall back to the input when the model produced nothing usable.
        if paraphrased_content.strip():
            return paraphrased_content
        print("Paraphrased content is empty")
        return original_content
    except Exception as e:
        print(f"Error fetching or paraphrasing content: {e}")
        return None

In [5]:
# Print each scraped headline next to its zero-based position
headlines = get_headlines(base_url, relative_url)
if headlines:
    for position, (title_text, _link) in enumerate(headlines):
        print(f"{position}. {title_text}")

0. Microsoft’s Windows 11 Copilot gets smarter with new plugins and skills
1. With Brain.ai, generative AI is the OS
2. Former Twitter engineers are building Particle, an AI-powered news reader, backed by $4.4M
3. Brave’s Leo AI assistant is now available to Android users
4. Google brings Stack Overflow’s knowledge base to Gemini for Google Cloud
5. Venus Williams brings her interior design skills to Palazzo, a new generative AI-powered platform
6. Mark Zuckerberg woos Big Tech in Asia to double down on AI chips
7. Gemini on Android can’t ID songs, and it’s frustrating
8. Tim Cook says Apple will ‘break new ground’ in GenAI this year
9. Morph Studio lets you make films using Stability AI–generated clips
10. Anamorph’s generative technology reorders scenes to create unlimited versions of one film
11. Adobe reveals a GenAI tool for music
12. Microsoft invests in yet another AI company
13. StarCoder 2 is a code-generating AI that runs on most GPUs
14. SambaNova now offers a bundle of gene

In [18]:
# Original content, shown for comparison with the paraphrased version
index = 1
# Reset first so an invalid index or failed fetch cannot leave a stale (or
# undefined) `content` behind for the cells that follow.
content = None
# `headlines` is None when scraping failed; guard before calling len() on it.
if headlines and 0 <= index < len(headlines):
    title, article_url = headlines[index]
    content = get_article_content(article_url)
    if content:
        print(f"\n Title: {title}")
        print(f"\n Content: {content}...")
    else:
        print("Content not available")
else:
    print("Invalid index")



 Title: With Brain.ai, generative AI is the OS

 Content: The Humane Ai Pin and Rabbit handheld have captured a good bit of press interest for their individual approaches to integrating generative AI with hardware. Humane, in particular, is presenting its wearable as a look at life beyond the smartphone. That naturally prompts the question: What, precisely, is wrong with the smartphone? While it’s true that the form factor has plateaued, these devices are still out in the world, in billions of hands.
Earlier this week, I met with Jerry Yue amid the cacophonous din of Deutsch Telekom’s Mobile World Congress booth. After a product demo and a sit-down conversation, I admit that I’m impressed with the Brain.ai (alternately known as Brain Technologies) founder and CEO’s vision for the future of smartphones. I won’t go so far as saying I’m fully convinced until I’ve had an opportunity to spend more time with the product, but it absolutely paints a compelling picture of how generative AI mig

In [19]:
# Run the fetched article through the paraphraser and show the result
if content:
    paraphrased_content = get_paraphrased_content(content)
    if not paraphrased_content:
        print("Paraphrased content not available")
    else:
        print("\nParaphrased Content:")
        print(paraphrased_content)


Original Content Length: 9622


KeyboardInterrupt: 