In [64]:
from bs4 import BeautifulSoup
import requests
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from urllib.parse import urljoin
from requests_html import HTMLSession

In [65]:
# Load the pretrained Pegasus checkpoint and its matching tokenizer once;
# both objects are reused as module-level globals by get_paraphrased_content().
# NOTE(review): 'google/pegasus-xsum' looks like a summarization checkpoint while the
# notebook uses it for "paraphrasing" — confirm this is the intended model.
model_name='google/pegasus-xsum'
tokenizer= PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# Build the absolute URL of the category page and fetch/parse it once.
base_url='https://techcrunch.com'
relative_url='/category/artificial-intelligence'
url=urljoin(base_url, relative_url)
# requests has no default timeout, so without one a stalled connection would
# block the kernel indefinitely.
page= requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()
soup=BeautifulSoup(page.content, 'html.parser')

In [73]:
def get_headlines(base_url, relative_url, timeout=30):
    """Fetch a listing page and return its headlines with their article URLs.

    Args:
        base_url: Site root, e.g. ``'https://techcrunch.com'``.
        relative_url: Path of the listing page, joined onto ``base_url``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(headline_text, article_url)`` tuples (possibly empty), or
        ``None`` if the page could not be fetched or parsed. Errors are printed,
        not raised.
    """
    # Fallback value so the error message below is always well-defined, even if
    # urljoin() itself is what raises.
    absolute_url = relative_url
    try:
        absolute_url = urljoin(base_url, relative_url)
        # The context manager closes the session (and its pooled connections)
        # even when the request or the parsing fails.
        with HTMLSession() as session:
            response = session.get(absolute_url, timeout=timeout)
            response.raise_for_status()
            # Anchors that carry both the headline text and the article link
            anchors = response.html.find('h2.post-block__title a')
            headlines = []
            for anchor in anchors:
                # absolute_links is a set; an anchor without an href yields an
                # empty set, which is skipped rather than aborting the whole
                # listing with a KeyError from pop().
                link = next(iter(anchor.absolute_links), None)
                if link is None:
                    continue
                headlines.append((anchor.text, link))
            return headlines
    except Exception as e:
        print(f"Error fetching headlines from URL '{absolute_url}': {e}")
        return None

def get_article_content(article_url, timeout=30):
    """Download an article page and return the text of its main content block.

    Args:
        article_url: Absolute URL of the article.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first ``div.article-content`` element, or ``None`` when
        the element is missing or the request fails. Problems are printed, not
        raised.
    """
    try:
        # The context manager releases the session's connections on every path.
        with HTMLSession() as session:
            response = session.get(article_url, timeout=timeout)
            response.raise_for_status()
            content_element = response.html.find('div.article-content', first=True)
            if content_element:
                return content_element.text
            print(f"Content element not found in URL '{article_url}'")
            return None
    except Exception as e:
        print(f"Error fetching content from URL '{article_url}': {e}")
        return None

def _split_into_chunks(text, max_chars=512):
    """Split ``text`` into pieces of at most ``max_chars`` characters, breaking on whitespace."""
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    chunks = []
    current = ""
    for word in text.split():
        # A single word longer than the limit cannot be kept whole; hard-split it.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def get_paraphrased_content(original_content, chunk_size=512):
    """Rewrite ``original_content`` paragraph by paragraph with the loaded Pegasus model.

    The text is split into paragraphs on blank lines. Each paragraph is cut
    into chunks of at most ``chunk_size`` *characters* (on word boundaries, so
    no word is cut in half between two chunks), every chunk is passed through
    the module-level ``tokenizer``/``model``, and the generated pieces are
    re-joined.

    Args:
        original_content: Article text to rewrite.
        chunk_size: Maximum number of characters per chunk sent to the model.

    Returns:
        The rewritten text; ``original_content`` unchanged if nothing was
        generated; ``None`` if an error occurred (the error is printed).
    """
    try:
        print("Original Content Length:", len(original_content))
        paraphrased_paragraphs = []

        # Paragraphs are assumed to be separated by blank lines.
        for paragraph in original_content.split('\n\n'):
            paraphrased_chunks = []
            for chunk in _split_into_chunks(paragraph, chunk_size):
                # Token-level truncation still applies as a safety net in case a
                # character-bounded chunk tokenizes to more than 512 tokens.
                inputs = tokenizer([chunk], return_tensors='pt', max_length=512, truncation=True)
                summary_ids = model.generate(
                    inputs['input_ids'],
                    # Pass the mask explicitly so generation does not have to infer it.
                    attention_mask=inputs['attention_mask'],
                    num_beams=4,
                    min_length=30,
                    max_length=512,
                    temperature=0.7,
                    do_sample=True,  # sampling makes the output non-deterministic
                    early_stopping=True,
                )
                paraphrased_chunks.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

            # Whitespace-only paragraphs produce no chunks; skip them so the
            # result does not accumulate empty paragraphs.
            if paraphrased_chunks:
                paraphrased_paragraphs.append(' '.join(paraphrased_chunks))

        paraphrased_content = '\n\n'.join(paraphrased_paragraphs)

        # Fall back to the untouched input when the model produced nothing.
        if paraphrased_content.strip():
            return paraphrased_content
        print("Paraphrased content is empty")
        return original_content
    except Exception as e:
        print(f"Error fetching or paraphrasing content: {e}")
        return None

In [70]:
# Fetch the listing and print every headline next to its zero-based position
headlines = get_headlines(base_url, relative_url)
# A failed fetch (None) or an empty listing simply prints nothing.
for position, entry in enumerate(headlines or ()):
    print(f"{position}. {entry[0]}")

0. ‘Embarrassing and wrong’: Google admits it lost control of image-generating AI
1. Treating a chatbot nicely might boost its performance — here’s why
2. Humane pushes Ai Pin ship date to mid-April
3. Arc browser’s new AI-powered ‘pinch-to-summarize’ feature is clever, but often misses the mark
4. Mutale Nkonde’s nonprofit is working to make AI less biased
5. Armenia’s 10web brings AI website-building to WordPress
6. Reddit says it’s made $203M so far licensing its data
7. Stable Diffusion 3 arrives to solidify early lead in AI imagery against Sora and Gemini
8. Chrome gets a built-in AI writing tool powered by Gemini
9. Women in AI: Krystal Kauffman, research fellow at the Distributed AI Research Institute
10. The women in AI making a difference
11. DatologyAI is building tech to automatically curate AI training datasets
12. Google pauses AI tool Gemini’s ability to generate images of people after historical inaccuracies
13. Antler’s founder on its vertical AI bet in Southeast Asia
1

In [74]:
# Zero-based position of the article to process, as printed by the headline listing.
index=13
if not headlines:
    # get_headlines() returns None on failure; len(None) would raise a TypeError.
    print("No headlines available")
elif 0 <= index < len(headlines):
    title, article_url = headlines[index]
    # The article text comes from get_article_content(); `original_content` is
    # only the name of get_paraphrased_content's parameter, not a function, so
    # calling it raises NameError on a fresh kernel.
    content = get_article_content(article_url)
    if content:
        print(f"\n Title: {title}")
        print(f"\n Content: {content}...")
        # Paraphrase the entire content
        paraphrased_content = get_paraphrased_content(content)
        if paraphrased_content:
            print("\nParaphrased Content:")
            print(paraphrased_content)
        else:
            print("Paraphrased content not available")
    else:
        print("Content not available")
else:
    print("Invalid index")


 Title: Antler’s founder on its vertical AI bet in Southeast Asia

 Content: A growing roster of vertical AI startups is emerging in Southeast Asia to serve sectors ranging from seafood to finance. Singapore-based venture capital firm Antler recently made a bet on 37 of them, investing $5.1 million in total for pre-seed deals. Antler also announced a partnership with Khazanah, Malaysia’s sovereign wealth fund.
“If you look at the rest of the world, there’s lots of horizontal AI and it’s becoming insanely competitive,” Antler co-founder and managing partner Jussi Salovaara tells TechCrunch. “What founders are increasingly looking to solve in this part of the world are practical problems in different industries.”
He adds that even though Southeast Asia doesn’t have the talent pool to build something like OpenAI yet, they can take a customer-first approach to AI apps, solving pain points unique to different sectors and markets.
Within verticalized AI, different trends are emerging in eac