In [3]:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def clean_text(text):
    """Normalize a piece of text into lowercase, punctuation-free words.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is neither a word character (``\\w``:
         letters, digits, underscore) nor whitespace;
      3. collapse each run of whitespace into a single space;
      4. strip leading and trailing whitespace.

    Punctuation is removed *before* whitespace is collapsed. Doing it the other
    way round leaves double spaces behind whenever a punctuation token stands
    on its own between two spaces (e.g. ``"a - b"``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; words are separated by exactly one space.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation first ...
    text = re.sub(r'\s+', ' ', text)  # ... so the gaps it leaves are collapsed here
    return text.strip()

def extract_data(url='https://www.bbc.com', timeout=10):
    """Fetch a page and extract one record per headline card.

    A "card" is every ``<div data-testid="card-text-wrapper">`` in the page.
    For each card the function reads the ``<h2 data-testid="card-headline">``,
    the ``<p data-testid="card-description">`` and the nearest
    ``<a data-testid="internal-link">``.

    Args:
        url: Page to fetch; also the base against which relative links are
            resolved. Defaults to the BBC home page.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        A list of dicts with the keys ``'title'``, ``'description'`` and
        ``'link'``. Title and description are passed through ``clean_text``;
        a missing element yields the corresponding placeholder string.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping an error page and returning no cards.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    cards = soup.find_all('div', {'data-testid': 'card-text-wrapper'})
    link_attrs = {'data-testid': 'internal-link'}

    data = []
    for card in cards:
        headline = card.find('h2', {'data-testid': 'card-headline'})
        summary = card.find('p', {'data-testid': 'card-description'})

        # NOTE(review): the recorded run reported 'No link available' for every
        # card, so searching *below* card.parent never matched. Presumably the
        # anchor wraps the card, hence the ancestor lookup first -- confirm
        # against the live markup. The original descendant search is kept as a
        # fallback.
        link_tag = card.find_parent('a', link_attrs)
        if link_tag is None and card.parent is not None:
            link_tag = card.parent.find('a', link_attrs)
        href = link_tag.get('href') if link_tag else None

        data.append({
            'title': clean_text(headline.text) if headline else 'No title found',
            'description': (
                clean_text(summary.text) if summary else 'No description available'
            ),
            # urljoin resolves relative hrefs against `url` and leaves absolute
            # hrefs untouched (plain concatenation would corrupt them).
            'link': urljoin(url, href) if href else 'No link available',
        })

    return data

# Run the scraper once; `extracted_data` is reused by the JSON export cell below.
extracted_data = extract_data()
# Print one record (dict) per line.
for item in extracted_data:
    print(item)


{'title': 'ohtanis exinterpreter to plead guilty to fraud', 'description': 'ippei mizuhara agrees to plead guilty to stealing from japanese baseball star shohei ohtani in the us', 'link': 'No link available'}
{'title': 'ohtanis exinterpreter to plead guilty to fraud', 'description': 'ippei mizuhara agrees to plead guilty to stealing from japanese baseball star shohei ohtani in the us', 'link': 'No link available'}
{'title': 'us to halt some arms supplies if israel invades rafah', 'description': 'president biden warns he will stop supplying some weapons if israel launches a ground operation', 'link': 'No link available'}
{'title': 'marjorie taylor greene fails to remove house speaker', 'description': 'the house killed the farright lawmakers motion to remove republican mike johnson over support for ukraine aid', 'link': 'No link available'}
{'title': 'laken riley venezuelan man charged with murder', 'description': 'georgia files 10 charges against undocumented migrant jose ibarra over de

In [4]:
import json
def save_data_to_json(data, filename):
    """Serialize ``data`` as pretty-printed JSON into ``filename``.

    The file is (over)written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes. A
    confirmation line is printed once the file has been closed.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to write.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, fp=out_file, indent=4, ensure_ascii=False)

    print(f"Data successfully saved to {filename}")

# Example usage: write the records held in `extracted_data` to
# 'extracted_data.json' (a relative path, so it lands in the working directory).
save_data_to_json(extracted_data, 'extracted_data.json')

Data successfully saved to extracted_data.json
