In [1]:
!pip install beautifulsoup4 requests

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import requests
from bs4 import BeautifulSoup as bs
import re
import csv

In [18]:
class BBCWebCrawler:
    """Download a BBC News article page and extract its main parts.

    Constructing an instance fetches ``theUrl``, parses the HTML and stores
    the results as attributes:

    * ``soup``        -- the parsed document
    * ``title``       -- text of the page ``<title>``
    * ``body``        -- the first ``<article>`` element of the page
    * ``images``      -- list of ``{"Name": alt, "Link": src}`` dicts
    * ``paragraphs``  -- list of paragraph texts
    * ``hlinks``      -- list of ``href`` values found in the article
    * ``subheadings`` -- list of ``<h2>`` texts
    * ``author``      -- dict with the byline details (see
      :meth:`get_author_details`)
    """

    def __init__(self, theUrl: str, timeout: float = 10.0):
        """Fetch and parse the article at ``theUrl``.

        Args:
            theUrl: Address of the article page.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            IndexError: If the page contains no ``<article>`` element.
        """
        # Without a timeout requests waits indefinitely on a silent server.
        article = requests.get(theUrl, timeout=timeout)
        # Stop on HTTP errors instead of scraping an error page.
        article.raise_for_status()
        self.soup = bs(article.content, "html.parser")
        self.title = self.get_title()
        articles = self.get_article()
        if not articles:
            raise IndexError("no <article> element found at {!r}".format(theUrl))
        self.body = articles[0]
        self.images = self.get_images()
        self.paragraphs = self.get_paragraphs()
        self.hlinks = self.get_hlinks()
        self.subheadings = self.get_subheadings()
        self.author = self.get_author_details()

    def get_article(self) -> list:
        """Return every ``<article>`` element of the parsed page."""
        return self.soup.find_all("article")

    def get_title(self) -> str:
        """Return the text of the ``<title>`` element in the page head."""
        return self.soup.html.head.title.text

    def get_subheadings(self) -> list:
        """Return the text of every ``<h2>`` inside the article body."""
        return [heading.text for heading in self.body.find_all("h2")]

    def get_images(self) -> list:
        """Return ``{"Name": alt, "Link": src}`` for the article's images.

        Images whose parent element's markup contains ``<header`` are skipped.
        """
        descriptionsAndPaths = []
        for im in self.body.find_all("img"):
            if "<header" in str(im.parent):
                continue
            descriptionsAndPaths.append(
                {"Name": im.get("alt"), "Link": im.get("src")}
            )
        return descriptionsAndPaths

    def get_paragraphs(self) -> list:
        """Return the text of the article's ``<p>`` elements.

        Paragraphs whose parent element's markup contains ``<header`` are
        skipped.
        """
        return [
            para.text
            for para in self.body.find_all("p")
            if "<header" not in str(para.parent)
        ]

    def get_hlinks(self) -> list:
        """Return the ``href`` of every link in the article body.

        Anchors without an ``href`` attribute are left out, so the result
        contains strings only.
        """
        hrefs = (anchor.get("href") for anchor in self.body.find_all("a"))
        return [href for href in hrefs if href is not None]

    def get_author_details(self) -> dict:
        """Return the byline details of the article.

        The byline is the first ``<p>`` whose class contains ``Contributor``.
        The returned dict always has the keys ``Author_Image``,
        ``Author_Name``, ``Author_link`` and ``Author_SocialMedia``; a value
        is ``None`` when the page does not provide it (for example when the
        article has no byline at all).
        """
        # TODO: Author_Occupation is not extracted yet.
        details = {
            "Author_Image": None,
            "Author_Name": None,
            "Author_link": None,
            "Author_SocialMedia": None,
        }
        bylines = self.soup.find_all('p', {'class': re.compile(r'.*Contributor.*')})
        if not bylines:
            return details
        theAuthor = bylines[0]

        pictures = theAuthor.find_all("img")
        if pictures:
            details["Author_Image"] = pictures[0].get("src")

        names = theAuthor.find_all("strong")
        if names:
            details["Author_Name"] = names[0].text

        # First link is used as the author's page, second as the social media link.
        thelinks = theAuthor.find_all("a")
        if len(thelinks) > 0:
            details["Author_link"] = thelinks[0].get("href")
        if len(thelinks) > 1:
            details["Author_SocialMedia"] = thelinks[1].get("href")
        return details

In [19]:
# Fetch and parse the example article; the cells below inspect the result.
article_url = "https://www.bbc.co.uk/news/health-60324928"
parsed = BBCWebCrawler(article_url)

In [20]:
# Show the author details dict produced by get_author_details().
parsed.author

{'Author_Image': 'https://ichef.bbci.co.uk/news/64/cpsprodpb/141F0/production/_112961428_nicktriggle.jpg',
 'Author_Name': 'Nick Triggle',
 'Author_link': '/news/correspondents/nicktriggle',
 'Author_SocialMedia': 'https://www.twitter.com/nicktriggle'}

In [21]:
# Show the text of the page's <title> element.
parsed.title

"Is ending the last Covid rule 'brave or stupid'? - BBC News"

In [22]:
# Show the text of the <h2> headings found inside the article.
parsed.subheadings

['The case for relaxing is not clear cut',
 'The impact could be marginal',
 'More on this story',
 'Related Topics']

In [23]:
# Show the images found in the article as {"Name": alt text, "Link": src} dicts.
parsed.images

[{'Name': '',
  'Link': 'https://ichef.bbci.co.uk/news/64/cpsprodpb/141F0/production/_112961428_nicktriggle.jpg'},
 {'Name': 'A woman looks out her window',
  'Link': 'https://ichef.bbci.co.uk/news/976/cpsprodpb/D688/production/_122602945_gettyimages-1215743266.jpg'},
 {'Name': 'Chart showing excess deaths',
  'Link': 'https://ichef.bbci.co.uk/news/2048/cpsprodpb/08A2/production/_123201220_optimised-uk_mortality_v_average__08feb-nc.png'},
 {'Name': 'Presentational white space',
  'Link': 'https://ichef.bbci.co.uk/news/624/cpsprodpb/63B0/production/_117302552_a6bc73e5-91d8-4252-b778-e93e82abc65b.png'},
 {'Name': 'Chart showing infections',
  'Link': 'https://ichef.bbci.co.uk/news/2048/cpsprodpb/13D3A/production/_123201218_optimised-ons_uk_timeseries_09feb-nc.png'},
 {'Name': 'Presentational white space',
  'Link': 'https://ichef.bbci.co.uk/news/624/cpsprodpb/63B0/production/_117302552_a6bc73e5-91d8-4252-b778-e93e82abc65b.png'},
 {'Name': "Banner image reading 'more about coronavirus'",


In [24]:
# Show the href values of the links found inside the article.
parsed.hlinks

['/news/correspondents/nicktriggle',
 'https://www.twitter.com/nicktriggle',
 '#comments',
 'https://www.bbc.co.uk/usingthebbc/terms/can-i-share-things-from-the-bbc',
 '/news/topics/cyz0z8w0ydwt',
 'https://www.bbc.co.uk/news/uk-60319947',
 'https://www.bbc.co.uk/news/explainers-54239922',
 'https://www.bbc.co.uk/news/explainers-52530518',
 'https://twitter.com/nicktriggle',
 'http://www.bbc.co.uk/nicktriggle',
 'https://www.bbc.co.uk/news/uk-51768274',
 'https://www.bbc.co.uk/news/world-asia-china-51176409',
 'https://www.bbc.co.uk/news/world-51235105',
 'https://www.bbc.co.uk/news/health-52354520',
 'https://www.bbc.co.uk/news/health-52003804',
 'mailto:haveyoursay@bbc.co.uk?subject=CovidRestrictions60324928',
 'http://twitter.com/BBC_HaveYourSay',
 'https://www.bbc.co.uk/send/u16904890',
 'http://www.bbc.co.uk/usingthebbc/terms/',
 'http://www.bbc.co.uk/usingthebbc/privacy-policy/',
 'https://www.bbc.co.uk/news/have_your_say',
 'mailto:Haveyoursay@bbc.co.uk',
 '/news/uk-51768274',
 

In [25]:
# Show the paragraph texts extracted from the article.
parsed.paragraphs

['The last remaining Covid restriction in England could be gone in weeks, after the prime minister announced the legal requirement to isolate after a positive test may be scrapped early.',
 'The plan had been to end it on 24 March - but if the trends remained positive, Boris Johnson said, it could go a month early.',
 'The move has taken many experts by surprise, with one describing it as either brave or stupid.',
 'As often with Covid, the evidence is unclear.',
 'Certainly questions are being asked about the politics of the move, with the prime minister under continuing pressure. ',
 'But there is plenty of positive news in the latest data - as the prime minister suggested. ',
 'Hospital cases are falling and, despite the large wave of Omicron infections, overall deaths have not risen above what would normally be seen in winter.',
 'With new treatments and a huge amount of immunity in the population, from both vaccination and infection, the risk from the virus is lessening all the ti

In [29]:
# Save the author details as a CSV file with a header row and one data row.
fn = 'authorDetails.csv'
# newline='' is what the csv module expects for files it writes to; the explicit
# UTF-8 encoding keeps non-ASCII text writable whatever the platform default is.
with open(fn, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, ['Author_Name', 'Author_Image', 'Author_SocialMedia', 'Author_link'])
    w.writeheader()
    w.writerow(parsed.author)

In [30]:
# Save the image descriptions and links as a CSV file, one row per image.
fn = 'Images.csv'
# newline='' is what the csv module expects for files it writes to; the explicit
# UTF-8 encoding keeps non-ASCII alt text writable whatever the platform default is.
with open(fn, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, ['Name', 'Link'])
    w.writeheader()
    w.writerows(parsed.images)

In [31]:
# Save the article text to a plain-text file, one paragraph per line.
fn = 'articleText.txt'
# newline='' writes '\n' unchanged on every platform; the explicit UTF-8 encoding
# keeps non-ASCII characters writable whatever the platform default is.
with open(fn, 'w', newline='', encoding='utf-8') as f:
    for p in parsed.paragraphs:
        f.write(p)
        f.write('\n')