# Web scrapping BBC RSS fedd data

In [1]:
#import library
import requests
from bs4 import BeautifulSoup

#enter URL
url = "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"

resp = requests.get(url)

soup = BeautifulSoup(resp.content, features="xml")

In [2]:
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">
 <channel>
  <title>
   BBC News - US &amp; Canada
  </title>
  <description>
   BBC News - US &amp; Canada
  </description>
  <link>
   https://www.bbc.co.uk/news/
  </link>
  <image>
   <url>
    https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif
   </url>
   <title>
    BBC News - US &amp; Canada
   </title>
   <link>
    https://www.bbc.co.uk/news/
   </link>
  </image>
  <generator>
   RSS for Node
  </generator>
  <lastBuildDate>
   Sun, 08 Mar 2020 00:31:02 GMT
  </lastBuildDate>
  <copyright>
   Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/2/hi/help/rss/4498287.stm for terms and conditions of reuse.
  </

In [3]:
items = soup.findAll('item')

In [4]:
print(items)

[<item>
<title>Mick Mulvaney: Trump replaces White House chief of staff</title>
<description>Trump ally Mark Meadows takes over the job, as Mr Mulvaney is sent to Northern Ireland as an envoy.</description>
<link>https://www.bbc.co.uk/news/world-us-canada-51779902</link>
<guid isPermaLink="true">https://www.bbc.co.uk/news/world-us-canada-51779902</guid>
<pubDate>Sat, 07 Mar 2020 13:34:07 GMT</pubDate>
</item>, <item>
<title>South by Southwest festival cancelled over coronavirus</title>
<description>The world-famous music event in Austin, Texas is called off for the first time in its 34-year history.</description>
<link>https://www.bbc.co.uk/news/world-us-canada-51778423</link>
<guid isPermaLink="true">https://www.bbc.co.uk/news/world-us-canada-51778423</guid>
<pubDate>Sat, 07 Mar 2020 02:41:35 GMT</pubDate>
</item>, <item>
<title>Woody Allen book pulped after walkout at publisher</title>
<description>Hachette staff protested over the book deal for the director, who has been dogged by a

In [5]:
len(items)

36

In [6]:
item = items[1]

In [7]:
item

<item>
<title>South by Southwest festival cancelled over coronavirus</title>
<description>The world-famous music event in Austin, Texas is called off for the first time in its 34-year history.</description>
<link>https://www.bbc.co.uk/news/world-us-canada-51778423</link>
<guid isPermaLink="true">https://www.bbc.co.uk/news/world-us-canada-51778423</guid>
<pubDate>Sat, 07 Mar 2020 02:41:35 GMT</pubDate>
</item>

In [8]:
item.title

<title>South by Southwest festival cancelled over coronavirus</title>

In [9]:
item.title.text

'South by Southwest festival cancelled over coronavirus'

In [10]:
item.pubdate

In [11]:
#declare empty var to append data
news_items = []

In [12]:
#scarring HTML tags such as Title, Description, Links and Publication date
for item in items:
    news_item = {}
    news_item['title'] = item.title.text
    news_item['description'] = item.description.text
    news_item['link'] = item.link.text
    news_item['pubDate'] = item.pubDate.text
    news_items.append(news_item)

In [13]:
print(news_items)

[{'title': 'Mick Mulvaney: Trump replaces White House chief of staff', 'description': 'Trump ally Mark Meadows takes over the job, as Mr Mulvaney is sent to Northern Ireland as an envoy.', 'link': 'https://www.bbc.co.uk/news/world-us-canada-51779902', 'pubDate': 'Sat, 07 Mar 2020 13:34:07 GMT'}, {'title': 'South by Southwest festival cancelled over coronavirus', 'description': 'The world-famous music event in Austin, Texas is called off for the first time in its 34-year history.', 'link': 'https://www.bbc.co.uk/news/world-us-canada-51778423', 'pubDate': 'Sat, 07 Mar 2020 02:41:35 GMT'}, {'title': 'Woody Allen book pulped after walkout at publisher', 'description': 'Hachette staff protested over the book deal for the director, who has been dogged by abuse claims.', 'link': 'https://www.bbc.co.uk/news/world-us-canada-51777650', 'pubDate': 'Sat, 07 Mar 2020 12:00:14 GMT'}, {'title': 'Toronto teen home safe after brazen abduction', 'description': "He is believed to be a victim of reprisal 

In [14]:
news_items[0]

{'title': 'Mick Mulvaney: Trump replaces White House chief of staff',
 'description': 'Trump ally Mark Meadows takes over the job, as Mr Mulvaney is sent to Northern Ireland as an envoy.',
 'link': 'https://www.bbc.co.uk/news/world-us-canada-51779902',
 'pubDate': 'Sat, 07 Mar 2020 13:34:07 GMT'}

In [15]:
#import pandas to create dataframe and CSV
import pandas as pd
df = pd.DataFrame(news_items,columns=['title','description','link','pubDate'])

In [16]:
df.head()

Unnamed: 0,title,description,link,pubDate
0,Mick Mulvaney: Trump replaces White House chie...,"Trump ally Mark Meadows takes over the job, as...",https://www.bbc.co.uk/news/world-us-canada-517...,"Sat, 07 Mar 2020 13:34:07 GMT"
1,South by Southwest festival cancelled over cor...,"The world-famous music event in Austin, Texas ...",https://www.bbc.co.uk/news/world-us-canada-517...,"Sat, 07 Mar 2020 02:41:35 GMT"
2,Woody Allen book pulped after walkout at publi...,Hachette staff protested over the book deal fo...,https://www.bbc.co.uk/news/world-us-canada-517...,"Sat, 07 Mar 2020 12:00:14 GMT"
3,Toronto teen home safe after brazen abduction,He is believed to be a victim of reprisal over...,https://www.bbc.co.uk/news/world-us-canada-517...,"Fri, 06 Mar 2020 16:40:18 GMT"
4,Bill Clinton claims Monica Lewinsky affair was...,The former president was impeached in 1998 for...,https://www.bbc.co.uk/news/world-us-canada-517...,"Fri, 06 Mar 2020 16:56:35 GMT"


In [17]:
df.to_csv('BBCdata1.csv',index=False, encoding = 'utf-8')