## Get the data from the Guardian RSS feed

- A simple explanation of RSS is available here: https://www.theguardian.com/help/feeds
- Let's try politics RSS: https://www.theguardian.com/politics/rss

In [None]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from lxml import etree

In [None]:
## Download the politics RSS feed and keep the raw XML bytes.
## The `with` block closes the HTTP connection as soon as the body has been
## read. Because `response` is a bytes object rather than a one-shot stream,
## the parsing cell below can be re-run without fetching the feed again.
url = 'https://www.theguardian.com/politics/rss'
with urlopen(url) as http_response:
    response = http_response.read()

## Method 1: Using BeautifulSoup

You can use BeautifulSoup to parse XML, using the same methods as you did for web scraping.

In [None]:
## Build the parse tree from the downloaded feed; 'xml' asks BeautifulSoup
## for an XML parser instead of an HTML one.
bs = BeautifulSoup(markup=response, features='xml')

In [None]:
## Collect every element matched by the CSS selector "item"
## (each RSS <item> is one article entry).
items = bs.select("item")

In [None]:
## Check how many <item> elements were selected
len(items)

In [None]:
## Display the raw XML of the first item
items[0]

In [None]:
## Print the first item again as indented (prettified) XML.
## Optional: only useful for reading the structure by eye.
print(items[0].prettify())

### Extract contents

In [None]:
## Attribute access on a tag returns its first descendant tag with that name,
## here the <title> element of the first item
items[0].title

In [None]:
## Text of the first <title> tag inside each item
title = [entry.find("title").get_text() for entry in items]

In [None]:
## Preview the first five titles
title[:5]

In [None]:
## Text of the first <description> tag inside each item
description = [entry.find("description").get_text() for entry in items]

In [None]:
## Text of the first <pubDate> tag inside each item (kept as a plain string)
pubDate = [entry.find("pubDate").get_text() for entry in items]

In [None]:
## Text of the first <link> tag inside each item
link = [entry.find("link").get_text() for entry in items]

In [None]:
## Combine the four parallel lists into one table, one row per feed item
df = pd.DataFrame(
    dict(
        title=title,
        description=description,
        pubDate=pubDate,
        link=link,
    )
)

In [None]:
## Preview the first rows of the table
df.head()

#### Save the data

In [None]:
## Remove HTML tags from the descriptions.
## regex=True must be passed explicitly: since pandas 2.0 `Series.str.replace`
## treats the pattern as a literal string by default, in which case "<.+?>"
## never matches and the tags are left in place.
df['description'] = df['description'].str.replace(r"<.+?>", "", regex=True)

In [None]:
## Write the table to a CSV file in the current working directory
df.to_csv(path_or_buf="df_guard_politics.csv")

## Method 2: Using an XML parser (lxml)

In [None]:
## Fetch the feed again and parse it with lxml.
## Parsing happens inside the `with` block so the HTTP connection is closed
## as soon as lxml has consumed the stream, instead of being left open.
url = 'https://www.theguardian.com/politics/rss'
with urlopen(url) as response:
    tree = etree.parse(response)


In [None]:
## Select the <item> elements under <channel>, starting from the root element.
## The path is an ElementPath expression (an XPath-like subset).
items = tree.getroot().findall('./channel/item')

In [None]:
## Check how many <item> elements were found
len(items)

In [None]:
## Print the first item as indented XML; the "unicode" encoding makes
## tostring return a str rather than bytes
print(etree.tostring(items[0], encoding="unicode", pretty_print=True))

In [None]:
## For a single item, find its <title> child and read the text with `.text`
items[0].find("./title").text

In [None]:
## Text of the <title> child of every item
title = [entry.find("./title").text for entry in items]

In [None]:
## Preview the first five titles
title[:5]

In [None]:
## Text of the <description> child of every item
description = [entry.find("./description").text for entry in items]

In [None]:
## Text of the <pubDate> child of every item (kept as a plain string)
pubDate = [entry.find("./pubDate").text for entry in items]

In [None]:
## Text of the <link> child of every item
link = [entry.find("./link").text for entry in items]

In [None]:
## Combine the four parallel lists into one table, one row per feed item
df = pd.DataFrame(
    dict(
        title=title,
        description=description,
        pubDate=pubDate,
        link=link,
    )
)