# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [74]:
import feedparser
import requests 
import xmltodict
import re
import time


### 1. Use feedparser to parse the following RSS feed URL.

In [75]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [76]:
resp = requests.get(url)
data = feedparser.parse(resp.content)
data

{'feed': {'title': 'Radar',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': 'Radar'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://www.oreilly.com/radar'},
   {'rel': 'self',
    'type': 'application/rss+xml',
    'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
   {'rel': 'hub',
    'href': 'http://pubsubhubbub.appspot.com/',
    'type': 'text/html'}],
  'link': 'https://www.oreilly.com/radar',
  'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
  'subtitle_detail': {'type': 'text/html',
   'language': None,
   'base': '',
   'value': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'},
  'updated': 'Wed, 19 Aug 2020 11:44:07 +0000',
  'updated_parsed': time.struct_time(tm_year=2020, tm_mon=8, tm_mday=19, tm_hour=11, tm_min=44, tm_sec=7, tm_wday=2, tm_yday=232, tm_isdst=0),
  'language': '

### 2. Obtain a list of components (keys) that are available for this feed.

In [77]:
data.keys()

dict_keys(['feed', 'entries', 'bozo', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [78]:
data['feed'].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [79]:
title = data['feed']['title']
subtitle =data['feed']['subtitle']

link = data['feed']['link']
author = [i['author'] for i in data['entries'] ]

print(title)
print(subtitle)
print(link)
author


Radar
Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
https://www.oreilly.com/radar


['Nat Torkington',
 'Matthew Rocklin and Hugo Bowne-Anderson',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Justin Norman, Peter Skomoroch and Mike Loukides',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides and Steve Swoyer',
 'Nat Torkington',
 'Sarah Gold',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Mike Loukides',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Nat Torkington',
 'Hugo Bowne-Anderson',
 'Nat Torkington',


### 5. Count the number of entries that are contained in this RSS feed.

In [80]:
print(len(data['entries']))
data['entries']

60


[{'title': 'Four Short Links: 19 August 2020',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': 'Four Short Links: 19 August 2020'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'http://feedproxy.google.com/~r/oreilly/radar/atom/~3/-JJuYwrweOg/'}],
  'link': 'http://feedproxy.google.com/~r/oreilly/radar/atom/~3/-JJuYwrweOg/',
  'comments': 'https://www.oreilly.com/radar/four-short-links-19-august-2020/#respond',
  'published': 'Wed, 19 Aug 2020 11:44:06 +0000',
  'published_parsed': time.struct_time(tm_year=2020, tm_mon=8, tm_mday=19, tm_hour=11, tm_min=44, tm_sec=6, tm_wday=2, tm_yday=232, tm_isdst=0),
  'authors': [{'name': 'Nat Torkington'}],
  'author': 'Nat Torkington',
  'author_detail': {'name': 'Nat Torkington'},
  'tags': [{'term': 'Four Short Links', 'scheme': None, 'label': None},
   {'term': 'Signals', 'scheme': None, 'label': None}],
  'id': 'https://www.oreilly.com/radar/?p=13228',
  'guidislink': False,
  '

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [81]:
lista = []
for i in data['entries']:
    lista.append(i.keys())
lista

[dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink']),
 dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink']),
 dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink']),
 dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content

In [82]:
lista[0]

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [83]:
list_titles = []
for i in data['entries']:
    list_titles.append(i['title'])
    
list_titles   

['Four Short Links: 19 August 2020',
 'Why Best-of-Breed is a Better Choice than All-in-One Platforms for Data Science',
 'Four short links: 14 August 2020',
 'The Least Liked Programming Languages',
 'Four short links: 11 Aug 2020',
 'Four short links: 7 Aug 2020',
 'Four short links: 5 August 2020',
 'Radar trends to watch: August 2020',
 'Four short links: 31 July 2020',
 'Four short links: 30 July 2020',
 'Four short links: 29 July 2020',
 'Bringing an AI Product to Market',
 'Power, Harms, and Data',
 'Four short links: 27 July 2020',
 'Four short links: 24 July 2020',
 'Four short links: 26 July 2020',
 'Four short links: 22 July 2020',
 'AI, Protests, and Justice',
 'Four short links: 21 July 2020',
 'Four short links: 20 July 2020',
 'Four short links: 17 July 2020',
 'Four short links: 16 July 2020',
 'Microservices Adoption in 2020',
 'Four short links: 15 July 2020',
 'Society-Centered Design',
 'Four short links: 14 July 2020',
 'Four short links: 13 July 2020',
 'Four shor

### 8. Calculate the percentage of "Four short links" entry titles.

In [84]:
four_short = [i for i in list_titles if i.startswith('Four short links')]


porcentaje = len(four_short)/len(list_titles)
print(porcentaje*100, '%')

73.33333333333333 %


### 9. Create a Pandas data frame from the feed's entries.

In [85]:
import pandas as pd

In [86]:
df = pd.DataFrame(data['entries'])
df

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four Short Links: 19 August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 19 Aug 2020 11:44:06 +0000","(2020, 8, 19, 11, 44, 6, 2, 232, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13228,False,The Design Space of Computational Notebooks &#...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Why Best-of-Breed is a Better Choice than All-...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/why-best-of-bree...,"Tue, 18 Aug 2020 11:30:42 +0000","(2020, 8, 18, 11, 30, 42, 1, 231, 0)",[{'name': 'Matthew Rocklin and Hugo Bowne-Ande...,Matthew Rocklin and Hugo Bowne-Anderson,{'name': 'Matthew Rocklin and Hugo Bowne-Ander...,"[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=13220,False,So you need to redesign your company’s data in...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/why-best-of-bree...,0,https://www.oreilly.com/radar/why-best-of-bree...
2,Four short links: 14 August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 14 Aug 2020 11:38:56 +0000","(2020, 8, 14, 11, 38, 56, 4, 227, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13217,False,Sinter &#8212; Sinter uses the user-mode Endpo...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
3,The Least Liked Programming Languages,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/the-least-liked-...,"Tue, 11 Aug 2020 11:46:42 +0000","(2020, 8, 11, 11, 46, 42, 1, 224, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Software Engineering', 'scheme': No...",https://www.oreilly.com/radar/?p=13202,False,StackOverflow&#8217;s 2020 developer survey in...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/the-least-liked-...,0,https://www.oreilly.com/radar/the-least-liked-...
4,Four short links: 11 Aug 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 11 Aug 2020 11:26:22 +0000","(2020, 8, 11, 11, 26, 22, 1, 224, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13211,False,"ImmuDB &#8212; lightweight, high-speed immutab...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
5,Four short links: 7 Aug 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 07 Aug 2020 13:13:40 +0000","(2020, 8, 7, 13, 13, 40, 4, 220, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13199,False,Surprising Economics of Load-Balanced Systems ...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
6,Four short links: 5 August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 05 Aug 2020 11:21:38 +0000","(2020, 8, 5, 11, 21, 38, 2, 218, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13196,False,Tales of the Autistic Developer &#8211; Senior...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
7,Radar trends to watch: August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/radar-trends-to-...,"Mon, 03 Aug 2020 11:33:02 +0000","(2020, 8, 3, 11, 33, 2, 0, 216, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Radar Trends', 'scheme': None, 'lab...",https://www.oreilly.com/radar/?p=13193,False,"I thought July was going to be a dull month, b...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/radar-trends-to-...,0,https://www.oreilly.com/radar/radar-trends-to-...
8,Four short links: 31 July 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 31 Jul 2020 11:33:09 +0000","(2020, 7, 31, 11, 33, 9, 4, 213, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13188,False,Migrating a 40TB SQL Server Database &#8212; A...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
9,Four short links: 30 July 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Thu, 30 Jul 2020 11:19:09 +0000","(2020, 7, 30, 11, 19, 9, 3, 212, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13185,False,Turning the IDE Inside Out with Datalog &#8212...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [87]:
df['author'].value_counts()

Nat Torkington                                      45
Mike Loukides                                        9
Sarah Gold                                           1
Mike Loukides and Steve Swoyer                       1
Adam Jacob, Nat Torkington and Mike Loukides         1
Matthew Rocklin and Hugo Bowne-Anderson              1
Hugo Bowne-Anderson                                  1
Justin Norman, Peter Skomoroch and Mike Loukides     1
Name: author, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [88]:
df['lenght'] = df['title'].apply(lambda x: len(x))
 

In [89]:
dfnew = df[['title', 'author', 'lenght']].sort_values('lenght', ascending = False)
dfnew

Unnamed: 0,title,author,lenght
1,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
28,Automated Coding and the Future of Programming,Mike Loukides,46
53,Machine Learning and the Production Gap,Mike Loukides,39
3,The Least Liked Programming Languages,Mike Loukides,37
47,Decision-Making in a Time of Crisis,Hugo Bowne-Anderson,35
7,Radar trends to watch: August 2020,Mike Loukides,34
0,Four Short Links: 19 August 2020,Nat Torkington,32
11,Bringing an AI Product to Market,"Justin Norman, Peter Skomoroch and Mike Loukides",32
2,Four short links: 14 August 2020,Nat Torkington,32
35,Radar trends to watch: July 2020,Mike Loukides,32


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [90]:
listamachin = [i for i in df['title'] if re.search('Machine', i) or re.search('Learning',i) or re.search('machine',i) or re.search('learning',i)]
listamachin

['Machine Learning and the Production Gap']