# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [5]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [6]:
data = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [8]:
data.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [15]:
data["feed"].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [28]:
feed_title = data["feed"]["title"]
feed_subtitle = data["feed"]["subtitle"]
#feed_author = data["feed"]["author"] 
feed_link = data["feed"]["link"]

print("Titulo:",feed_title)
print("Subtítulo:",feed_subtitle)
print("Autor:","No autor") #El feed no tiene autor, cada una de las entries tiene su autor.
print("Link:",feed_link)


Titulo: Radar
Subtítulo: Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
Autor: No autor
Link: https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [34]:
entries = data["entries"]

len(entries)

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [53]:
data["entries"][0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [55]:
titles = []

for i in data["entries"]:
    titles.append(i["title"])
    
titles

['Four Short Links: 19 August 2020',
 'Why Best-of-Breed is a Better Choice than All-in-One Platforms for Data Science',
 'Four short links: 14 August 2020',
 'The Least Liked Programming Languages',
 'Four short links: 11 Aug 2020',
 'Four short links: 7 Aug 2020',
 'Four short links: 5 August 2020',
 'Radar trends to watch: August 2020',
 'Four short links: 31 July 2020',
 'Four short links: 30 July 2020',
 'Four short links: 29 July 2020',
 'Bringing an AI Product to Market',
 'Power, Harms, and Data',
 'Four short links: 27 July 2020',
 'Four short links: 24 July 2020',
 'Four short links: 26 July 2020',
 'Four short links: 22 July 2020',
 'AI, Protests, and Justice',
 'Four short links: 21 July 2020',
 'Four short links: 20 July 2020',
 'Four short links: 17 July 2020',
 'Four short links: 16 July 2020',
 'Microservices Adoption in 2020',
 'Four short links: 15 July 2020',
 'Society-Centered Design',
 'Four short links: 14 July 2020',
 'Four short links: 13 July 2020',
 'Four shor

### 8. Calculate the percentage of "Four short links" entry titles.

In [62]:
count = 0

for i in titles:
    if i.startswith("Four short links"):
        count += 1
        
perc = (count/len(titles))*100
print(f"El porcentaje es {perc}%")

El porcentaje es 73.33333333333333%


### 9. Create a Pandas data frame from the feed's entries.

In [103]:
import pandas as pd
import re

In [114]:
entries =[]

for i in data["entries"]:
    
    title = i["title"]
    date = i["published"]
    authors = i["author"]
    summary = i['summary']
    summary = re.sub("\[?&#\d+;]?","",summary)

    entry={"Title":title,"Date":date,"Authors":authors,"Summary":summary}
    entries.append(entry)

In [117]:
entries_pd = pd.DataFrame(entries)
entries_pd

Unnamed: 0,Title,Date,Authors,Summary
0,Four Short Links: 19 August 2020,"Wed, 19 Aug 2020 11:44:06 +0000",Nat Torkington,The Design Space of Computational Notebooks L...
1,Why Best-of-Breed is a Better Choice than All-...,"Tue, 18 Aug 2020 11:30:42 +0000",Matthew Rocklin and Hugo Bowne-Anderson,So you need to redesign your company’s data in...
2,Four short links: 14 August 2020,"Fri, 14 Aug 2020 11:38:56 +0000",Nat Torkington,Sinter Sinter uses the user-mode EndpointSecu...
3,The Least Liked Programming Languages,"Tue, 11 Aug 2020 11:46:42 +0000",Mike Loukides,StackOverflows 2020 developer survey included ...
4,Four short links: 11 Aug 2020,"Tue, 11 Aug 2020 11:26:22 +0000",Nat Torkington,"ImmuDB lightweight, high-speed immutable data..."
5,Four short links: 7 Aug 2020,"Fri, 07 Aug 2020 13:13:40 +0000",Nat Torkington,Surprising Economics of Load-Balanced Systems ...
6,Four short links: 5 August 2020,"Wed, 05 Aug 2020 11:21:38 +0000",Nat Torkington,Tales of the Autistic Developer Senior Develo...
7,Radar trends to watch: August 2020,"Mon, 03 Aug 2020 11:33:02 +0000",Mike Loukides,"I thought July was going to be a dull month, b..."
8,Four short links: 31 July 2020,"Fri, 31 Jul 2020 11:33:09 +0000",Nat Torkington,Migrating a 40TB SQL Server Database A horror...
9,Four short links: 30 July 2020,"Thu, 30 Jul 2020 11:19:09 +0000",Nat Torkington,Turning the IDE Inside Out with Datalog tl;dr...


### 10. Count the number of entries per author and sort them in descending order.

In [120]:
entries_pd["Authors"].value_counts()

Nat Torkington                                      45
Mike Loukides                                        9
Mike Loukides and Steve Swoyer                       1
Hugo Bowne-Anderson                                  1
Sarah Gold                                           1
Justin Norman, Peter Skomoroch and Mike Loukides     1
Matthew Rocklin and Hugo Bowne-Anderson              1
Adam Jacob, Nat Torkington and Mike Loukides         1
Name: Authors, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [129]:
entries_pd["Title Length"] = entries_pd["Title"].apply(lambda x: len(x))
entries_pd_2 = entries_pd[["Title","Authors","Title Length"]]

entries_pd_2.sort_values(by=['Title Length'],ascending=False)

Unnamed: 0,Title,Authors,Title Length
1,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
28,Automated Coding and the Future of Programming,Mike Loukides,46
53,Machine Learning and the Production Gap,Mike Loukides,39
3,The Least Liked Programming Languages,Mike Loukides,37
47,Decision-Making in a Time of Crisis,Hugo Bowne-Anderson,35
7,Radar trends to watch: August 2020,Mike Loukides,34
0,Four Short Links: 19 August 2020,Nat Torkington,32
11,Bringing an AI Product to Market,"Justin Norman, Peter Skomoroch and Mike Loukides",32
2,Four short links: 14 August 2020,Nat Torkington,32
35,Radar trends to watch: July 2020,Mike Loukides,32


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [156]:
ml_lst = []

prueba = entries_pd.apply(lambda x: ml_lst.append(x["Title"]) if "machine learning" in x["Summary"] else False, axis=1)

ml_lst

['Four short links: 8 July 2020', 'Machine Learning and the Production Gap']