In [1]:
import re
from collections import Counter
from difflib import SequenceMatcher
from itertools import combinations, product

# Input: the feed file that get_tags opens, and the pattern whose single
# capture group is the text between <category> and </category>.
RSS_FEED = 'rss.xml'
TAG_HTML = re.compile(r'<category>([^<]+)</category>')

# Reporting: how many of the most common tags get_top_tags returns.
TOP_NUMBER = 10

# Similarity window: a pair of tags is reported when its SequenceMatcher
# ratio is strictly greater than SIMILAR and strictly less than IDENTICAL.
SIMILAR = 0.87
IDENTICAL = 1.0

In [2]:
#Collect every tag that appears in the RSS feed.
#The feed text is scanned with the TAG_HTML regex (findall); no XML parser is involved.

def get_tags():
    """Return all tags (TAG_HTML matches) found in RSS_FEED.

    The whole file is lowercased before matching, and every dash inside a
    tag is replaced with a space. Matches are returned in file order and
    repeated tags are kept, so the result can be counted afterwards.
    """
    # NOTE(review): no explicit encoding is passed, so open() falls back to
    # the platform default -- confirm this matches the feed's encoding.
    with open(RSS_FEED) as feed:
        content = feed.read().lower()
    #findall returns only the captured group, i.e. the text between the category tags
    return [raw_tag.replace('-', ' ') for raw_tag in TAG_HTML.findall(content)]

In [3]:
#Return the most frequently used tags from the list produced by get_tags.
#The number of entries defaults to TOP_NUMBER.
def get_top_tags(tags, top_number=None):
    """Return the most common tags together with their counts.

    Args:
        tags: iterable of tag strings; repeated entries are what is counted.
        top_number: how many entries to return. When None (the default) the
            module-level TOP_NUMBER is used; it is looked up at call time so
            that re-running the configuration cell takes effect without
            redefining this function.

    Returns:
        A list of (tag, count) tuples ordered from most to least frequent,
        as produced by Counter.most_common.
    """
    if top_number is None:
        top_number = TOP_NUMBER
    return Counter(tags).most_common(top_number)

In [4]:
#Compare tags pairwise and yield the pairs that are similar but not identical.
#A pair qualifies when its similarity ratio is strictly between SIMILAR and IDENTICAL.

def get_similarities(tags, similar=None, identical=None):
    """Yield each pair of similar tags exactly once.

    Only tags that start with the same character are compared. A pair is
    yielded when its SequenceMatcher ratio is strictly greater than
    ``similar`` and strictly less than ``identical``.

    Args:
        tags: iterable of tag strings; duplicates and empty strings are
            ignored.
        similar: exclusive lower bound for the ratio. When None (the
            default) the module-level SIMILAR is used, looked up at call time.
        identical: exclusive upper bound for the ratio. When None (the
            default) the module-level IDENTICAL is used, looked up at call time.

    Yields:
        2-tuples of tags, each tuple sorted alphabetically. Pairs come out
        in the order in which they are first encountered in ``tags``.
    """
    if similar is None:
        similar = SIMILAR
    if identical is None:
        identical = IDENTICAL
    # dict.fromkeys removes repeated tags while keeping first-seen order, so
    # every distinct pair is scored once instead of once per occurrence.
    # Empty tags are dropped because they have no first character to compare.
    unique_tags = dict.fromkeys(tag for tag in tags if tag)
    for candidates in combinations(unique_tags, 2):
        # Sort before scoring: ratio() can depend on argument order, and the
        # sorted tuple is also the form that is yielded.
        first, second = sorted(candidates)
        if first[0] != second[0]:
            continue
        if similar < SequenceMatcher(None, first, second).ratio() < identical:
            yield first, second

In [6]:
#Main script: print the most common tags, then the pairs of similar tags.
if __name__=="__main__":
    tags=get_tags()
    top_tags=get_top_tags(tags)
    print('*Top {} tags:'.format(TOP_NUMBER))
    for tag,count in top_tags:
        print('{:<20} {}'.format(tag,count))
    # De-duplicate on the whole pair, keeping first-seen order. Building a
    # dict straight from the pairs would key on the first tag only, so a tag
    # that is similar to several others would keep just one of its partners.
    similar_pairs=list(dict.fromkeys(get_similarities(tags)))
    print()
    print('*Similar tags:')
    for first,second in similar_pairs:
        print('{:<20}{}'.format(first,second))

*Top 10 tags:
python               10
learning             7
tips                 6
tricks               5
github               5
cleancode            5
best practices       5
pythonic             4
collections          4
beginners            4

*Similar tags:
game                games
challenge           challenges
generator           generators
