In [1]:
# Save to CSV

#pods.to_csv("../refined_metadata.csv", index=False)
#test_csv = pd.read_csv('../refined_metadata.csv', sep=',')
#test_csv




# Load libraries

from IPython.display import display
from requests_html import HTMLSession
from bs4.builder import XMLParsedAsHTMLWarning
import pandas as pd
import re
import requests as rq
from time import strptime
import warnings

In [2]:
# Set library config

# Ignore every XMLParsedAsHTMLWarning raised during this session.
warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Row limits for displayed frames: frames longer than 'display.max_rows'
# are truncated, and 'display.min_rows' rows are shown when that happens.
pd.set_option('display.min_rows', 50)
pd.set_option('display.max_rows', 100)

In [3]:
# Load TSV and grab relevant information

metadata = pd.read_csv('./metadata.tsv', sep='\t')
# Work on a detached copy holding just these six columns.
data = metadata.loc[:, [
    'show_name',
    'episode_name',
    'rss_link',
    'duration',
    'show_filename_prefix',
    'episode_filename_prefix',
]].copy()

In [57]:
# Isolate shows with enough episodes

at_least_ep = 20

# Episode count per show; rows with a missing show_name are counted too.
num_of_ep = data['show_name'].value_counts(dropna=False)
# NOTE(review): the comparison is strict, so a show needs MORE than
# `at_least_ep` episodes to be kept, despite the variable's name.
lengthy_pods = num_of_ep.loc[num_of_ep.gt(at_least_ep)].to_frame()
lp_list = list(lengthy_pods.index)
# Keep only the episode rows that belong to one of the retained shows.
long_shows = data.loc[data['show_name'].isin(lp_list)]
len(lp_list)

962

In [58]:
# Groupby show_name

def list_vals(x):
    """Collect every value of a group into a plain Python list."""
    return list(x)


def common_val(x):
    """Return the most frequent value of a group as a single scalar.

    ``Series.mode()`` always returns a Series (several rows on a tie, none
    when every value is missing), so its first entry is picked explicitly.
    Ties resolve to the first value of ``mode()``'s output; a group with no
    mode yields ``None``.
    """
    modes = x.mode()
    return modes.iloc[0] if len(modes) else None


# One row per show: per-episode fields become lists and the feed URL
# collapses to the show's most common rss_link. The agg dict already selects
# the columns, so no separate column subset is needed. `show_name` stays as
# the index here; it becomes a column again when the frame is filtered by
# feed host.
pods = long_shows.groupby('show_name').agg({
    'episode_name': list_vals,
    'rss_link': common_val,
    'duration': list_vals,
    'show_filename_prefix': list_vals,
    'episode_filename_prefix': list_vals,
})

In [59]:
# Get all unique RSS feeds

def get_rss(p):
    """Count how many entries of ``p['rss_link']`` point at each feed host.

    Parameters
    ----------
    p : DataFrame or mapping
        ``p['rss_link']`` must iterate over URL strings.

    Returns
    -------
    dict
        Host -> number of occurrences, in first-seen order.
    """
    host_counts = {}
    for url in p['rss_link']:
        # "scheme://host/path" splits into ["scheme:", "", "host", "path"].
        host = url.split("/", 3)[2]
        host_counts[host] = host_counts.get(host, 0) + 1
    return host_counts

# Feed host -> number of rows in `pods` (one row per show) that use it.
unique_links = get_rss(pods)

In [4]:
# Select only RSS feeds with significant episodes

threshold = 70

# Keep only the hosts used by at least `threshold` shows.
unique_links = {host: count for host, count in unique_links.items() if count >= threshold}
keep_links = list(unique_links)

# Drop shows that don't use chosen feeds.
# The host is extracted the same way get_rss does and compared exactly.
# Joining the hosts into a regex for str.contains treated '.' as a wildcard,
# matched the host anywhere in the URL, and matched every row when
# `keep_links` was empty.
pods = pods[pods['rss_link'].str.split('/', n=3).str[2].isin(keep_links)].reset_index()

get_rss(pods)

In [61]:
# Define functions for scraping RSS feeds

# Start HTTP session

def pull_http(url):
    """Fetch ``url`` with a fresh ``HTMLSession`` and return the response.

    Best-effort: when the request fails with a ``requests`` exception, that
    exception object is returned instead of being raised, so callers must be
    prepared for a return value that has no ``.html`` attribute.
    """
    try:
        s = HTMLSession()
        http = s.get(url)
        return http
    # The library is imported as `rq`; the bare name `requests` is not bound
    # in this file, so using it here raised NameError on any failed request.
    except rq.exceptions.RequestException as e:
        return e
    
# Get relevant data from RSS feed
    
def grab_episodes(r, pod, index):
    """Collect titles and publication dates of a feed's 2019/2020 episodes.

    Parameters
    ----------
    r : response-like object
        ``r.html.find("item", first=False)`` must yield the feed's items;
        each item must support ``find(tag, first=True)``.
    pod, index :
        Accepted so existing calls keep working; not used here.

    Returns
    -------
    list
        ``[episodes, dates]`` - two parallel lists holding the raw title
        text and the raw ``pubDate`` text of every item whose date string
        contains '2019' or '2020'.
    """
    episodes, dates = [], []
    for item in r.html.find("item", first=False):
        pub_date = item.find('pubDate', first=True)
        # find(..., first=True) gives None when the tag is absent; skip the
        # item instead of failing the whole feed on `.text`.
        if pub_date is None:
            continue
        date = pub_date.text
        if '2019' not in date and '2020' not in date:
            continue
        title = item.find('title', first=True)
        if title is None:
            continue
        episodes.append(title.text)
        dates.append(date)
    return [episodes, dates]

# Format retrieved data

def format_rss(r, pod, index):
    """Normalise the titles and dates returned by ``grab_episodes``.

    Titles lose any ``<![CDATA[`` / ``]]>`` wrapper, are stripped and
    lower-cased. For dates, tokens 1..3 of the space-split string are kept,
    the first two swap places, and the month abbreviation is replaced by its
    number, so ``"Tue, 01 Jan 2019 10:00:00 GMT"`` becomes ``"1-01-2019"``.

    Returns
    -------
    tuple
        ``(names, dates)`` - the two lists from ``grab_episodes``, updated
        in place.
    """
    fetched = grab_episodes(r, pod, index)
    names, dates = fetched[0], fetched[1]
    for pos, raw_title in enumerate(names):
        unwrapped = raw_title.replace('<![CDATA[', '').replace(']]>', '')
        names[pos] = unwrapped.strip().lower()
        parts = dates[pos].split(' ')[1:4]
        # Month number goes first, the original first token second.
        parts[0], parts[1] = str(strptime(parts[1], '%b').tm_mon), parts[0]
        dates[pos] = "-".join(parts)
    return (names, dates)

In [67]:
# Add dates column

# Start from a clean slate so re-running the cell does not fail on insert.
if 'episode_date' in pods.columns:
    pods = pods.drop(columns=['episode_date'])
# Build one independent list per show. `[[]] * n` repeats a single list
# object n times, so an in-place append for one show would appear in all.
episode_dates = [[] for _ in range(len(pods))]
pods.insert(2, 'episode_date', episode_dates)

# Compare RSS episodes & DF episodes, fill in dates

# def compare_rss(rss, frame, show):
#     to_remove = []
#     for ep in range(len(frame['episode_name'][show])):
#         episode_name = frame['episode_name'][show][ep]
#         if episode_name in rss[0]:
#             date = rss[1][rss[0].index(episode_name)]
#             p = frame['episode_date'][show].copy()
#             p.append(date)
#             #frame['episode_date'][show] = p
#         else:
#             to_remove.append(episode_name)
#     for item in to_remove:
#         frame['episode_name'][show].remove(item)
#     return(len(to_remove))


# TEMPORARY version of compare_rss: only counts unmatched episodes; it does not delete them or store dates

def compare_rss(rss, frame, show):
    """Count the episodes of one show that are missing from its RSS data.

    Parameters
    ----------
    rss : sequence
        ``rss[0]`` holds episode titles and ``rss[1]`` the matching dates.
    frame : DataFrame or mapping
        ``frame['episode_name'][show]`` and ``frame['episode_date'][show]``
        must be lists.
    show : key of the show inside ``frame``.

    Returns
    -------
    int
        Number of titles in ``frame['episode_name'][show]`` not found in
        ``rss[0]``. ``frame`` itself is left untouched.
    """
    unmatched = 0
    for title in frame['episode_name'][show]:
        if title not in rss[0]:
            unmatched += 1
            continue
        matched_date = rss[1][rss[0].index(title)]
        # The date only lands in a throw-away copy; writing it back is
        # still disabled.
        scratch = frame['episode_date'][show].copy()
        scratch.append(matched_date)
        #frame['episode_date'][show] = scratch
    return unmatched

In [63]:
# Format episode names inside of dataframe

# Normalise titles in place, one show's list at a time. Iterating over the
# column's values replaces the repeated label-based `pods[...][show][ep]`
# lookups, which only worked when the index was exactly 0..n-1.
for titles in pods['episode_name']:
    titles[:] = [title.strip().lower() for title in titles]

In [3]:
dropped = []
# Never index past the end of `pods`: the lookup below sits outside the try.
for test in range(min(100, len(pods))):
    url = pods['rss_link'][test]
    try:
        r = pull_http(url)
        rss_eps = format_rss(r, pods, test)
        dropped.append(compare_rss(rss_eps, pods, test))
    # Best-effort scrape: skip feeds that fail to download or parse, but let
    # KeyboardInterrupt/SystemExit through so the loop can be stopped.
    except Exception:
        continue
#print(dropped)

In [2]:
# Spot-check one show (the row labelled 3 in `pods`) outside the batch loop.
test = 3
url = pods['rss_link'][test]

# Fetch that show's feed and normalise its episode titles and dates.
r = pull_http(url)
rss_eps = format_rss(r, pods, test)

#print(compare_rss(rss_eps, pods, test)) # What is being dropped
#print("")
#print(rss_eps) # RSS episodes we pulled