# Spotify Podcast Dataset

RSS Part 3:  RSS Podcast Category Merge with Metadata.tsv 

Goal:  merge with metadata.tsv
- add category to every row
- add the first- and second-level directory names (`first_dir`, `second_dir`) to every row, so that apply functions can be more automated when needed.

Secondary Goal: create another metadata file that is for categories only - expanded_category_metadata.tsv - which holds the expanded list of categories (one row per show/category pair).

In [1]:
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
from collections import Counter
plt.rcParams['figure.figsize'] = [5, 5]




## Read metadata from Spotify

In [2]:
# Load the Spotify metadata table (tab-separated); its rows carry both show and episode columns.
df = pd.read_csv('../podcasts-no-audio-13GB-selected/metadata.tsv', sep='\t')

In [3]:
# Root folder that is walked (recursively) further down to find the per-show RSS .xml files.
basename = '../podcasts-no-audio-13GB-selected/spotify-podcasts-2020/show-rss/'

In [None]:
# NOTE(review): this cell has no execution count and is repeated verbatim in
# the cell that follows it; it looks like a leftover copy -- confirm and remove.
#
# Locations of episodes are based on two levels of folders: 
#    first_folder/second_folder/show/episode_metadata.json
# The first_folder and second_folder names are the first two characters
# that follow 'spotify:show:' in the show_uri column.

# Then you can extract the transcripts...

df['first_dir'] = df['show_uri'].str.extract(r'spotify:show:(\w{1})')[0]
df['second_dir'] = df['show_uri'].str.extract(r'spotify:show:\w{1}(\w{1})')[0]

In [4]:
# Episode locations are based on two levels of folders:
#    first_folder/second_folder/show/episode_metadata.json
# The two folder names are the first and the second character that follow
# the 'spotify:show:' prefix in the show_uri column, so each one is captured
# with its own single-group regex and stored as a separate column.

show_uri = df['show_uri']
df['first_dir'] = show_uri.str.extract(r'spotify:show:(\w{1})', expand=False)
df['second_dir'] = show_uri.str.extract(r'spotify:show:\w{1}(\w{1})', expand=False)

## Extract the RSS category metadata of a show.

In [5]:
# Number of distinct show prefixes in the metadata; the RSS file count printed further down can be compared against it.
print(f"Searching for {df['show_filename_prefix'].unique().shape[0]} Shows.")

Searching for 18376 Shows.


In [6]:
import xml.etree.ElementTree as ET

def find_category(file):
    """
    find_category - collects the category names declared in an RSS file.

    Every element whose tag name contains "category" (this includes
    namespaced tags such as itunes:category, at any nesting depth) and
    that carries a "text" attribute contributes that attribute's value.

    file    - path or file object accepted by ET.parse.
    returns - list of the "text" attribute values in document order;
              an empty list when no such element exists.
    raises  - whatever ET.parse raises for an unreadable or malformed file.
    """
    document_root = ET.parse(file).getroot()

    # Category elements without a "text" attribute (e.g. a plain
    # <category>News</category>) are skipped, as are non-category elements.
    return [
        node.attrib['text']
        for node in document_root.iter()
        if 'category' in node.tag and 'text' in node.attrib
    ]


In [7]:
files_found_counter = 0

# Aggregated form: one (show_filename_prefix, [category, ...]) tuple per RSS
# file that yielded at least one category.  It is merged onto the primary
# dataframe as a list-valued column, in a similar format to the language
# column.
category_tuple_list = []

# Expanded form: one (show_filename_prefix, category) tuple per category, so a
# show is repeated once for each of its categories.  Nice for working with
# categories in general, but merging it with the primary dataframe adds rows,
# so the rows can no longer be interpreted as, for example, the number of
# episodes.
expanded_category_tuple_list = []

error_list = []     # show prefixes whose RSS file raised while being read
error_details = {}  # show prefix -> repr() of that exception, for follow-up
notag_list = []     # show prefixes whose RSS file yielded no categories

# Walk the RSS folder and all of its subdirectories.
for root, dirs, files in os.walk(basename):
    for filename in files:
        # Only the .xml RSS files are of interest.
        if not filename.endswith('.xml'):
            continue

        # TODO / ISSUE: there are two more xml files found based on the
        # metadata file's number of possible shows.
        # TODO / ISSUE: there are 8 errors in valid show RSS files.
        files_found_counter += 1
        show_prefix = filename[:-4]  # drop the '.xml' extension
        file_path = os.path.join(root, filename)

        try:
            cats = find_category(file_path)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort the
            # walk.  The reason is kept so the failures can be investigated
            # instead of being silently discarded.
            error_list.append(show_prefix)
            error_details[show_prefix] = repr(e)
            continue

        if cats:
            expanded_category_tuple_list.extend((show_prefix, cat) for cat in cats)
            category_tuple_list.append((show_prefix, cats))
        else:
            notag_list.append(show_prefix)

categories = [c for s,c in expanded_category_tuple_list]
print(f"{files_found_counter} Files Found.")
print(f"{len(set(categories))} Unique Categories and Sub-categories.")
print(f"{len(error_list)} files had errors while reading (see error_list).")
print(f"{len(notag_list)} files had no tags (see notag_list).")

18376 Files Found.
122 Unique Categories and Sub-categories.
8 files had errors while reading (see error_list).
8 files had no tags (see notag_list).


## Save a metadata.tsv with categories

In [8]:
# Create a DataFrame from the list of tuples
cats_df = pd.DataFrame(category_tuple_list, columns=['show_filename_prefix', 'category'])
print(cats_df.shape)
cats_df.head(2)

(18360, 2)


Unnamed: 0,show_filename_prefix,category
0,show_0RpgmiJjAk5DHxzABcllYp,"[Leisure, Games]"
1,show_0ruY06AVqTtXm1Rs3bwDtZ,[Society & Culture]


In [9]:
# Attach each show's category list to every row of that show.
# validate='many_to_one' makes pandas raise if a show prefix occurs more than
# once in cats_df; without it such a duplicate would silently duplicate rows
# of the primary dataframe.  how='left' keeps shows without categories (NaN).
merged_df = df.merge(
    cats_df,
    on='show_filename_prefix',
    how='left',
    validate='many_to_one',
)
print(merged_df.shape)
print(merged_df.isna().sum())
merged_df[merged_df['show_filename_prefix']=='show_4Gtc4ccgCF8xySTleFp3HF']

(105360, 15)
show_uri                     0
show_name                    0
show_description             2
publisher                    0
language                     0
rss_link                     0
episode_uri                  0
episode_name                 0
episode_description        205
duration                     0
show_filename_prefix         0
episode_filename_prefix      0
first_dir                    0
second_dir                   0
category                    92
dtype: int64


Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,first_dir,second_dir,category
77494,spotify:show:4Gtc4ccgCF8xySTleFp3HF,Anything and Everything,It’s ya boi Nathan making a podcast where I ch...,Nathan Ortiz,['en'],https://anchor.fm/s/12c31ce0/podcast/rss,spotify:episode:5hsL9wnnzX83q5Qn5XuEMu,Me and my Vietnam war friend Don Vu,"Nathan and Don y’all about wizards, government...",49.663233,show_4Gtc4ccgCF8xySTleFp3HF,5hsL9wnnzX83q5Qn5XuEMu,4,G,"[Comedy, Comedy Interviews]"
84259,spotify:show:4Gtc4ccgCF8xySTleFp3HF,Anything and Everything,It’s ya boi Nathan making a podcast where I ch...,Nathan Ortiz,['en'],https://anchor.fm/s/12c31ce0/podcast/rss,spotify:episode:6Df5bnlC2Ns06W2HK4qJmu,nathan & don talk about ghosts and homeless wo...,just a fun little conversation today with our ...,27.888717,show_4Gtc4ccgCF8xySTleFp3HF,6Df5bnlC2Ns06W2HK4qJmu,4,G,"[Comedy, Comedy Interviews]"
93757,spotify:show:4Gtc4ccgCF8xySTleFp3HF,Anything and Everything,It’s ya boi Nathan making a podcast where I ch...,Nathan Ortiz,['en'],https://anchor.fm/s/12c31ce0/podcast/rss,spotify:episode:6vRd118V8up2pvqsvM4IMF,nathan & don speak about random things w/ a sa...,three friends catch up & dive into some topics...,30.074483,show_4Gtc4ccgCF8xySTleFp3HF,6vRd118V8up2pvqsvM4IMF,4,G,"[Comedy, Comedy Interviews]"


In [10]:
# Why are there NaN categories?  Shows whose RSS file failed to read
# (error_list) or carried no category tags (notag_list) never made it into
# cats_df, so the left merge leaves NaN for their rows; the shape below counts
# those rows for comparison with the NaN count printed above.
# isin() matches the prefixes exactly.  A joined regex passed to str.contains
# would do substring matching and, with both lists empty, would become the
# pattern '' and match every row.
merged_df[merged_df['show_filename_prefix'].isin(error_list + notag_list)].shape

(92, 15)

In [11]:
# Save the DataFrame as a TSV file
file_path = '../podcasts-no-audio-13GB-selected/metadata_with_podcast_category.tsv'
merged_df.to_csv(file_path, sep='\t', index=False)

print(f"Merged DataFrame saved as {file_path}")

Merged DataFrame saved as ../podcasts-no-audio-13GB-selected/metadata_with_podcast_category.tsv


In [12]:
# Read the saved file back to check that shape and columns survived the round trip.
new_df = pd.read_csv(
    '../podcasts-no-audio-13GB-selected/metadata_with_podcast_category.tsv',
    sep='\t',
)
for summary in (new_df.shape, new_df.columns):
    print(summary)

(105360, 15)
Index(['show_uri', 'show_name', 'show_description', 'publisher', 'language',
       'rss_link', 'episode_uri', 'episode_name', 'episode_description',
       'duration', 'show_filename_prefix', 'episode_filename_prefix',
       'first_dir', 'second_dir', 'category'],
      dtype='object')


In [13]:
# Missing-value counts of the re-read file, for comparison with the counts printed before saving.
new_df.isna().sum()

show_uri                     0
show_name                    0
show_description             2
publisher                    0
language                     0
rss_link                     0
episode_uri                  0
episode_name                 0
episode_description        205
duration                     0
show_filename_prefix         0
episode_filename_prefix      0
first_dir                    0
second_dir                   0
category                    92
dtype: int64

## Create a tsv file with the expanded metadata for categories: expanded_category_metadata.tsv

In [14]:
# create dataframe from tuple...
expanded_cats_df = pd.DataFrame(expanded_category_tuple_list, columns=['show_filename_prefix', 'category'])
expanded_cats_df.head()

Unnamed: 0,show_filename_prefix,category
0,show_0RpgmiJjAk5DHxzABcllYp,Leisure
1,show_0RpgmiJjAk5DHxzABcllYp,Games
2,show_0ruY06AVqTtXm1Rs3bwDtZ,Society & Culture
3,show_0r4ElWjFWBJRuzH6ooMBvK,Sports
4,show_0r4ElWjFWBJRuzH6ooMBvK,Basketball


In [15]:
# add in mean_show_duration for good form...
duration_df = df[['show_filename_prefix','duration']].groupby('show_filename_prefix').mean()
duration_df = duration_df.reset_index()
duration_df['mean_show_duration'] = duration_df['duration']
duration_df = duration_df.drop(columns=['duration'])
display(duration_df.head(2))
print(duration_df.shape)

Unnamed: 0,show_filename_prefix,mean_show_duration
0,show_002B8PbILr169CdsS9ySTH,57.1772
1,show_005ZAjJK1wlD4E2YxeibBb,82.179683


(18376, 2)


In [19]:
# Attach each show's mean duration to every (show, category) row.
# how='left' keeps every category row; a show without a match in duration_df
# would get NaN, which the NaN summary printed below makes visible.
expanded_cats_df2 = expanded_cats_df.merge(duration_df, on='show_filename_prefix', how='left')
print(expanded_cats_df2.shape)
print(expanded_cats_df2.describe())
print("\nNaN Summary:")
print(expanded_cats_df2.isna().sum())

# Fixed seed so the displayed sample is the same on every re-run.
expanded_cats_df2.sample(10, random_state=0)

(27018, 3)
       mean_show_duration
count        27018.000000
mean            31.595887
std             20.989321
min              0.180733
25%             13.470388
50%             29.472783
75%             46.219154
max            155.800271

NaN Summary:
show_filename_prefix    0
category                0
mean_show_duration      0
dtype: int64


Unnamed: 0,show_filename_prefix,category,mean_show_duration
17243,show_3IqnOKmHHBP8Zz6ikeJsEO,Religion & Spirituality,26.365867
21494,show_2ABPOUVu8eVRVvRvMBIr9E,TV & Film,5.212425
4114,show_76EDjHsPlYvwVJVavPWoUD,Religion & Spirituality,0.54645
1278,show_0azxzl78csOPsVsunPtJED,Business,72.771046
2807,show_0BtfH2LbEkhW0VBp81CUJY,Business,17.014033
334,show_00REco2kCHAb32m7GfuCk0,Education,38.587222
26568,show_5LdMmD7x2V9yQBUxt61VTF,After Shows,48.286088
26881,show_5PpZB7vNm8eeLEGbP73qud,Comedy,54.683383
15739,show_4x26XSd9pkHZI13kQi2qvE,Health & Fitness,17.485933
5986,show_7yPBxLJUkqWguIFTzhstZU,News,29.818291


Adding mean_category_duration

In [24]:
# Mean duration per category: the average of mean_show_duration over all
# (show, category) rows of that category, sorted from longest to shortest.
expanded_cats_df2_catmean = (
    expanded_cats_df2
    .groupby('category')['mean_show_duration']
    .mean()
    .rename('mean_category_duration')
    .reset_index()
    .sort_values(by='mean_category_duration', ascending=False)
)
print(expanded_cats_df2_catmean.isna().sum())
expanded_cats_df2_catmean

category                  0
mean_category_duration    0
dtype: int64


Unnamed: 0,category,mean_category_duration
88,Podcasting,56.804517
63,Literature,56.323906
89,Professional,55.017821
97,Self-Help,53.755766
43,Games & Hobbies,52.248239
...,...,...
46,Government & Organizations,11.533311
60,Language Learning,11.509802
49,Hinduism,10.466221
108,Stories for Kids,8.933133


In [26]:
# Attach the per-category mean duration to every (show, category) row, so each
# row carries both its show's mean and its category's mean.
expanded_cats_df3 = expanded_cats_df2.merge(expanded_cats_df2_catmean, on='category', how='left')
print(expanded_cats_df3.shape)
print(expanded_cats_df3.describe())
print("\nNaN Summary:")
print(expanded_cats_df3.isna().sum())

# Fixed seed so the displayed sample is the same on every re-run.
expanded_cats_df3.sample(10, random_state=0)

(27018, 4)
       mean_show_duration  mean_category_duration
count        27018.000000            27018.000000
mean            31.595887               31.595887
std             20.989321                6.797801
min              0.180733                1.845217
25%             13.470388               26.992464
50%             29.472783               29.177933
75%             46.219154               38.051158
max            155.800271               56.804517

NaN Summary:
show_filename_prefix      0
category                  0
mean_show_duration        0
mean_category_duration    0
dtype: int64


Unnamed: 0,show_filename_prefix,category,mean_show_duration,mean_category_duration
20021,show_3WCzRBpS7myMDQHvP97pmv,Fantasy Sports,18.109433,43.208902
10575,show_16oQTbAx0xEQAijjhpcQd1,Health & Fitness,33.21105,29.078194
1691,show_0MTCY7tw7AKad94BlV25Lh,Comedy,58.052764,39.321419
23447,show_2PPnLpqWj23hg0Jqg5rqgR,Arts,20.065917,26.992464
26412,show_5knxV4vyGYWJ6mXS57345R,Comedy,21.318633,39.321419
13776,show_4ivu500NtrhKu8delQtfvx,Health & Fitness,46.248767,29.078194
7806,show_6ohc3T18zvoFhiViRUR93r,Games,61.0704,36.797213
3530,show_7u3KWEv98YJf9BEbjTBTgd,Self-Improvement,9.897117,26.785216
13898,show_4gI4LYOBe5Z4BvFj4e7PEl,Sports,53.886933,41.794109
11041,show_1s54aXh7LcDenT6NaxRKvI,Arts,14.52555,26.992464


In [30]:
# Save the DataFrame as a TSV file
file_path = '../podcasts-no-audio-13GB-selected/expanded_category_metadata.tsv'
expanded_cats_df3.to_csv(file_path, sep='\t', index=False)

print(f"DataFrame saved as {file_path}")

DataFrame saved as ../podcasts-no-audio-13GB-selected/expanded_category_metadata.tsv


In [31]:
# Read the saved file back to check that shape and columns survived the round trip.
new_df2 = pd.read_csv(
    '../podcasts-no-audio-13GB-selected/expanded_category_metadata.tsv',
    sep='\t',
)
for summary in (new_df2.shape, new_df2.columns):
    print(summary)

(27018, 4)
Index(['show_filename_prefix', 'category', 'mean_show_duration',
       'mean_category_duration'],
      dtype='object')


In [32]:
# Missing-value counts of the re-read expanded category file.
new_df2.isna().sum()

show_filename_prefix      0
category                  0
mean_show_duration        0
mean_category_duration    0
dtype: int64