# Spotify Podcast Dataset - Reading Transcripts

An attempt to design an API that supports batch processing of the transcripts in this dataset.

In [184]:
import pandas as pd

In [185]:
# Load the dataset's metadata table (tab-separated) into a DataFrame
df = pd.read_csv('../podcasts-no-audio-13GB-selected/metadata.tsv', sep='\t')

In [186]:
# Root folder of the per-show RSS files. The trailing slash is required:
# later cells build paths by plain string concatenation onto this value.
basename = '../podcasts-no-audio-13GB-selected/spotify-podcasts-2020/show-rss/'

## Attempt to extract the RSS metadata of a show.

In [187]:
# Locations of episodes are based on two levels of folders:
#    first_folder/second_folder/show/episode_metadata.json
# first_folder and second_folder are the first and second characters of
# the show ID, i.e. the part of the show_uri column after "spotify:show:".

# Then you can extract the transcripts...

df['first_dir'] = df['show_uri'].str.extract(r'spotify:show:(\w)')[0]
df['second_dir'] = df['show_uri'].str.extract(r'spotify:show:\w(\w)')[0]

In [188]:
# Take the row at position 33 as a worked example and display it
row = df.iloc[33]
row

show_uri                                 spotify:show:4g9vEFANueSAWA50HtDjQM
show_name                  Big D Beatdown - An unbiased Cowboys football ...
show_description           Austin Smith, Ken Cunningham, and Ty Rogers ho...
publisher                                                       BigDBeatdown
language                                                              ['en']
rss_link                             https://anchor.fm/s/a4c5784/podcast/rss
episode_uri                           spotify:episode:009awoapZd90HCQHzDczrV
episode_name                              Will Dak Prescott remain a Cowboy?
episode_description        Tune in to Austin Smith and Coach Ty Rogers as...
duration                                                           58.559567
show_filename_prefix                             show_4g9vEFANueSAWA50HtDjQM
episode_filename_prefix                               009awoapZd90HCQHzDczrV
first_dir                                                                  4

In [189]:
# Assemble <basename><first_dir>/<second_dir>/<show_filename_prefix> for the
# example row, then append the .xml extension to get the show's RSS file.
show_dir = basename + '/'.join(
    [row['first_dir'], row['second_dir'], row['show_filename_prefix']]
)
show_xml = f'{show_dir}.xml'
show_xml

'../podcasts-no-audio-13GB-selected/spotify-podcasts-2020/show-rss/4/g/show_4g9vEFANueSAWA50HtDjQM.xml'

In [190]:
import xml.etree.ElementTree as ET

# Load the show's RSS document and grab its root element
tree = ET.parse(show_xml)
root = tree.getroot()

# Gather the "text" attribute of every element whose tag name contains
# "category"; elements without that attribute are skipped.
extracted_data = [
    node.attrib['text']
    for node in root.iter()
    if 'category' in node.tag and 'text' in node.attrib
]

extracted_data

['Sports']

## Find all the categories in the "4G" folder

In [191]:
import xml.etree.ElementTree as ET

def find_category(file):
    """Return the ``text`` attribute of every category element in an XML file.

    Args:
        file: Anything ``ET.parse`` accepts (a file name or a file object).

    Returns:
        A list with the value of the ``text`` attribute of each element whose
        tag name contains ``"category"``, in the order the elements are
        visited. Elements without a ``text`` attribute are skipped and
        duplicate values are kept.
    """
    document_root = ET.parse(file).getroot()

    # Substring match on the tag so that namespace-qualified tags
    # ("{namespace}category") are picked up as well.
    return [
        node.attrib['text']
        for node in document_root.iter()
        if 'category' in node.tag and 'text' in node.attrib
    ]

In [192]:
# Folder that contains the example row's show XML:
# <basename><first_dir>/<second_dir>/. Despite the name, it is derived from
# `row`, not picked at random.
random_dir = basename + row['first_dir']+'/'+row['second_dir'] + '/'

In [193]:
import os
# Every category value found in the folder, duplicates included
categories = []

# Walk the entries of the folder and parse each show's RSS file
for filename in os.listdir(random_dir):
    # Construct the full file path
    file_path = os.path.join(random_dir, filename)

    # Only regular files with an .xml extension are parsed. Sub-directories
    # and stray non-XML files (e.g. ".DS_Store") are skipped; feeding those
    # to the XML parser would raise ParseError and abort the whole scan.
    if not os.path.isfile(file_path) or not filename.lower().endswith('.xml'):
        continue

    categories.extend(find_category(file_path))

# Display the distinct categories
set(categories)

{'After Shows',
 'Alternative Health',
 'Arts',
 'Automotive',
 'Books',
 'Business',
 'Careers',
 'Christianity',
 'Comedy',
 'Comedy Interviews',
 'Drama',
 'Education',
 'Entertainment News',
 'Entrepreneurship',
 'Fiction',
 'Fitness',
 'Food',
 'Games',
 'Government',
 'Health & Fitness',
 'Hobbies',
 'Islam',
 'Leisure',
 'Music',
 'Music Commentary',
 'News',
 'Religion & Spirituality',
 'Self-Improvement',
 'Sexuality',
 'Society & Culture',
 'Spirituality',
 'Sports',
 'TV & Film',
 'Technology',
 'True Crime',
 'Video Games',
 'Visual Arts',
 'Wilderness'}