In [1]:
# Load the google drive if running on colab
# Avoid this step if loading from local
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import required libraries
import os
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm.notebook import tqdm_notebook
import time
import json
import re
import tqdm
tqdm.tqdm.pandas()

In [3]:
# Find all the xml file names 
xml_file_names = []
for subdir, dirs, files in os.walk('/content/drive/MyDrive/TREC/spotify-podcasts-2020/show-rss-summarization-testset'):
    for file in files:
        xml_file_names.append(os.path.join(subdir, file))

In [5]:
# Extract the required tags from the xml files
file_list = []
failed_files = []
for selected_file in tqdm_notebook(xml_file_names):
  if ".xml" in selected_file:
    try:
      file_dict = {}
      item_counter = 0
      tree = ET.parse(selected_file)
      for elem in tree.iter():
        if elem.tag == "item":
          file_dict = {'filepath':selected_file}
          #item_counter+=1
          for sib_elem in elem.iter():
            for sel_tag in ['summary', 'description', 'title', 'duration']:
              if sel_tag in sib_elem.tag:
                file_dict[sel_tag] = sib_elem.text
          file_list.append(file_dict)
    except Exception as e:
      failed_files.append(selected_file)
    #print(item_counter)

  0%|          | 0/860 [00:00<?, ?it/s]

In [6]:
# Identify the files for which the XML could not be read properly
failed_files

['/content/drive/MyDrive/TREC/spotify-podcasts-2020/show-rss-summarization-testset/2/E/show_2ewOKdwBPK2OdQkKWiCM9O.xml']

In [7]:
# Convert the information into a dataframe for easy processing
raw_xmls = pd.DataFrame(file_list)
raw_xmls.shape

(74103, 5)

In [8]:
# Process the filepaths to get the file names
raw_xmls['xml_show_name'] = raw_xmls['filepath'].apply(lambda x: x.split('/')[-1].replace(".xml",""))
raw_xmls.head(10)

Unnamed: 0,filepath,title,description,summary,duration,xml_show_name
0,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Basic science of coronavirus with a smartphone...,Working listeners towards being able to employ...,Working listeners towards being able to employ...,1142,show_0r0wLcdQ3MGqJBALy2cbE6
1,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Dr. Kaya Erbil’s Coronavirus Podcast Episode #1.,Motivated by a lack of basic humility in the m...,Motivated by a lack of basic humility in the m...,1799,show_0r0wLcdQ3MGqJBALy2cbE6
2,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How a Funeral Home Director Learned About Life...,<p>From a young age Codi Shewan was fascinated...,<p>From a young age Codi Shewan was fascinated...,4341,show_0RufZJusNYTpcMwHhU0dDi
3,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How a Violent & Life Altering Event Led a Man ...,"<p>When you look back at your life, what event...","<p>When you look back at your life, what event...",3447,show_0RufZJusNYTpcMwHhU0dDi
4,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Behind The Scenes in to The Life of a DJ,"<p>We all love a great party, and what makes a...","<p>We all love a great party, and what makes a...",5658,show_0RufZJusNYTpcMwHhU0dDi
5,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Escaping a World of Human Trafficking to Becom...,<p>This is one of the most remarkable stories ...,<p>This is one of the most remarkable stories ...,4026,show_0RufZJusNYTpcMwHhU0dDi
6,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How an Orphan Met His Biological Father 35 Yea...,<p>Thanh Campbell was brought over to Canada i...,<p>Thanh Campbell was brought over to Canada i...,4652,show_0RufZJusNYTpcMwHhU0dDi
7,/content/drive/MyDrive/TREC/spotify-podcasts-2...,From Being Homeless to Working With the Toront...,"<p>We all have a dream, but how far are we wil...","<p>We all have a dream, but how far are we wil...",4216,show_0RufZJusNYTpcMwHhU0dDi
8,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Escaping an Abusive Marriage & Launching a Mov...,<p>In this episode Kapil sits down with author...,<p>In this episode Kapil sits down with author...,5046,show_0RufZJusNYTpcMwHhU0dDi
9,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Introduction to Kapil,"<p>In this episode, Kapil Ghai, the host of th...","<p>In this episode, Kapil Ghai, the host of th...",386,show_0RufZJusNYTpcMwHhU0dDi


In [9]:
# Read the meta data to start the mapping
meta_data_path = '/content/drive/MyDrive/TREC/spotify-podcasts-2020/metadata-summarization-testset.tsv'
meta_data = pd.read_table(meta_data_path)
req_meta_data = meta_data[['episode_name','episode_description','duration', 'show_filename_prefix','episode_filename_prefix']]
req_meta_data

Unnamed: 0,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,Ep.05 Why We Don't Sell Presets,Today on the podcast we go on a JOURNEY! We ta...,54.271200,show_015DbLwcXu2fK7e9jIfbFo,74t5WREXUbhEKNI89CNSkL
1,How to Grow as an Esports Player - The Wavelen...,Ever wanted a podcast from your three favorite...,77.500850,show_01DbRiALDPdvZdoiY8yQL6,5fG4VlWnWwzAt6mSs0H7lY
2,Irish Illustrated Insider: Observations and an...,The Irish Illustrated Insider crew discusses N...,56.930300,show_01Txd706SjsgvM0cm0UXuM,5hvOWPoB0j6HMrSVAMtJLV
3,Irish Illustrated Insider: Previewing Notre Da...,Irish Illustrated Insider tackles NFL Combine ...,46.802917,show_01Txd706SjsgvM0cm0UXuM,7JG3lLnRoDdOxuqjf14ZkM
4,Episode 73 - Comeback SZN,Breaking down a classic Calgary Flames comebac...,49.487233,show_01eumErJvBdxCW4YJivbwc,2WQ1GcC6J0k7qsO8Vvf2be
...,...,...,...,...,...
1022,17: Shuri,In this episode we cover Shuri in our newest M...,41.621550,show_7wbEc7QxASrGQoFyb0dxoS,1t6R1TrR8D7kkKnFcuOD8A
1023,Ep. 96 - Are Thermal Clip-Ons a Good Investment?,"Once again, The Late Night Vision Show is back...",40.438850,show_7wd4F1RZw7aQKN4K7cVakB,1JnsLxgHqv9kv21D5bhIKy
1024,#13 Giving up Perfection for Peace,In this episode I talk about the idea of Perfe...,18.188583,show_7xjC57YxW8SqmQha1Tjgge,6e6LyN7z8yusnDrqnE6Hm5
1025,Are you facing pain in Hand and Arms?,Carpal Tunnel Syndrome. What is it? Reasons du...,4.207883,show_7yeMJ1fd1BLgqft0WnARzb,7qPPLigaXeKDzE9lUyNY91


In [10]:
# Find the xml information for the data present in the metadata
merged_info = req_meta_data.merge( raw_xmls,right_on=['xml_show_name','title'], left_on=['show_filename_prefix','episode_name'])
merged_info.shape

(917, 11)

In [11]:
# Remove duplicate columns
merged_info.drop(['episode_name','episode_description','xml_show_name'], axis=1, inplace=True)

In [12]:
# Rename and rearrage the columns 
merged_info.rename(columns = {'filepath':'xml_filepath', 'duration_y':'duration_minutes', 'duration_x':'duration_seconds'}, inplace=True)
merged_info = merged_info[['xml_filepath', 'title', 'summary', 'description', 'duration_minutes', 'duration_seconds', 'show_filename_prefix', 'episode_filename_prefix']]
merged_info.head()

Unnamed: 0,xml_filepath,title,summary,description,duration_minutes,duration_seconds,show_filename_prefix,episode_filename_prefix
0,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Ep.05 Why We Don't Sell Presets,<p>Today on the podcast we go on a JOURNEY! We...,<p>Today on the podcast we go on a JOURNEY! We...,3256,54.2712,show_015DbLwcXu2fK7e9jIfbFo,74t5WREXUbhEKNI89CNSkL
1,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How to Grow as an Esports Player - The Wavelen...,Ever wanted a podcast from your three favorite...,Ever wanted a podcast from your three favorite...,4650,77.50085,show_01DbRiALDPdvZdoiY8yQL6,5fG4VlWnWwzAt6mSs0H7lY
2,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Observations and an...,<p>The Irish Illustrated Insider crew discusse...,<p>The Irish Illustrated Insider crew discusse...,3415,56.9303,show_01Txd706SjsgvM0cm0UXuM,5hvOWPoB0j6HMrSVAMtJLV
3,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Previewing Notre Da...,<p>Irish Illustrated Insider tackles NFL Combi...,<p>Irish Illustrated Insider tackles NFL Combi...,2808,46.802917,show_01Txd706SjsgvM0cm0UXuM,7JG3lLnRoDdOxuqjf14ZkM
4,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Episode 73 - Comeback SZN,<p>Breaking down a classic Calgary Flames come...,<p>Breaking down a classic Calgary Flames come...,2969,49.487233,show_01eumErJvBdxCW4YJivbwc,2WQ1GcC6J0k7qsO8Vvf2be


In [13]:
# Function to find the episode corresponding to the summary from the xml
def find_json_path(row):
  xml = row[0]
  episode = row[7]
  json_super_folder = '/content/drive/MyDrive/TREC/spotify-podcasts-2020/podcasts-transcripts-summarization-testset/'
  xml_super_folder = '/content/drive/MyDrive/TREC/spotify-podcasts-2020/show-rss-summarization-testset/'
  json = xml.replace(xml_super_folder, json_super_folder)
  json = json.replace('.xml','/'+episode+'.json')
  #print(xml)
  #print(json)
  return json

In [14]:
# Find the json for every summary
merged_info['json_filepath'] = merged_info.apply(find_json_path, axis=1)

In [15]:
# get a glimpse of the data
merged_info.head()

Unnamed: 0,xml_filepath,title,summary,description,duration_minutes,duration_seconds,show_filename_prefix,episode_filename_prefix,json_filepath
0,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Ep.05 Why We Don't Sell Presets,<p>Today on the podcast we go on a JOURNEY! We...,<p>Today on the podcast we go on a JOURNEY! We...,3256,54.2712,show_015DbLwcXu2fK7e9jIfbFo,74t5WREXUbhEKNI89CNSkL,/content/drive/MyDrive/TREC/spotify-podcasts-2...
1,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How to Grow as an Esports Player - The Wavelen...,Ever wanted a podcast from your three favorite...,Ever wanted a podcast from your three favorite...,4650,77.50085,show_01DbRiALDPdvZdoiY8yQL6,5fG4VlWnWwzAt6mSs0H7lY,/content/drive/MyDrive/TREC/spotify-podcasts-2...
2,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Observations and an...,<p>The Irish Illustrated Insider crew discusse...,<p>The Irish Illustrated Insider crew discusse...,3415,56.9303,show_01Txd706SjsgvM0cm0UXuM,5hvOWPoB0j6HMrSVAMtJLV,/content/drive/MyDrive/TREC/spotify-podcasts-2...
3,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Previewing Notre Da...,<p>Irish Illustrated Insider tackles NFL Combi...,<p>Irish Illustrated Insider tackles NFL Combi...,2808,46.802917,show_01Txd706SjsgvM0cm0UXuM,7JG3lLnRoDdOxuqjf14ZkM,/content/drive/MyDrive/TREC/spotify-podcasts-2...
4,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Episode 73 - Comeback SZN,<p>Breaking down a classic Calgary Flames come...,<p>Breaking down a classic Calgary Flames come...,2969,49.487233,show_01eumErJvBdxCW4YJivbwc,2WQ1GcC6J0k7qsO8Vvf2be,/content/drive/MyDrive/TREC/spotify-podcasts-2...


In [16]:
# Function to merge all the components of the the json to get a single string 
def extract_json_text(json_file):  
  with open(json_file) as f:
    json_content = json.load(f)['results']
    json_text = ""
    for item in json_content:
      for sub_item in item['alternatives']:
        if 'transcript' in sub_item.keys():
          json_text+=" "+sub_item['transcript']
    json_text = re.sub(' +', ' ', json_text)
  return json_text

In [17]:
# Apply the function to read the json and return the transcript as a single string
merged_info['json_transcript'] = merged_info['json_filepath'].progress_apply(lambda x: extract_json_text(x))

100%|██████████| 917/917 [07:33<00:00,  2.02it/s]


In [18]:
# Find a glimpse of the data
merged_info.head()

Unnamed: 0,xml_filepath,title,summary,description,duration_minutes,duration_seconds,show_filename_prefix,episode_filename_prefix,json_filepath,json_transcript
0,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Ep.05 Why We Don't Sell Presets,<p>Today on the podcast we go on a JOURNEY! We...,<p>Today on the podcast we go on a JOURNEY! We...,3256,54.2712,show_015DbLwcXu2fK7e9jIfbFo,74t5WREXUbhEKNI89CNSkL,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Welcome back to another episode of tuxedo tim...
1,/content/drive/MyDrive/TREC/spotify-podcasts-2...,How to Grow as an Esports Player - The Wavelen...,Ever wanted a podcast from your three favorite...,Ever wanted a podcast from your three favorite...,4650,77.50085,show_01DbRiALDPdvZdoiY8yQL6,5fG4VlWnWwzAt6mSs0H7lY,/content/drive/MyDrive/TREC/spotify-podcasts-2...,"What's up, guys? This episode of the podcast ..."
2,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Observations and an...,<p>The Irish Illustrated Insider crew discusse...,<p>The Irish Illustrated Insider crew discusse...,3415,56.9303,show_01Txd706SjsgvM0cm0UXuM,5hvOWPoB0j6HMrSVAMtJLV,/content/drive/MyDrive/TREC/spotify-podcasts-2...,You are listening to Irish illustrate Insider...
3,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Irish Illustrated Insider: Previewing Notre Da...,<p>Irish Illustrated Insider tackles NFL Combi...,<p>Irish Illustrated Insider tackles NFL Combi...,2808,46.802917,show_01Txd706SjsgvM0cm0UXuM,7JG3lLnRoDdOxuqjf14ZkM,/content/drive/MyDrive/TREC/spotify-podcasts-2...,You have tuned into Irish Illustrated Insider...
4,/content/drive/MyDrive/TREC/spotify-podcasts-2...,Episode 73 - Comeback SZN,<p>Breaking down a classic Calgary Flames come...,<p>Breaking down a classic Calgary Flames come...,2969,49.487233,show_01eumErJvBdxCW4YJivbwc,2WQ1GcC6J0k7qsO8Vvf2be,/content/drive/MyDrive/TREC/spotify-podcasts-2...,"What's up, everybody? Welcome to the in the d..."


In [None]:
# Write the output to a file
merged_info.to_excel('/content/drive/MyDrive/summarization_aggregated_data.xlsx', index=False)