# Imports

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('ggplot')

import seaborn as sns

params = {
    'text.color': (0.25, 0.25, 0.25),
    'figure.figsize': [18, 6],
   }

plt.rcParams.update(params)

import pandas as pd
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

import numpy as np
from numpy import percentile
np.random.seed(42)

import requests
from bs4 import BeautifulSoup

import re
import copy
import json
import time
import string
from datetime import datetime

from tqdm.notebook import tqdm

from tabulate import tabulate
import feedparser

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
# Save your Spotify API keys in a separate config.py file.
# Import both keys.
# (Keeping the keys in config.py means they never appear in the notebook itself.)
from config import CLIENT_ID, CLIENT_SECRET

# Build the credentials manager from the two keys and hand it to the client.
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID,
                                                      client_secret=CLIENT_SECRET)
# `spotify` is the API client used in the Spotify section to request the episodes of each show.
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# What is the ideal length of a podcast?
---

One interesting aspect of podcasting is that there are very few creative rules, if any. Podcasts are still an emerging medium and best practices are being invented day by day. Every creator is free to find their own approach to concept, format, etc. 

In this quick analysis, I examine one specific and, from my perspective, quite relevant aspect of formatting – the length of a podcast. I know from professional experience that the ideal length of a podcast is discussed intensively and controversially among podcasters. Opinions range from *«It doesn't matter at all as long as the content is interesting and captivating...»* to *«Under no circumstances longer than X minutes for this genre...»*.

I have listened to everything from very short podcasts/episodes of just a couple of minutes to very long formats like [«Alles gesagt»](https://www.zeit.de/serie/alles-gesagt), which can potentially last for days. I wonder – how can creators find «their» ideal episode length with good reason? Is it just the time the topic needs? Is the perceived time span more important than the factual duration? 

A first step to clarify these questions is to look into the market. What are «the usual lengths» that successful creators use in general or in particular genres?

For this analysis, I gather meta-data from top podcasts from the two most important podcast platforms: iTunes and Spotify.

# tl;dr
- **I analysed ~225k episodes of ~1.8k iTunes podcasts** and **37k episodes of ~800 Spotify podcasts**.
- **A prototypical length of a podcast episode is around 40 minutes.** 
- **90% of all podcast episodes have a length between 20 and 60 minutes.**
- **Typical lengths vary between the different genres – with median values between roughly 15 and 70 minutes.**


# iTunes
---

## Get podcast metadata from RSS feeds
Apple categorizes podcasts in genres that can either be searched by name or ID. Let's start by retrieving these. 

In [4]:
# Download the Apple Podcasts genre overview page (the `de` storefront URL).
url = "https://podcasts.apple.com/de/genre/podcasts/id26"
res = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
res.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results could differ between machines.
soup = BeautifulSoup(res.content, "html.parser")

In [5]:
# Retrieve top genres
# Each top-level genre link ends in ".../id<NUMBER>"; keep the link text as the
# genre name and the trailing number as its integer ID.
results = [
    (link.text, int(link["href"].split("/")[-1].replace("id", "")))
    for link in soup.find_all("a", class_="top-level-genre")
]
top_genres = dict(results)
print("Apple podcast top genres")
display(top_genres)

Apple podcast top genres


{'Kunst': 1301,
 'Wirtschaft': 1321,
 'Comedy': 1303,
 'Bildung': 1304,
 'Fiktion': 1483,
 'Regierung': 1511,
 'Gesundheit und Fitness': 1512,
 'Geschichte': 1487,
 'Kinder und Familie': 1305,
 'Freizeit': 1502,
 'Musik': 1310,
 'Nachrichten': 1489,
 'Religion und Spiritualität': 1314,
 'Wissenschaft': 1533,
 'Gesellschaft und Kultur': 1324,
 'Sport': 1545,
 'TV und Film': 1309,
 'Technologie': 1318,
 'Wahre Kriminalfälle': 1488}

Now we query the current iTunes charts for all genres, clean the results and save to disk.

Every podcast has one primary genre but can belong to other genres and will be listed in these as well.

In [6]:
# Helper functions

def get_podcast_data(genre_id, market="de", top_n_podcasts=10):
    """Return metadata for the top podcasts of one iTunes genre.

    Two requests are made: the iTunes top-podcast RSS feed (as JSON) for the
    genre, which is only used to collect the podcast IDs, and the iTunes
    lookup endpoint, which is queried once with all IDs.

    Parameters
    ----------
    genre_id : int or str
        Genre ID inserted into the chart URL.
    market : str, default "de"
        Storefront code inserted into the chart URL.
    top_n_podcasts : int, default 10
        Value of the chart's `limit` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per lookup result with the columns 'trackId', 'artistName',
        'title' (renamed from 'collectionName'), 'feedUrl', 'releaseDate',
        'trackCount', 'primaryGenreName', 'genreIds' and 'genres'. Columns
        missing from the lookup response are filled with NaN.

    Raises
    ------
    requests.HTTPError
        If either request returns an error status.
    """
    # Reduce to most informative columns.
    cols = ['trackId', 'artistName', 'title',
            'feedUrl', 'releaseDate',
            'trackCount', 'primaryGenreName',
            'genreIds', 'genres']

    url = f"https://itunes.apple.com/{market}/rss/topaudiopodcasts/limit={top_n_podcasts}/genre={genre_id}/explicit=true/json"
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    data = res.json()

    # The JSON from the API contains redundant key names and is overly complicated in its structure.
    # Therefore we just extract the podcast IDs and make a batch query to the iTunes search API which yields cleaner metadata.
    entries = data["feed"].get("entry", [])
    # NOTE(review): a chart with exactly one podcast appears to be delivered as a
    # single object instead of a list – normalise so both shapes work; confirm against the API.
    if isinstance(entries, dict):
        entries = [entries]
    if not entries:
        # Nothing to look up; skip the second request and return an empty table.
        return pd.DataFrame(columns=cols)
    ids = ",".join(entry["id"]["attributes"]["im:id"] for entry in entries)

    # Query iTunes lookup endpoint with list of podcast IDs.
    query_url = f"https://itunes.apple.com/lookup?id={ids}"
    res_feeds = requests.get(query_url, timeout=30)
    res_feeds.raise_for_status()
    df = pd.DataFrame(res_feeds.json()["results"])
    df = df.rename(columns={"collectionName": "title"})

    # reindex (instead of df[cols]) so that one missing column does not abort the whole genre.
    return df.reindex(columns=cols)


# Seconds per colon-separated field, counted from the right:
# position 0 = seconds, 1 = minutes, 2 = hours.
timestamp_to_unit = {0: 1,
                     1: 60,
                     2: 3600}


def convert_time_string_to_seconds(timestr):
    """Convert a duration string such as "01:02:03", "12:30" or "45" to seconds.

    Parameters
    ----------
    timestr : str (None, NaN and plain numbers are tolerated)
        Duration with up to three colon-separated integer fields
        (hours:minutes:seconds, minutes:seconds or seconds).

    Returns
    -------
    int or float
        The duration in seconds, or `np.nan` when the value is missing, empty,
        contains a "." or cannot be parsed. Unparseable values additionally
        print a short notice.
    """
    # Missing values are not strings; calling .replace on them would raise.
    if timestr is None or (isinstance(timestr, float) and np.isnan(timestr)):
        return np.nan
    timestr = str(timestr)

    # Some samples have «min» in string, replace these.
    # NOTE(review): a bare value like «45 min» is then read as 45 *seconds* –
    # confirm against the feeds whether that is intended.
    timestr = timestr.replace(" min", "").strip()

    # Some samples contain an empty string, replace these with np.nan.
    if timestr == "":
        return np.nan

    # Some samples contain erroneous float values as strings, replace these with np.nan.
    if "." in timestr:
        return np.nan

    try:
        # Walk the fields right to left so that position 0 is always the seconds field.
        return sum(int(part) * timestamp_to_unit[idx]
                   for idx, part in enumerate(reversed(timestr.split(":"))))
    except (ValueError, KeyError):
        # ValueError: a field is not an integer; KeyError: more than three fields.
        print(f"Likely wrong value «{timestr}» for duration. Omitting this sample.")
        return np.nan


def get_episode_data(podcast):
    """Parse one podcast's RSS feed and return its episodes as a DataFrame.

    Parameters
    ----------
    podcast : row-like object
        Must provide the attributes `feedUrl`, `artistName`, `title` and
        `trackId` (e.g. a row from `DataFrame.iterrows()`).

    Returns
    -------
    pandas.DataFrame or float
        One row per feed entry that has a duration, with the columns
        'trackId', 'artistName', 'title', 'episode_title', 'itunes_duration',
        'summary', 'published' and 'duration_secs'.
        `np.nan` is returned as a sentinel when the feed has no entries or
        none of its entries carries an `itunes_duration` field.
    """
    feed_cols = ['title', 'itunes_duration', 'summary', 'published']
    cols_reorder = ['trackId', 'artistName', 'title', 'episode_title',
                    'itunes_duration', 'summary', 'published']

    main_feed = feedparser.parse(podcast.feedUrl)
    entries = main_feed["entries"]
    if len(entries) == 0:
        return np.nan

    episodes = pd.DataFrame(entries)

    # Some feeds do not contain a value for duration at all, return np.nan in this case.
    if "itunes_duration" not in episodes.columns:
        return np.nan

    df_ep = (
        episodes
        # reindex instead of episodes[feed_cols]: a feed without e.g. 'summary'
        # gets a NaN column rather than raising KeyError and losing the podcast.
        .reindex(columns=feed_cols)
        .rename(columns={"title": "episode_title"})
        .assign(artistName=podcast.artistName,
                title=podcast.title,
                trackId=podcast.trackId)
        .loc[:, cols_reorder]
        .reset_index(drop=True)
        .dropna(subset=["itunes_duration"])
    )
    df_ep["duration_secs"] = df_ep.itunes_duration.apply(convert_time_string_to_seconds)

    return df_ep

Request the **podcast** data.

Note: Queries to the iTunes Search API sometimes fail for no apparent reason. In this case, simply re-run the cell below.

In [14]:
%%time

market = "de"
top_n_podcasts = 200

# Query the chart of every top-level genre and tag each result with the genre
# it was listed under (a podcast can appear under several genres).
frames_pods = []
for genre_name, genre_id in tqdm(top_genres.items()):
    print(f"Retrieving metadata for genre: «{genre_name}»")
    df_pods = get_podcast_data(genre_id, market=market, top_n_podcasts=top_n_podcasts)
    df_pods = df_pods.assign(genre=genre_name, genre_id=genre_id)
    frames_pods.append(df_pods)

# Stack all genres into one table with a fresh 0..n-1 index.
df = pd.concat(frames_pods, ignore_index=True)
print(f"{len(df):,.0f} podcasts found.")

df.to_csv(f"_data/itunes_podcasts_{market}.csv")

Request the **episode** data. Metadata is retrieved for all available episodes in the RSS feed.

In [17]:
frames_eps = []

# iterrows() has no length, so pass `total` explicitly – otherwise the
# progress bar only shows a running counter ("0it") without any progress.
for _, podcast in tqdm(df.iterrows(), total=len(df)):
    # Best effort: a broken or unreachable feed must not abort the whole run,
    # but report which podcast was skipped so the gap can be traced.
    try:
        df_eps = get_episode_data(podcast)
    except Exception as e:
        print(f"Skipping «{podcast.title}»: {e}")
        continue

    # get_episode_data signals "no usable episodes" with np.nan instead of a DataFrame.
    if not isinstance(df_eps, pd.DataFrame):
        continue

    df_eps["genre"] = podcast.genre
    df_eps["genre_id"] = podcast.genre_id
    frames_eps.append(df_eps)

0it [00:00, ?it/s]

Likely wrong value «01:05:57:59» for duration. Omitting this sample.
Likely wrong value «01:03:48:00» for duration. Omitting this sample.


In [18]:
# Stack the per-podcast episode frames into one table with a fresh 0..n-1 index.
df_eps = pd.concat(frames_eps, ignore_index=True)
print(f"{len(df_eps):,.0f} episodes found.")

# # Save data for later use to avoid excessive calls to APIs.
# df_eps.to_csv(f"_data/itunes_episodes_{market}.csv", index=False)

11,622 episodes found.


## Analyse iTunes episode data

Before analysis we remove:
- episodes without a value for duration
- episodes shorter than 2 minutes (mostly teasers, trailers, announcements or erroneous values)
- episodes longer than 10 hours (mostly errors though there are some gaming podcast episodes that are actually longer than 10 hours...)

In [28]:
# Reload the saved iTunes episode data from disk.
df_eps = pd.read_csv("_data/itunes_episodes_de.csv")
print(f"{len(df_eps):,.0f} episodes in full data set.")

# Step 1: drop episodes without a value for duration.
nan_count = int(df_eps.duration_secs.isna().sum())
print(f"{nan_count} episodes without a value for duration.")
df_eps = df_eps.dropna(subset=["duration_secs"]).reset_index(drop=True)

# Step 2: drop episodes shorter than 2 minutes (120 s), which very likely
# are trailers and announcements.
to_drop = df_eps.index[df_eps.duration_secs < 120]
print(f"{len(to_drop):,.0f} episodes shorter than 2 minutes.")
df_eps = df_eps.drop(index=to_drop).reset_index(drop=True)

# Step 3: drop episodes longer than 10 hours (36,000 s), which very likely
# are erroneous values.
to_drop = df_eps.index[df_eps.duration_secs > 36_000]
print(f"{len(to_drop)} episodes longer than 10 hours.")
df_eps = df_eps.drop(index=to_drop).reset_index(drop=True)

# Duration in minutes, used for plotting.
df_eps["duration_mins"] = df_eps.duration_secs / 60

print(f"{len(df_eps):,.0f} episodes in data set after cleaning.")
print()

232,055 episodes in full data set.
620 episodes without a value for duration.
6,469 episodes shorter than 2 minutes.
20 episodes longer than 10 hours.
224,946 episodes in data set after cleaning.



We now **calculate basic statistics** of the lengths of the episodes. 

We have very different episode counts per title. Therefore we first calculate the statistics **per podcast** and only then aggregate these to single values.

In [8]:
def get_percentiles(data, perc):
    """Return the `perc`-th percentile (0-100) of `data` via `np.percentile`."""
    percentile_value = np.percentile(data, perc)
    return percentile_value


def get_statistics(data, group_key="title"):
    """Summarise episode durations, aggregated per group first.

    Every statistic is computed on `duration_secs` within each `group_key`
    group and the per-group values are then collapsed to a single number,
    converted to minutes and truncated to an int.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns `group_key` and `duration_secs`.
    group_key : str, default "title"
        Column to group by before aggregating.

    Returns
    -------
    list of int
        [median, mean, std, min, max, 5th percentile, 95th percentile] in minutes:
        median of group medians, mean of group means, median of group standard
        deviations, smallest group minimum, largest group maximum, median of the
        group 5th percentiles, median of the group 95th percentiles.
    """
    durations = data.groupby(group_key).duration_secs

    def to_minutes(seconds):
        # Seconds -> whole minutes (truncating).
        return int(seconds / 60)

    per_group_p5 = durations.agg(lambda secs: get_percentiles(secs, 5))
    per_group_p95 = durations.agg(lambda secs: get_percentiles(secs, 95))

    return [
        to_minutes(durations.median().median()),
        to_minutes(durations.mean().mean()),
        to_minutes(durations.std().median()),
        to_minutes(durations.min().min()),
        to_minutes(durations.max().max()),
        to_minutes(per_group_p5.median()),
        to_minutes(per_group_p95.median()),
    ]

In [9]:
# One row per genre: (genre name, median, mean, std, min, max, 5%, 95%),
# with the statistics aggregated per podcast title first.
results = [
    (genre_label, *get_statistics(genre_episodes, group_key="title"))
    for genre_label, genre_episodes in df_eps.groupby("genre")
]

Observations:
- We have outliers in the data (see plots below), so the mean is not a suitable metric in this case. I only calculate it for completeness' sake.
- The median length varies substantially, with a lowest value of ~20 minutes for genre «Bildung» up to ~65 minutes for genre «TV und Film».
- From a practical perspective it makes sense to look at the percentiles. E.g., 90% of all episodes in genre «Bildung» have a length between 10 and 43 minutes. If you publish an episode in this percentile range of your genre you are «in good company».

In [10]:
print("iTunes podcast lengths – various statistics, values in minutes")
print(f"Calculated from {df_eps.title.nunique():,.0f} podcasts and {len(df_eps):,.0f} episodes\n")

# Build the per-genre table, longest median first, and give the genre column a display name.
stat_columns = ["genre", "median", "mean", "std", "min", "max", "5%", "95%"]
stats = (
    pd.DataFrame(results, columns=stat_columns)
    .sort_values(by="median", ascending=False)
    .rename(columns={"genre": "iTunes Genre"})
)
print(tabulate(stats, showindex=False, headers="keys"))
print()

iTunes podcast lengths – various statistics, values in minutes
Calculated from 1,766 podcasts and 224,946 episodes

iTunes Genre                  median    mean    std    min    max    5%    95%
--------------------------  --------  ------  -----  -----  -----  ----  -----
TV und Film                       65      68     19      2    358    34     96
Comedy                            62      64     12      2    486    42     78
Musik                             60      65     15      2    550    57     78
Freizeit                          59      66     15      2    594    31     82
Sport                             52      58     16      2    311    24     83
Technologie                       47      52     13      2    372    26     66
Gesellschaft und Kultur           42      47     11      2    310    26     61
Kunst                             42      47     13      2    437    18     68
Wahre Kriminalfälle               40      44     12      2    285    23     61
Geschichte     

Looking at all podcasts regardless of genre we get a **typical length of ~39 minutes** and **a range between 20 and 61 minutes wherein 90% of all episode lengths lie**.

In [11]:
# Statistics over all episodes regardless of genre, as a one-row table.
stats_total = pd.DataFrame(
    [get_statistics(df_eps)],
    columns=["median", "mean", "std", "min", "max", "5%", "95%"],
)
print("Statistics for all episodes")
print(tabulate(stats_total, showindex=False, headers="keys"))
print()

Statistics for all episodes
  median    mean    std    min    max    5%    95%
--------  ------  -----  -----  -----  ----  -----
      39      45     12      2    594    20     61



From **plotting the lengths for each podcast genre** we observe:
- The data is not normally distributed but quite skewed.
- There are a lot of outliers which mostly are unusually long episodes. 
- Some genres have clear length patterns, e.g., «Musik» with clear clusters at 30, 60, 120 minutes. Or «Wissenschaft» with two distinct clusters around 10 and 60 minutes. Some of these might be radio broadcasts that are published as podcasts too.

In [16]:
# Tick positions every 30 minutes.
rng = range(0, 630, 30)

# One figure per genre: strip plot of all episode lengths on top, box plot below.
for genre in df_eps.genre.unique():
    tmp = df_eps.loc[df_eps.genre == genre]

    fig, axes = plt.subplots(nrows=2, figsize=(16, 6))
    strip_ax, box_ax = axes
    fig.suptitle(f"iTunes podcast genre: «{genre}»", size=24)
    sns.stripplot(data=tmp, x="duration_mins", jitter=0.4, size=1.5, ax=strip_ax)
    sns.boxplot(data=tmp, x="duration_mins", ax=box_ax)

    # Same x axis for both panels; only the lower panel carries the label.
    for ax_ in axes:
        ax_.set_xticks(rng)
        ax_.set_xlim(0, 300)
    strip_ax.set_xlabel("")
    box_ax.set_xlabel("Duration, in minutes, cut off at 300 mins for better readability")

    fig.tight_layout()
    fig.savefig(f'_plots/itunes_{genre.replace(" ", "_")}.png')
    plt.show()

# Spotify
---

## Get podcast data from Spotify API

I couldn't find an official endpoint for Spotify's podcast top charts. I therefore scrape the current charts from [this website](https://podcastcharts.byspotify.com/).

In [17]:
# Scrape data from this site: https://podcastcharts.byspotify.com/
market = "de"
# Query parameters sent with every chart request.
params = (('region', market),)

# (URL slug, readable name) for every chart that is requested.
# The slugs are kept exactly as they are inserted into the request URL.
_GENRE_SLUGS_AND_NAMES = [
    ("top", "Top Charts"),
    ("arts", "Arts"),
    ("business", "Business"),
    ("comedy", "Comedy"),
    ("education", "Education"),
    ("fiction", "Fiction"),
    ("health%252520%2526%252520fitness", "Health & Fitness"),
    ("history", "History"),
    ("leisure", "Leisure"),
    ("music", "Music"),
    ("news", "News"),
    ("religion%252520%2526%252520spirituality", "Religion & Spirituality"),
    ("science", "Science"),
    ("society%252520%2526%252520culture", "Society & Culture"),
    ("sports", "Sports"),
    ("technology", "Technology"),
    ("true%252520crime", "True Crime"),
    ("tv%252520%2526%252520film", "TV & Film"),
]

GENRES = [slug for slug, _ in _GENRE_SLUGS_AND_NAMES]
GENRES_CLEAN = [name for _, name in _GENRE_SLUGS_AND_NAMES]

genre_mapping = dict(_GENRE_SLUGS_AND_NAMES)

In [27]:
frames_shows = []

# zip() has no length, so pass `total` to get a real progress bar.
for genre, genre_clean in tqdm(zip(GENRES, GENRES_CLEAN), total=len(GENRES)):
    # NOTE(review): the URL contains a double slash ("charts//") – kept as is
    # because the captured output shows it working; confirm it is intended.
    url = f"https://podcastcharts.byspotify.com/api/charts//{genre}"
    res = requests.get(url, params=params, timeout=30)
    # Explicit HTTP error instead of a bare assert (asserts vanish under `python -O`).
    res.raise_for_status()
    df_chart = pd.DataFrame(res.json())
    df_chart["genre"] = genre_clean
    print(genre_clean, len(df_chart))
    frames_shows.append(df_chart)
    # Pause between requests to be gentle on the site.
    time.sleep(1)

# All charts in one table with a fresh 0..n-1 index.
df = pd.concat(frames_shows, ignore_index=True)

# Save result with market in the file name
market = "de"
df.to_csv(f"_data/spotify_podcasts_{market}.csv", index=False)

0it [00:00, ?it/s]

Top Charts 200
Arts 50
Business 50
Comedy 50
Education 50
Fiction 41
Health & Fitness 50
History 50
Leisure 50
Music 50
News 50
Religion & Spirituality 50
Science 50
Society & Culture 50
Sports 50
Technology 36
True Crime 50
TV & Film 43


In [31]:
# Number of chart entries across all Spotify charts (one row per chart position).
print(f"{len(df)} podcasts found in Spotify's top lists.")

1020 podcasts found in Spotify's top lists.


Retrieve **episode data** for all podcasts.

In [35]:
frames_episodes = []

# iterrows() has no length, so pass `total` to get a real progress bar.
for _, data in tqdm(df.iterrows(), total=len(df)):
    # NOTE(review): a single request per show – only the episodes contained in
    # this first response page are used, there is no pagination. Confirm that
    # this cap is intended.
    # Use the same `market` variable that names the output files instead of a
    # second hard-coded "de".
    shows = spotify.show_episodes(data.showUri, market=market)
    shows = pd.DataFrame(shows["items"])
    # Carry the chart information over to every episode of the show.
    shows["genre"] = data.genre
    shows["showUri"] = data.showUri
    shows["showName"] = data.showName
    shows["showPublisher"] = data.showPublisher
    frames_episodes.append(shows)

0it [00:00, ?it/s]

In [36]:
# All episodes in one table with a fresh 0..n-1 index.
df_eps = pd.concat(frames_episodes, ignore_index=True)
# Spotify reports durations in milliseconds; derive seconds and minutes.
df_eps["duration_secs"] = df_eps.duration_ms / 1000
df_eps["duration_mins"] = df_eps.duration_secs / 60
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" below the report.
df_eps.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37953 entries, 0 to 37952
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   audio_preview_url       37655 non-null  object 
 1   description             37953 non-null  object 
 2   duration_ms             37953 non-null  int64  
 3   explicit                37953 non-null  bool   
 4   external_urls           37953 non-null  object 
 5   href                    37953 non-null  object 
 6   html_description        37953 non-null  object 
 7   id                      37953 non-null  object 
 8   images                  37953 non-null  object 
 9   is_externally_hosted    37953 non-null  bool   
 10  is_playable             37953 non-null  bool   
 11  language                37953 non-null  object 
 12  languages               37953 non-null  object 
 13  name                    37953 non-null  object 
 14  release_date            37953 non-null

None

In [37]:
# Persist the Spotify episode table (without the index column); the file name
# carries the market code, e.g. "_data/spotify_episodes_de.csv" for market "de".
df_eps.to_csv(f"_data/spotify_episodes_{market}.csv", index=False)

## Analyse Spotify episode data

In [18]:
# Reload the saved Spotify episode data from disk.
df_eps = pd.read_csv("_data/spotify_episodes_de.csv", low_memory=False)
print(f"{len(df_eps):,.0f} episodes in full data set.")

# Step 1: drop episodes without a value for duration.
nan_count = int(df_eps.duration_secs.isna().sum())
print(f"{nan_count} episodes without a value for duration.")
df_eps = df_eps.dropna(subset=["duration_secs"]).reset_index(drop=True)

# Step 2: drop episodes shorter than 2 minutes (120 s), which very likely
# are trailers and announcements.
to_drop = df_eps.index[df_eps.duration_secs < 120]
print(f"{len(to_drop):,.0f} episodes shorter than 2 minutes.")
df_eps = df_eps.drop(index=to_drop).reset_index(drop=True)

# There aren't any episodes that are longer than 10 hours.
# So nothing to remove in terms of unusually long (outlying) episodes.

print(f"{len(df_eps):,.0f} episodes in data set after cleaning.")
print()

37,953 episodes in full data set.
0 episodes without a value for duration.
724 episodes shorter than 2 minutes.
37,229 episodes in data set after cleaning.



In [19]:
g = df_eps.groupby("genre")

# One row per genre: (genre name, median, mean, std, min, max, 5%, 95%),
# with the statistics aggregated per show first.
results = [
    (genre_label, *get_statistics(genre_episodes, group_key="showName"))
    for genre_label, genre_episodes in g
]

Observations:
- The median length varies substantially, with a lowest value of ~14 minutes for genre «Religion & Spirituality» up to ~70 minutes for genre «TV & Film».
- Again the percentiles provide meaningful goal posts. As an example: 90% of all episodes in «Comedy» have a length between 44 and 67 minutes. Lengths in this range can be considered prototypical for this genre.

In [20]:
print("Spotify podcast lengths – various statistics, values in minutes")
print(f"Calculated from {len(df_eps):,.0f} episodes of {df_eps.showName.nunique():,.0f} podcasts\n")

# Build the per-genre table, longest median first, and give the genre column a display name.
stat_columns = ["genre", "median", "mean", "std", "min", "max", "5%", "95%"]
stats = (
    pd.DataFrame(results, columns=stat_columns)
    .sort_values(by="median", ascending=False)
    .rename(columns={"genre": "Spotify Genre"})
)
print(tabulate(stats, showindex=False, headers="keys"))
print()

Spotify podcast lengths – various statistics, values in minutes
Calculated from 37,229 episodes of 821 podcasts

Spotify Genre              median    mean    std    min    max    5%    95%
-----------------------  --------  ------  -----  -----  -----  ----  -----
TV & Film                      70      71     16      2    290    45     94
Sports                         60      62     13      2    192    40     83
Comedy                         55      55      7      2    228    44     67
Society & Culture              53      60      9      2    520    38     64
True Crime                     46      50     11      2    247    31     62
Music                          44      51      9      2    222    32     60
Top Charts                     43      46      9      2    520    27     58
History                        42      45     10      2    346    28     56
Technology                     39      44     11      2    253    20     60
Leisure                        38      48     11   

Looking at all podcasts regardless of genre we get a **typical length of ~38 minutes** and **a range between 23 and 55 minutes wherein 90% of all episodes lie**. 

This is very close to our findings in the iTunes data (39 minutes median, 20-61 minutes range).

In [21]:
# Statistics over all Spotify episodes regardless of genre, as a one-row table.
stats_total = pd.DataFrame(
    [get_statistics(df_eps, group_key="showName")],
    columns=["median", "mean", "std", "min", "max", "5%", "95%"],
)
print("Statistics for all episodes")
print(tabulate(stats_total, showindex=False, headers="keys"))
print()

Statistics for all episodes
  median    mean    std    min    max    5%    95%
--------  ------  -----  -----  -----  ----  -----
      38      45     10      2    582    23     55



From plotting the lengths for each podcast genre we observe:
- The data is not normally distributed.
- We see a lot of outliers. 
- Distinct length clusters e.g. in genres «News» and «Science». 

In [22]:
# Tick positions every 30 minutes.
rng = range(0, 630, 30)

# One figure per genre: strip plot of all episode lengths on top, box plot below.
for genre in df_eps.genre.unique():
    tmp = df_eps.loc[df_eps.genre == genre]

    fig, axes = plt.subplots(nrows=2, figsize=(16, 6))
    strip_ax, box_ax = axes
    fig.suptitle(f"Spotify podcast genre: «{genre}»", size=24)
    sns.stripplot(data=tmp, x="duration_mins", jitter=0.4, size=3, ax=strip_ax)
    sns.boxplot(data=tmp, x="duration_mins", ax=box_ax)

    # Same x axis for both panels; only the lower panel carries the label.
    for ax_ in axes:
        ax_.set_xticks(rng)
        ax_.set_xlim(0, 300)
    strip_ax.set_xlabel("")
    box_ax.set_xlabel("Duration, in minutes, cut off at 300 mins for better readability")

    fig.tight_layout()
    fig.savefig(f'_plots/spotify_{genre.replace(" ", "_")}.png')
    plt.show()