In [None]:
! pip install spotipy --upgrade

# 0. Login the Spotify Client

In [2]:
import os
with open("spotify-secret.txt", "r") as f:
    os.environ["SPOTIPY_CLIENT_ID"] = f.readline().strip()
    os.environ["SPOTIPY_CLIENT_SECRET"] = f.readline().strip()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

api = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

# 1. Search the shows
Search the shows by specifying the topic and keep all the English shows' uri in `topic_info` varibale.

In [3]:
topics = ["machine learning", "cooking", "crime", "politics", "kid",
          "comedy", "sport", "culture", "lifestyle", "business"]

## 1.1 Utility functions

In [4]:
def write_file(log, file):
    '''
        Write the logging file
    '''
    with open(file+".txt", "a") as f:
        f.write(log.strip()+'\n')

In [5]:
def check_language(show, topic):
    '''
        Keep only English shows
    '''
    log = f"{show['id']} , {show['name']} , {show['languages']} , {topic}"
    if any(["en" in lang for lang in show['languages']]):
        write_file(log, "included")
        return True
    else:
        write_file(log, "excluded")
        return False

def check_duplicate(x: list):
    '''
        Check if x has duplicated value.
    '''
    return len(set(x)) == len(x)

def window_search(topic, offset, limit):
    '''
        Searching by using Spotify API
        while keep the number of API calls as low as possible
    '''
    results = api.search(topic, type="show", market="TH", limit=limit, offset=offset)

    name, id, uri = [], [] , []
    notEN = 0

    for show in results['shows']['items']:
        if show['id'] in id:
            print(f"{show['id']} duplicated .. skipped")
            continue
        if check_language(show, topic):
            name.append(show['name'])
            id.append(show['id'])
            uri.append(show['uri'])
        else:
            notEN += 1
            
    return notEN, name, id, uri

## 1.2 Searching the shows in each topic 

In [7]:
from collections import defaultdict
import time

topics_info = {}
N_SHOWS = 50
n_requests = 0

for topic in topics:
    print(f"\nQuerying {topic} ...")

    show_info = defaultdict(lambda: list())
    offset = 0
    limit = N_SHOWS

    while len(show_info['id']) < N_SHOWS:

        time.sleep(0.5)
        notEN, name, id, uri = window_search(topic, 
                                             offset = offset,
                                             limit = limit)
        n_requests += limit
        show_info['name']+=name
        show_info['id']+=id
        show_info['uri']+=uri

        offset += N_SHOWS
        limit = notEN + 1
    
    topics_info[topic] = dict(show_info)
    print(f"\t {topic} has {len(show_info['id'])} shows")


Querying machine learning ...
	 machine learning has 50 shows

Querying cooking ...
	 cooking has 52 shows

Querying crime ...
	 crime has 52 shows

Querying politics ...
	 politics has 53 shows

Querying kid ...
	 kid has 50 shows

Querying comedy ...
	 comedy has 51 shows

Querying sport ...
	 sport has 52 shows

Querying culture ...
	 culture has 52 shows

Querying lifestyle ...
	 lifestyle has 53 shows

Querying business ...
	 business has 52 shows


In [8]:
print(f'total requests : {n_requests}')

total requests : 950


In [9]:
for topic in topics_info.keys():
    try:
        assert check_duplicate(topics_info[topic]["id"])
    except AssertionError:
        print(f'[{topic} has duplicates]')
    

[politics has duplicates]
[sport has duplicates]
[culture has duplicates]
[business has duplicates]


## 1.3 Export shows' info

In [10]:
import pandas as pd

df = pd.read_csv("included.txt", sep=" , ", header=None, engine="python")
df.columns=["id","name","language","label"]

In [11]:
df

Unnamed: 0,id,name,language,label
0,02e6PZeIOdpmBGT9THuzwR,Machine Learning Street Talk,['en'],machine learning
1,5M9yZpSyF1jc7uFp2MlhP9,Machine Learning Guide,['en'],machine learning
2,1LaCr5TFAgYPK5qHjP3XDp,"Practical AI: Machine Learning, Data Science",['en'],machine learning
3,7o9r3fFig3MhTJwehXDbXm,Gradient Dissent - A Machine Learning Podcast ...,['en'],machine learning
4,6g03vdHyFfYWYdRAPKfdQM,Adventures in Machine Learning,['en'],machine learning
...,...,...,...,...
512,1TeUxfqH1803TE9SZt6VIi,Master Builders Elevate: Building a Better Bus...,['en'],business
513,05sB5LFthkIXOfU6uYNe9A,People Business w/ O'Brien McMahon,['en-US'],business
514,7KCg1Xg0AG82jmqqBDurEx,Small Business Collective Podcast,['en'],business
515,29QJfd80NNfZtMon3SogZZ,Sports Business Updates,['en'],business


# 2. Query 50 episodes of each show

In [15]:
from urllib3.exceptions import MaxRetryError

total_df = pd.DataFrame(columns = ['ep_id','ep_name','show_name','description','label'])

In [16]:
for index, row in df.iterrows():
    print(f"Querying ... show: {row['name']}")

    try:
        result = api.show_episodes(row['id'], market="TH", limit=50)
    except (
        MaxRetryError,
        spotipy.client.SpotifyException) as e:
        print(e)
        write_file(f"{row['id']}, {row['name']}, {row['label']}", "failed_ep_query")
        print(f"skipped : {row['id']}, {row['name']}")

    new_row = defaultdict(lambda: list())
    c = 0
    for ep in result['items']:
        new_row['ep_id'].append(ep['id'])
        new_row['ep_name'].append(ep['name'])
        new_row['description'].append(ep['description'])
        new_row['label'].append(row['label'])
        new_row['show_name'].append(row['name'])
        c+=1

    print(f"\t\tcollected {c} episodes")
    total_df = total_df.append(
                    pd.DataFrame(dict(new_row))
                )
    
    time.sleep(0.5)

Querying ... show: Machine Learning Street Talk
		collected 50 episodes
Querying ... show: Machine Learning Guide
		collected 30 episodes
Querying ... show: Practical AI: Machine Learning, Data Science
		collected 50 episodes
Querying ... show: Gradient Dissent - A Machine Learning Podcast by W&B
		collected 50 episodes
Querying ... show: Adventures in Machine Learning
		collected 50 episodes
Querying ... show: The Medicine & Machine Learning Podcast (MaML)
		collected 12 episodes
Querying ... show: Cocktail Time Machine
		collected 46 episodes
Querying ... show: Learning Machine: The Uncertain Future of Education
		collected 5 episodes
Querying ... show: Machine Learning Africa
		collected 14 episodes
Querying ... show: The Interpretable Machine Learning Podcast
		collected 2 episodes
Querying ... show: Love Machine with James Preece
		collected 50 episodes
Querying ... show: Unstoppable Recording Machine Podcast
		collected 50 episodes
Querying ... show: Neura Pod: Learning about Neu

Max Retries reached


http status: 429, code:-1 - /v1/shows/5us0sMPtcjkKuFL4kNHa08/episodes/?limit=50&offset=0&market=TH:
 Max Retries, reason: too many 500 error responses
skipped : 5us0sMPtcjkKuFL4kNHa08, Costa Rica Pura Vida Lifestyle Podcast
		collected 50 episodes
Querying ... show: The OT Lifestyle Movement
		collected 45 episodes
Querying ... show: Ronak Shah Show | Books | Lifestyle 
		collected 50 episodes
Querying ... show: The Barbell Lifestyle Podcast
		collected 37 episodes
Querying ... show: My Lifestyle Academy Podcast
		collected 50 episodes
Querying ... show: Lifestyle U Podcast
		collected 18 episodes
Querying ... show: RV Lifestyle Expert
		collected 30 episodes
Querying ... show: Health Hero Show: The official Chemical Free Body Lifestyle Podcast
		collected 50 episodes
Querying ... show: The Baller Lifestyle Podcast
		collected 50 episodes
Querying ... show: The Black n Kinky Lifestyle: A Swinger's Podcast
		collected 50 episodes
Querying ... show: Lifestyle Leap
		collected 8 episodes


In [17]:
total_df.shape

(14127, 5)

In [18]:
mask = total_df.duplicated(subset=['description'], keep=False)
mask.sum()

1738

In [48]:
e=0
for i in total_df[mask].description:
    print(i)
    e+=1
    if e == 10:break

  Charles Max Wood discusses several opportunities that came his way early in his podcasting career and other opportunities that have come to other people after only a couple of podcast episodes. He explains why that happens and how you can use this to create more influence as a developer.    Panel      Charles Max Wood   
  Charles Max Wood discusses several opportunities that came his way early in his podcasting career and other opportunities that have come to other people after only a couple of podcast episodes. He explains why that happens and how you can use this to create more influence as a developer.    Panel      Charles Max Wood   
  Chuck outlines how he's used his podcasts to find mentors to continue his learning journey over 12 years of podcasting. Some mentors have been long lived relationships while others have lasted only a few months or even days. This episode shares Chuck's experience learning from the top people in the development community as a programmer and podcas

In [22]:
total_df.to_csv('total_df.csv', index=False)

In [23]:
pd.read_csv('total_df.csv')

Unnamed: 0,ep_id,ep_name,show_name,description,label
0,2QwU0zzueDDoi0fN3Kf0HR,#57 - Prof. Melanie Mitchell - Why AI is harde...,Machine Learning Street Talk,"Since its beginning in the 1950s, the field of...",machine learning
1,1ir7MrWUBwUI6AE7Na5diX,"#56 - Dr. Walid Saba, Gadi Singer, Prof. J. Ma...",Machine Learning Street Talk,It has been over three decades since the stati...,machine learning
2,1ACeGKFk4syDdcRuuuSZP7,#55 Self-Supervised Vision Models (Dr. Ishan M...,Machine Learning Street Talk,Dr. Ishan Misra is a Research Scientist at Fac...,machine learning
3,1vuc4azX8Mb0sl0lKOmxZi,#54 Gary Marcus and Luis Lamb - Neurosymbolic ...,Machine Learning Street Talk,"Professor Gary Marcus is a scientist, best-sel...",machine learning
4,6DrRmtpKKpH3yYJ6yPoMR2,#53 Quantum Natural Language Processing - Prof...,Machine Learning Street Talk,"Bob Coercke is a celebrated physicist, he's be...",machine learning
...,...,...,...,...,...
14122,27NZuo9cAYjf5kOzUTjeIk,Ep 05: Samuel Smith - Ramp Up Your Revenue Wit...,No Bull Business And Brews,Where do you see yourself in ten years? In twe...,business
14123,25UAT6zU2TdtgkspfTacPt,Ep 4: Hill Meats CEO Jim Cheney - Meet The Kin...,No Bull Business And Brews,"Sunday morning sun, the sizzle and pop of hot ...",business
14124,6wNFI4ONRSFzheWcIV2AaK,Ep 3: Level Up Your Business,No Bull Business And Brews,Are you ready to take your business and go big...,business
14125,3fbCPaqqj4iguoqayDowfW,Ep 2: Craig Coleman - So You Want To Open A Br...,No Bull Business And Brews,Are you an avid beer lover thirsty for more in...,business
