In [4]:
#Imports and load csv file of metadata
import pickle
import pandas as pd

# Location of the episode-level metadata export, relative to the notebook
METADATA_CSV = 'Podcast_Title_Metadata.csv'

# Parse with pandas' pure-Python CSV engine and preview the first rows
metadata = pd.read_csv(METADATA_CSV, engine='python')
metadata.head()

Unnamed: 0,id,duration,show_name,show_description,genres,episode_name,episode_description,explicit
0,1,23.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",Targeting Overseas Tax Shelters,"The I.R.S. says that Bristol Myers Squibb, Ame...",no
1,2,32.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",A Vast Web of Vengeance,How one woman with a grudge was able to slande...,no
2,3,27.0,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",A Military That Murders Its Own People,"Two months ago, Myanmar’s military carried out...",no
3,4,52.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",The Sunday Read: ‘The Beauty of 78.5 Million F...,"During the pandemic, cheerleader-ish girls per...",no
4,5,28.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",Inside the Biden Infrastructure Plan,President Biden is pushing the boundaries of h...,no


In [5]:
# Combine the text fields from the metadata into one corpus string per episode:
# show name, show description, genres, episode name and episode description.
# Non-text fields are left out so the comparison with the transcript-based methods stays fair.
TEXT_FIELDS = ["show_name", "show_description", "genres", "episode_name", "episode_description"]

# Plain `+` concatenation turns the whole corpus into NaN as soon as one field is
# missing (and raises on a non-string column), which breaks the later pickle/join
# steps. Filling gaps with '' and casting to str keeps every row a valid string;
# for fully populated string columns the result is identical to the `+` version.
metadata["corpus"] = (
    metadata[TEXT_FIELDS]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)
metadata.head()


Unnamed: 0,id,duration,show_name,show_description,genres,episode_name,episode_description,explicit,corpus
0,1,23.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",Targeting Overseas Tax Shelters,"The I.R.S. says that Bristol Myers Squibb, Ame...",no,The Daily This is what the news should sound l...
1,2,32.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",A Vast Web of Vengeance,How one woman with a grudge was able to slande...,no,The Daily This is what the news should sound l...
2,3,27.0,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",A Military That Murders Its Own People,"Two months ago, Myanmar’s military carried out...",no,The Daily This is what the news should sound l...
3,4,52.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",The Sunday Read: ‘The Beauty of 78.5 Million F...,"During the pandemic, cheerleader-ish girls per...",no,The Daily This is what the news should sound l...
4,5,28.5,The Daily,This is what the news should sound like. The b...,"Daily News, News, Politics, Business",Inside the Biden Infrastructure Plan,President Biden is pushing the boundaries of h...,no,The Daily This is what the news should sound l...


In [9]:
# Extract the per-episode corpus strings as a plain Python list (row order preserved)
metacorpi = metadata["corpus"].tolist()
# Peek at the first two entries
metacorpi[:2]

['The Daily This is what the news should sound like. The biggest stories of our time, told by the best journalists in the world. Hosted by Michael Barbaro. Twenty minutes a day, five days a week, ready by 6 a.m. Daily News, News, Politics, Business Targeting Overseas Tax Shelters The I.R.S. says that Bristol Myers Squibb, America’s second-largest drug company, has engaged a tax-shelter setup that has deprived the United States of $1.4 billion in tax revenue. The Biden administration is looking to put an end to such practices to pay for its policy ambitions, including infrastructure like improving roads and bridges and revitalizing cities. We look at the structure of these tax arrangements and explore how, and whether, it’s possible to clamp down on them.',
 'The Daily This is what the news should sound like. The biggest stories of our time, told by the best journalists in the world. Hosted by Michael Barbaro. Twenty minutes a day, five days a week, ready by 6 a.m. Daily News, News, Pol

In [10]:
# Pickle each episode corpus to its own file for later use
import os

# os.makedirs(exist_ok=True) is portable and, unlike `!mkdir`, stays silent
# when the directory already exists (e.g. when this cell is re-run).
os.makedirs("metacorpi", exist_ok=True)

# Create 1-based IDs, one per corpus. Deriving the count from the data avoids an
# IndexError when there are fewer than 300 rows and silent truncation when there are more.
IDs = list(range(1, len(metacorpi) + 1))
# Stringify
str_IDs = list(map(str, IDs))
for i, c in enumerate(str_IDs):
    # File name is "<ID>.txt", but the content is a pickle, not plain text
    with open("metacorpi/" + c + ".txt", "wb") as file:
        pickle.dump(metacorpi[i], file)
        

In [11]:
# Rebuild the {string podcast ID -> metadata corpus} mapping from the per-episode pickle files
data = {}
for podcast_id in str_IDs:
    corpus_path = "metacorpi/" + podcast_id + ".txt"
    with open(corpus_path, "rb") as corpus_file:
        data[podcast_id] = pickle.load(corpus_file)

In [12]:
# Show the key (podcast ID) of the first entry in the dict
next(iter(data))

'1'

In [16]:
def combine_text(list_of_text):
    '''Concatenate the pieces of ``list_of_text`` into a single string.

    The pieces are joined with no separator. Passing a plain string is
    also accepted and returns an equal string.
    '''
    return ''.join(list_of_text)

In [17]:
# Collapse each entry to one text chunk, wrapped in a single-element list per podcast ID
data_combined = dict(
    (podcast_id, [combine_text(text)])
    for podcast_id, text in data.items()
)

In [18]:
# Build a one-column DataFrame of the corpora, indexed by (string) podcast ID

import pandas as pd
pd.set_option('max_colwidth',150)

# orient='index' turns each dict key into a row label, so no transpose is needed
data_df = pd.DataFrame.from_dict(data_combined, orient='index', columns=['metacorpus'])
# The index labels are strings, so this sort is lexicographic ('1', '10', '100', ...)
data_df = data_df.sort_index()
data_df

Unnamed: 0,metacorpus
1,"The Daily This is what the news should sound like. The biggest stories of our time, told by the best journalists in the world. Hosted by Michael B..."
10,"Global News Podcast The days top stories from BBC News. Delivered twice a day on weekdays, daily at weekends Daily News, News Jordanian prince vow..."
100,My Favorite Murder with Karen Kilgariff and Georgia Hardstark Lifelong fans of true crime stories Karen Kilgariff and Georgia Hardstark tell each ...
101,My Favorite Murder with Karen Kilgariff and Georgia Hardstark Lifelong fans of true crime stories Karen Kilgariff and Georgia Hardstark tell each ...
102,My Favorite Murder with Karen Kilgariff and Georgia Hardstark Lifelong fans of true crime stories Karen Kilgariff and Georgia Hardstark tell each ...
103,"Morbid: A True Crime Podcast Its a lighthearted nightmare in here, weirdos! Morbid is a true crime, creepy history and all things spooky podcast h..."
104,"Morbid: A True Crime Podcast Its a lighthearted nightmare in here, weirdos! Morbid is a true crime, creepy history and all things spooky podcast h..."
105,"Morbid: A True Crime Podcast Its a lighthearted nightmare in here, weirdos! Morbid is a true crime, creepy history and all things spooky podcast h..."
106,"Morbid: A True Crime Podcast Its a lighthearted nightmare in here, weirdos! Morbid is a true crime, creepy history and all things spooky podcast h..."
107,"Morbid: A True Crime Podcast Its a lighthearted nightmare in here, weirdos! Morbid is a true crime, creepy history and all things spooky podcast h..."


Data Cleaning - apply the same cleaning steps that were used for the transcript data

In [19]:
import re
import string

def clean_text_round1(text):
    '''First-pass text normalisation applied to each corpus.

    Steps, in order: lowercase the text, delete anything enclosed in square
    brackets, strip every character listed in ``string.punctuation``, and
    delete whole tokens that contain at least one digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        can leave consecutive spaces behind.
    '''
    text = text.lower()  # lowercase everything
    # Raw strings below: '\[' , '\w' and '\d' are invalid escape sequences in a
    # normal string literal and trigger a warning on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)  # remove square-bracketed spans (e.g. timestamps)
    # NOTE: string.punctuation is ASCII only, so typographic quotes such as ’ “ ” are kept.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation marks
    text = re.sub(r'\w*\d\w*', '', text)  # remove words that contain numbers
    return text

# Alias used with Series.apply; a direct reference replaces the redundant lambda wrapper
round1 = clean_text_round1

In [20]:
# Run the first-round cleaning over every corpus and keep the result as a one-column frame
data_clean = data_df['metacorpus'].apply(round1).to_frame()
data_clean

Unnamed: 0,metacorpus
1,the daily this is what the news should sound like the biggest stories of our time told by the best journalists in the world hosted by michael barb...
10,global news podcast the days top stories from bbc news delivered twice a day on weekdays daily at weekends daily news news jordanian prince vows t...
100,my favorite murder with karen kilgariff and georgia hardstark lifelong fans of true crime stories karen kilgariff and georgia hardstark tell each ...
101,my favorite murder with karen kilgariff and georgia hardstark lifelong fans of true crime stories karen kilgariff and georgia hardstark tell each ...
102,my favorite murder with karen kilgariff and georgia hardstark lifelong fans of true crime stories karen kilgariff and georgia hardstark tell each ...
103,morbid a true crime podcast its a lighthearted nightmare in here weirdos morbid is a true crime creepy history and all things spooky podcast hoste...
104,morbid a true crime podcast its a lighthearted nightmare in here weirdos morbid is a true crime creepy history and all things spooky podcast hoste...
105,morbid a true crime podcast its a lighthearted nightmare in here weirdos morbid is a true crime creepy history and all things spooky podcast hoste...
106,morbid a true crime podcast its a lighthearted nightmare in here weirdos morbid is a true crime creepy history and all things spooky podcast hoste...
107,morbid a true crime podcast its a lighthearted nightmare in here weirdos morbid is a true crime creepy history and all things spooky podcast hoste...


In [21]:
# Prepare the cleaned data for grouping by ID in EDA.
# The index holds the podcast IDs as strings (so it sorts as '1', '10', '100', ...);
# a numeric copy of the ID is added as `num_id` and used as the sort key.

data_clean.index.name = 'podcast_id'
data_clean = (
    data_clean
    .assign(num_id=pd.to_numeric(data_clean.index))
    .sort_values('num_id')
)

#data_clean
#

# Organizing the Data

1. Corpus (completed above)
2. Document-Term Matrix

In [22]:
# Using CountVectorizer to create a Doc-Term Matrix, exclude Stop Words
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer can also exclude stop words and count n-grams as single terms
# (via `ngram_range`); only English stop-word removal is enabled here.

# Create CountVectorizer
cv = CountVectorizer(stop_words='english')
# Fit CV onto the cleaned metadata corpora (sparse document-term counts)
data_cv = cv.fit_transform(data_clean.metacorpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement, and fall back only on installations that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Convert it into a dense array and label all the columns by 'word'
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
# Rows come out in the same order as data_clean, so its index can be reused as-is
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0_level_0,aaron,abc,abductions,abdul,abdullahs,abigail,ability,able,aboard,abolished,...,zigzag,zomorodi,zone,zoology,zoom,zucchinos,œbachelor,œlocal,œthe,œzack
podcast_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Persist the document-term matrix so later notebooks can reload it
DTM_PICKLE_PATH = "meta_dtm.pkl"
data_dtm.to_pickle(DTM_PICKLE_PATH)

In [24]:
# Also pickling the cleaned data and countvectorizer for later use
data_clean.to_pickle('meta_data_clean.pkl')
# A context manager makes sure the file handle is flushed and closed;
# passing a bare open(...) to pickle.dump leaves it open until garbage collection.
with open("meta_cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)