In [5]:
import seaborn as sns
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import stopwords
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Shared NLP resources: the small English spaCy pipeline and the stop-word sets.
nlp = spacy.load("en_core_web_sm")

# Filler words (and filler bigrams) added on top of spaCy's own stop-word list.
_filler_words = {
    'll', 've', 'pron', 'okay', 'oh', 'like', 'know', 'yeah', 'yea', 'yep',
    "like like", "oh like", "yeah like", "yeah yeah", "oh okay", "wow",
}
stops_spacy = STOP_WORDS | _filler_words
stops_nltk = set(stopwords.words("english"))

# Everything that either library (or the filler list) treats as a stop word.
stops = stops_nltk | stops_spacy

In [6]:
# import nltk
# nltk.download('stopwords')

In [2]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import math
# Read the tab-separated metadata file. The first row is used as the header and
# every following row becomes one DataFrame row; all values stay strings.
# (The former module-level `global` statement had no effect and was dropped.)
# newline="" is what the csv module asks for, so line breaks inside quoted
# fields reach the reader unchanged.
with open("../data/metadata.tsv", newline="") as csvfile:
    meta_data = list(csv.reader(csvfile, delimiter="\t"))

df = pd.DataFrame(meta_data[1:], columns=meta_data[0])

# Convenience handles on individual metadata columns.
show_filename_prefixes = df.show_filename_prefix
episode_filename_prefixes = df.episode_filename_prefix
show_names = df.show_name
episode_names = df.episode_name
show_duration = df.duration
publisher_name = df.publisher

# show name -> stripped show description. When a show name occurs on several
# rows, the description from the last such row wins.
show_descriptions = {
    name: description.strip()
    for name, description in zip(df.show_name, df.show_description)
}

# episode filename prefix -> stripped episode description.
episode_descriptions = {
    prefix: description.strip()
    for prefix, description in zip(df.episode_filename_prefix, df.episode_description)
}

In [3]:
# Genre / topic vocabulary that show-name tokens are matched against.
# Every literal is followed by a comma: adjacent string literals without one are
# silently concatenated by Python (previously producing entries such as
# "googleinstagram" and "marstennis" instead of the two separate terms).
genres_topics = [
    "comedy", "news", "crime", "science", "economics", "politics", "education",
    "sports", "lifestyle", "health", "wellbeing", "religion", "faith", "music",
    "art", "fashion", "literature", "humanities", "drama", "fitness", "drama",
    "fantasy", "scifi", "gameshow", "news quiz", "games", "mental health",
    "humor", "research", "technology", "society", "social", "culture", "lifestyle",
    "songs", "cooking", "culinary", "food", "travel", "films", "movies", "tv", "tv shows",
    "climate", "space", "planet", "digital", "artificial intelligence", "ai",
    "cars", "car", "nutrition", "wellness", "family", "history", "geography", "physics",
    "mathematics", "math", "chemistry", "biology", "documentary", "commentary", "nfl",
    "mls", "nba", "mlb", "stocks", "stock", "market", "wall street", "business",
    "reality shows", "investing", "social media", "biography", "biographies",
    "data science", "medicine", "media", "books", "book", "europe", "asia", "canada",
    "south america", "north america", "america", "usa", "facebook", "netflix", "google",
    "instagram", "tiktok", "amazon", "apple", "twitter", "adventure", "pets", "dogs",
    "cats", "dog", "cat", "nintendo", "xbox", "playstation", "ps4", "ps5", "theatre", "mars",
    "tennis", "australia", "conspiracy", "war", "epidemic", "pandemic", "climate change",
    "astrology", "novel", "church", "christ", "romance", "english", "kids", "astronomy",
    "design",
]

# Podcast presentation formats.
formats = [
    "monologue", "interview", "storytelling", "repurposed",
    "bite-sized", "co-host conversation", "debate", "narrative",
    "scripted", "improvised",
]

In [14]:
def get_key(my_dict, val):
    """
    Reverse lookup in a dictionary.

    Input: my_dict is a dictionary
           val is the value to search for

    Returns the first key (in the dictionary's iteration order) whose value
    equals val, or the string "key doesn't exist" when no value matches.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "key doesn't exist")

# # usual cleaning & remove urls and links
def remove_stops(text,stops):
    """
    Strip URLs, stop words, punctuation and digits from ``text``.

    Input: text is the string to clean
           stops is an iterable of stop words (matched exactly and
           case-sensitively against whitespace-separated tokens, before
           punctuation is removed)

    Returns the cleaned string with runs of spaces collapsed to a single space.
    """
    # Remove complete URLs up to the next whitespace, so paths and query
    # strings go too. Scheme-prefixed links (http and https) are handled
    # first, then bare "www." links.
    text = re.sub(r'https?://\S*', "", text)
    text = re.sub(r'www\.\S*', "", text)

    # Set membership keeps the per-token check constant-time.
    stop_set = set(stops)
    final = " ".join(word for word in text.split() if word not in stop_set)

    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join(char for char in final if not char.isdigit())

    # Dropping punctuation/digits can leave several spaces in a row.
    return re.sub(r" {2,}", " ", final)
                       
def clean_descriptions(docs):
    """
    Clean every document in ``docs`` with ``remove_stops``.

    The stop words are NLTK's English list, fetched once per call.

    Returns a new list holding one cleaned string per input document, in the
    same order.
    """
    english_stops = stopwords.words("english")
    return [remove_stops(document, english_stops) for document in docs]

# lemmatization
def get_lemmatized(text):
    """
    Print the lemma of every verb in ``text``.

    ``text`` is iterated phrase by phrase and each phrase is parsed with the
    module-level ``nlp`` pipeline; tokens tagged "VERB" have their lemma
    printed, one per line. Nothing is returned.
    """
    verb_lemmas = (
        token.lemma_
        for phrase in text
        for token in nlp(phrase)
        if token.pos_ == "VERB"
    )
    for lemma in verb_lemmas:
        print(lemma)

def get_named_entities(text):
    """Parse the lower-cased ``text`` with ``nlp`` and return the resulting doc's ``ents``."""
    lowered = text.lower()
    parsed = nlp(lowered)
    return parsed.ents

def get_noun_chunks(text):
    """
    Return the noun chunks of the lower-cased ``text`` as a list.

    A chunk is dropped when its full text is exactly one of NLTK's English
    stop words; the remaining chunks are returned in document order.
    """
    english_stops = stopwords.words("english")
    parsed = nlp(text.lower())
    return [chunk for chunk in parsed.noun_chunks if str(chunk) not in english_stops]

In [26]:
# (row position, show name) -> labels extracted from the show name.
# A key only exists once at least one of the three sources below produced something.
podcasts_genres_topics = {}

for k, show in enumerate(show_names):
    entry_key = (k, show)
    stripped_name = show.strip()

    # 1) Named entities of the show name, skipping purely numeric ones.
    ner = get_named_entities(stripped_name)
    if len(ner) > 0:
        podcasts_genres_topics[entry_key] = [
            str(entity) for entity in ner if not str(entity).isdigit()
        ]

    # 2) Noun chunks, reduced to ASCII letters, digits, double quotes and spaces.
    noun_chunks = get_noun_chunks(stripped_name)
    if noun_chunks:
        cleaned_chunks = [
            re.sub(r'[^0-9a-zA-Z\" "]+', '', str(chunk)).strip()
            for chunk in noun_chunks
            if not str(chunk).isdigit()
        ]
        podcasts_genres_topics.setdefault(entry_key, []).extend(cleaned_chunks)

    # 3) Space-separated tokens of the name that are known genres/topics;
    #    each is added once per entry.
    for word in show.lower().split(" "):
        if word not in genres_topics:
            continue
        labels = podcasts_genres_topics.setdefault(entry_key, [])
        if word not in labels:
            labels.append(word)

In [27]:
# Fraction of metadata rows that received an entry in podcasts_genres_topics
# (that dict holds at most one (row position, show name) key per row).
len(podcasts_genres_topics)/len(df)

0.9239464692482916

In [34]:
# Write one line per labelled row: "<row position>, <show name>, <label>, ..., "
# (every field, including the last, is followed by ", ").
with open("show_names_genres_topics.txt","w+") as f:
    for (row_position, labelled_show), labels in podcasts_genres_topics.items():
        fields = [str(row_position), labelled_show, *labels]
        f.write(", ".join(fields) + ", " + '\n')

In [None]:
# columns = []
# for genre in nlp(" ".join(genres)):
#     columns.append(genre.lemma_)

In [None]:
# Part-of-speech views of every show description, keyed by show name:
#   "NE"                 -> the parsed doc's entities (doc.ents)
#   "NC"                 -> the parsed doc's noun chunks, as a list
#   "ADJ"/"NOUN"/"VERB"  -> lemmas of the tokens with that tag whose text is
#                           not in `stops`; a show only gets a key under a tag
#                           once it has at least one such token
pos_show_descriptions = {"ADJ":{},"NOUN":{},"VERB":{},"NE":{},"NC":{}}
for show_name, description in show_descriptions.items():
    doc = nlp(description)
    pos_show_descriptions["NE"][show_name] = doc.ents
    # doc.noun_chunks yields its spans lazily; store a list so the chunks can
    # be iterated more than once later on.
    pos_show_descriptions["NC"][show_name] = list(doc.noun_chunks)
    for word in doc:
        if word.pos_ in ("ADJ", "NOUN", "VERB") and str(word) not in stops:
            pos_show_descriptions[word.pos_].setdefault(show_name, []).append(word.lemma_)

In [35]:
# Group the metadata rows by show name so per-show aggregates can be built.
shows = df.groupby(by=['show_name'])

In [36]:
def _descriptions_for_show(group):
    """Return one show's episode descriptions followed by its first distinct show description."""
    episode_texts = list(group.episode_description)
    show_text = group.show_description.unique()[0]
    return episode_texts + [show_text]

# One entry per show: every episode description plus the show description.
descriptions_aggregated = shows.apply(_descriptions_for_show)

In [37]:
# Largest number of aggregated descriptions held by any single show
# (each list is the show's episode descriptions plus one show description).
max([len(val) for key,val in descriptions_aggregated.items()])

1073

In [38]:
# Stripped names of the shows whose aggregated description list has between
# 100 and 1073 entries, both inclusive. The list length counts the episode
# descriptions plus the one show description.
hundred_or_more_episode_podcasts = [
    show.strip()
    for show, texts in descriptions_aggregated.items()
    if 100 <= len(texts) <= 1073
]

In [None]:
# import os
# import subprocess

# cmd = subprocess.Popen('pwd', stdout=subprocess.PIPE)
# cmd_out, cmd_err = cmd.communicate()
# local_path = os.fsdecode(cmd_out).strip() + "/0/"

# transcript_files = {}

# for folder1 in os.listdir(os.fsencode(local_path)):
#     foldername1 = os.fsdecode(folder1)
#     if len(foldername1) == 1:
#         for folder2 in os.listdir(os.fsencode(local_path+foldername1)):
#             foldername2 = os.fsdecode(folder2)
#             if len(foldername2.split("_")) == 2:
#                 s = os.fsencode(local_path + "/" + foldername1 + "/" + foldername2 + "/")
#                 for file in os.listdir(s):
#                     transcript_files[os.fsdecode(s)] = os.fsdecode(file)

In [None]:
# Shows with ten or more episodes. The name was used here without ever being
# defined, so it is built from descriptions_aggregated: each aggregated list
# holds the episode descriptions plus one show description, hence the strict "> 10".
ten_or_more_episode_podcasts = [
    show.strip()
    for show, texts in descriptions_aggregated.items()
    if len(texts) > 10
]

# show name -> named entities found in that name; shows whose name yields no
# entity are left out.
topics_ner = {}
for item in ten_or_more_episode_podcasts:
    tup = get_named_entities(item)
    if len(tup) > 0:
        topics_ner[item] = list(tup)

In [None]:
# NOTE(review): this cell repeats the computation of the cell directly above it.
# Shows with ten or more episodes. The name was used here without ever being
# defined, so it is built from descriptions_aggregated: each aggregated list
# holds the episode descriptions plus one show description, hence the strict "> 10".
ten_or_more_episode_podcasts = [
    show.strip()
    for show, texts in descriptions_aggregated.items()
    if len(texts) > 10
]

# show name -> named entities found in that name; shows whose name yields no
# entity are left out.
topics_ner = {}
for item in ten_or_more_episode_podcasts:
    tup = get_named_entities(item)
    if len(tup) > 0:
        topics_ner[item] = list(tup)

In [None]:
## computing interquartile range

# N = len(arr)
# if N%2 == 0:
#     m = (N//2)
#     a = arr[m:]
#     if m%2 == 0:
#         Q1 = 0.5*(arr[m//2] + arr[(m//2)-1])
#         Q3 = 0.5*(a[m//2] + a[(m//2)-1])
#     else:
#         Q1 = arr[m//2]
#         Q3 = a[m//2]
# else:
#     m = (N-1)//2
#     a = arr[m+1:]
#     if m%2 == 0:
#         Q1 = 0.5*(arr[m//2] + arr[(m//2)-1])
#         Q3 = 0.5*(a[m//2] + a[(m//2)-1])
#     else:
#         Q1 = arr[m//2]
#         Q3 = a[m//2]

# print(Q3-Q1)

In [None]:
# import textacy

# patterns = [{"POS":"ADV"},{"POS":"VERB"}]

# verb_phrases = textacy.extract.matches.token_matches(nlp(transcripts[key[0]].lower()), pattern = patterns)

# for verb_phrase in verb_phrases:
#     print(verb_phrase)

In [None]:
## using show description along with episode description (restricting shows that have ten or more episodes)
# descriptions = [" ".join([phrase for phrase in entry]) for entry in descriptions_aggregated if len(entry) > 10]
# cleaned_descriptions = clean_descriptions(descriptions)

## using show descriptions only
# cleaned_descriptions = clean_descriptions(list(show_descriptions.values()))

In [None]:
# code snippet to process named entities
def process_nes(docs):
    """
    Flatten groups of phrases into one list of cleaned strings.

    Input: docs is an iterable of groups (lists/tuples of phrases); every
           phrase is converted with str(), so spaCy spans work as well as
           plain strings. Empty groups are skipped.

    Cleaning, per phrase: remove "www.", "https://" and "http://" links
    (host part only), drop every character that is not an ASCII letter, a
    digit, a double quote or a space, then collapse runs of spaces.

    Returns the cleaned phrases in input order; phrases that end up as the
    empty string are left out.
    """
    link_patterns = (
        r'www\.[a-z0-9A-Z.]*',
        r'https://[a-z0-9A-Z.]*',
        r'http://[a-z0-9A-Z.]*',
    )
    final = []
    for val in docs:
        if not val:
            continue
        # A one-item group needs no special case: the loop covers it.
        for phrase in val:
            phrase = str(phrase)
            for pattern in link_patterns:
                phrase = re.sub(pattern, '', phrase)
            phrase = re.sub(r'[^0-9a-zA-Z\" "]+', '', phrase)
            # Collapse any run of spaces in one pass; chained fixed-width
            # substitutions leave runs of three or more only partly collapsed.
            phrase = re.sub(r' {2,}', ' ', phrase)
            if phrase != "":
                final.append(phrase)
    return final

In [None]:
# Named entities of every show description, flattened into one list of cleaned strings.
nes = process_nes(
    [list(entities) for entities in pos_show_descriptions["NE"].values()]
)
# The show descriptions themselves, each passed through the same cleaning as a one-item group.
nes_descriptions = process_nes(
    [[text] for text in show_descriptions.values()]
)

In [None]:
# TF-IDF over 2- and 3-grams whose vocabulary is learned from the named-entity strings.
# scikit-learn documents stop_words as 'english', a list, or None, so the
# combined stop-word set is handed over as a list.
nes_stop_words = sorted(
    STOP_WORDS.union({'ll','ve','pron','okay','oh','like','know','yeah','yea',
                      'yep',"like like","oh like","yeah like","yeah yeah","oh okay",
                      "wow","podcast","support","talk","talking","people","shit",
                      "stuff","things","join","week","weekly"})
)
nes_tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                       max_features = 1000,
                                       min_df = 1,
                                       max_df = 0.5,
                                       ngram_range = (2,3),
                                       stop_words = nes_stop_words)

# Only the fitted vocabulary is needed from `nes`, so fit() replaces the
# fit_transform() call whose result was thrown away.
nes_tfidf_vectorizer.fit(nes)

# get_feature_names() no longer exists in current scikit-learn releases; use
# get_feature_names_out() when available and fall back for old installations.
if hasattr(nes_tfidf_vectorizer, "get_feature_names_out"):
    nes_features = list(nes_tfidf_vectorizer.get_feature_names_out())  # term for each column index
else:
    nes_features = nes_tfidf_vectorizer.get_feature_names()

# NOTE(review): `mapping` and `matrix_denselist` are also assigned by the
# noun-model cell further down; whichever cell runs last wins.
mapping = nes_tfidf_vectorizer.vocabulary_  # same information as nes_features, as a term -> index dictionary
nes_matrix = nes_tfidf_vectorizer.transform(nes_descriptions)
nes_matrix_dense = nes_matrix.todense()
matrix_denselist = nes_matrix_dense.tolist()

In [None]:
len(nes)

In [None]:
nes_features

#### For noun-based model

In [None]:
# non_stop_nouns = [" ".join(val) for key,val in pos_show_descriptions["NOUN"].items()]
# cleaned_noun_descriptions = clean_descriptions(non_stop_nouns)
# cleaned_descriptions = clean_descriptions(list(show_descriptions.values()))

In [None]:
# Inputs of the noun-based model, built here so the cell runs on a fresh kernel
# (their only other definition sits in a commented-out cell):
#   - vocabulary source: the non-stop-word noun lemmas of each show description
#   - documents to vectorise: the cleaned show descriptions, one per show
non_stop_nouns = [" ".join(val) for val in pos_show_descriptions["NOUN"].values()]
cleaned_noun_descriptions = clean_descriptions(non_stop_nouns)
cleaned_descriptions = clean_descriptions(list(show_descriptions.values()))

# scikit-learn documents stop_words as 'english', a list, or None, so the
# combined stop-word set is handed over as a list.
noun_stop_words = sorted(
    STOP_WORDS.union({'ll','ve','pron','okay','oh','like','know','yeah','yea',
                      'yep',"like like","oh like","yeah like","yeah yeah","oh okay",
                      "wow","podcast","support","talk","talking","people","shit",
                      "stuff","things","join","week","weekly"})
)
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                   max_features = 1000,
                                   min_df = 5,
                                   max_df = 0.9,
                                   ngram_range = (1,1),
                                   stop_words = noun_stop_words)

# Learn the vocabulary from the noun-only texts, then vectorise the full descriptions.
tfidf_vectorizer.fit(list(cleaned_noun_descriptions))

# get_feature_names() no longer exists in current scikit-learn releases; use
# get_feature_names_out() when available and fall back for old installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    features = list(tfidf_vectorizer.get_feature_names_out())  # term for each column index
else:
    features = tfidf_vectorizer.get_feature_names()
mapping = tfidf_vectorizer.vocabulary_  # same information as features, as a term -> index dictionary
matrix = tfidf_vectorizer.transform(cleaned_descriptions)
matrix_dense = matrix.todense()
matrix_denselist = matrix_dense.tolist()

# Terms with a non-zero weight in each description. features[k] is the term of
# column k, which avoids a linear reverse search through `mapping` per hit.
all_keywords = [
    [features[k] for k, weight in enumerate(row) if weight > 0]
    for row in matrix_denselist
]

number_of_clusters = 30

# random_state pins the k-means++ initialisation so reruns give the same clusters.
model = KMeans(n_clusters = number_of_clusters, init = "k-means++", max_iter = 100,
               n_init=1, random_state=0)

model.fit(matrix)

# Per cluster: column indices ordered from the highest to the lowest centroid weight.
order_centers = model.cluster_centers_.argsort()[:,::-1]

# Write the ten highest-weighted terms of every cluster.
with open("show_descriptions_nouns_tfidf_kemans_clusters.txt", "w") as f:
    for i in range(number_of_clusters):
        f.write(f'Cluster {i}')
        f.write('\n')
        for ind in order_centers[i, :10]:
            f.write(' %s' % features[ind])
            f.write('\n')
        f.write('\n')
        f.write('\n')

In [None]:
# show name -> terms with a non-zero TF-IDF weight in that show's description.
# Rows come straight from the noun model's `matrix` and terms from `features`:
# both names belong to that model only, whereas `matrix_denselist` and `mapping`
# are also overwritten by the named-entity cell. features[k] also replaces a
# linear reverse search through `mapping` for every hit.
all_keywords = {
    name: [features[k] for k in (row > 0).nonzero()[0]]
    for name, row in zip(show_descriptions, matrix.toarray())
}
    
# all_keywords_v2 = []

# for transcript in matrix_denselist:
#     k = 0
#     keywords = []
#     for word in transcript:
#         if word > 0:
#             keywords.append(features[k])
#         k += 1
#     all_keywords_v2.append(keywords)

In [None]:
import pandas as pd

# The noun-model matrix as a table: one row per show name, one column per term.
show_index = list(show_descriptions.keys())
keyword_counts_show_descriptions = pd.DataFrame(
    data=matrix.toarray(),
    index=show_index,
    columns=features,
)

In [None]:
# Index labels (show names) of the rows whose 'music' column is greater than zero.
keyword_counts_show_descriptions[keyword_counts_show_descriptions['music'] > 0].index

In [None]:
# for keyword in [features[k] for k in order_centers[0][1:2]]:
#     print(list(keyword_counts_show_descriptions[keyword_counts_show_descriptions[keyword] > 0].index))

In [None]:
# get the show description to cluster association 

# The 20 highest-weighted terms of every cluster, computed once up front
# instead of once per (show, cluster) pair.
top_terms_per_cluster = [
    {features[j] for j in order_centers[k][:20]}
    for k in range(number_of_clusters)
]

# show name -> clusters that share at least three of their top terms with the
# show's keywords; shows matching no cluster get no entry.
show_vs_cluster = {}
for name in show_descriptions:
    show_terms = set(all_keywords[name])
    for k, cluster_terms in enumerate(top_terms_per_cluster):
        if len(show_terms & cluster_terms) >= 3:
            show_vs_cluster.setdefault(name, []).append(k)