# Imports

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('ggplot')

import seaborn as sns

params = {
    'text.color': (0.25, 0.25, 0.25),
    'figure.figsize': [18, 6],
   }

plt.rcParams.update(params)

import pandas as pd
from pandas import json_normalize

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

import numpy as np
from numpy import percentile
np.random.seed(42)

from bs4 import BeautifulSoup
import unicodedata

import logging
import re
import copy
import os
import sys
import json
from datetime import datetime
from collections import Counter
import string
import glob
import ast
from tqdm.notebook import tqdm
import xmltodict

from tabulate import tabulate

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

import scipy.spatial.distance

import umap

TITLE_SIZE = 24
TITLE_PAD = 20

DEFAULT_COLORS = plt.rcParams['axes.prop_cycle'].by_key()['color']

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
import multiprocess as mp

In [104]:
# ----------- NLP modules ----------- #

import langdetect

import spacy
nlp = spacy.load('de_core_news_lg')

from gensim.corpora import Dictionary
from gensim.models import Phrases, hdpmodel, LdaModel, CoherenceModel

# Data cleaning for LDA
---
In this notebook I prepare the pre-cleaned data for modelling with LDA.

- [X] Concatenate content
- [X] Create a domain specific extended stop word list
- [X] Detect texts in languages other than German
- [X] Strip HTML, links, remove non-ASCII characters, remove stop words etc.
- [X] Remove names
- [X] Lemmatize
- [X] Tokenize
- [X] Extract n-grams

I extend the exploratory insights by retrieving more statistics from the processed data.

## Load data

In [100]:
df = pd.read_parquet("_data/podcasts_cleaned.parq")

In [101]:
# Remove all podcast that weren't updated after 2018.
df.releaseDate = pd.to_datetime(df.releaseDate)
to_drop = df[df.releaseDate.dt.year<2019].index
df.drop(to_drop, inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape

(6636, 29)

## Concatenate textual content

I concatenate the available textual content to two single features – one for the texts about the podcast and one for the texts that describe the episodes.

In [6]:
def join_text_columns(data):
    text = []
    for row in data.values:
        if type(row) is str:
            text.append(row)
        elif isinstance(row, (np.ndarray, list)):
            text.append(" ".join(row))
        else:
            assert row is None
            text.append("")
            continue
    return " ".join(text)    

In [7]:
podcast_columns = ['artistName', 'title', 'subtitle', 'summary']

episode_columns = ['ep_authors', 'ep_titles', 'ep_itunes_titles', 
                   'ep_subtitles', 'ep_summaries', 'ep_contents']

df_pod = df[podcast_columns].apply(join_text_columns, axis=1) 
df_eps = df[episode_columns].apply(join_text_columns, axis=1) 

# Sanity check.
assert df_pod.shape[0] == df_eps.shape[0]

Sanity check with single samples. Have we properly joined all columns and textual content? It seems we have...

In [15]:
# idx = 1703
# display(df_pod[idx][:1000])
# print()
# display(df[podcast_columns].loc[idx].values)

## Load and extend stop word list

I'll use Spacy's stop word list. However, I extend this list with many additional stop words, that I found by analysing the most frequent non-salient words from all texts.

In [None]:
with open("_data/_input/stopwords.txt") as file:
    stopwords = file.readlines()
stopwords = sorted(set([x.strip() for x in stopwords]))

## Detect foreign language samples

I already have filtered out all podcasts that weren't tagged as German. Nonetheless I validate this by running a language detection on the concatenated texts that describe the podcast (title, subtitle, summary).

- **I find ~200 podcasts for which German is not the detected predominant language.**
- 140 samples are detected as English. A couple of other languages like Dutch, Portuguese and Swedish were detected too.
- **Only a handful of the detected samples are actually not German language podcasts. I decide to leave these in the data for now.**

In [None]:
%%time
non_german = []
for idx, text in tqdm(enumerate(df_pod.values)):
    try:
        language = langdetect.detect_langs(text)
        if str(language[0]).split(":")[0]!="de":
            non_german.append((idx, language))
    except Exception as e:
        continue

In [None]:
df_lang = pd.DataFrame(non_german, columns=["idx", "language"])
df_lang.to_csv("_data/language_detection.csv")
print(f"{len(df_lang)} podcasts found that might contain mostly non German textual content.")

In [None]:
top_n = 5
display(df_lang.language.apply(lambda x: x[0].lang).value_counts()[:top_n])

In [None]:
display(df.loc[df_lang.idx.values][["artistName", "title", "subtitle", "summary", "primary_genre", "not_longtail"]].head(top_n))

## Clean texts

In a first cleaning step I do this:

- [X] strip HTML
- [X] keep only ASCII and european characters 
- [X] remove hyperlinks
- [X] remove words shorter than 2 characters
- [X] remove stop words (based on the extended stop word list)

Since the lemmatization in the next step only works properly on cased text, I'll lowercase after that.

In [None]:
STOP_WORDS = list(spacy.lang.de.stop_words.STOP_WORDS)
STOP_WORDS.extend(sorted(list(stopwords)))
STOP_WORDS = set(STOP_WORDS)

RE_ASCII = re.compile(r"[^A-Za-zÀ-žäüöÄÜÖ ]", re.IGNORECASE)

REMOVE_LESS_THAN = 2

def clean_text(text):
    
    # Strip HTML tags.
    text = BeautifulSoup(text, "lxml").get_text()
    
    # Normalize form of unicode strings.
    # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
    # text = unicodedata.normalize("NFKD", text)
    
    # Keep only ASCII + European Chars and whitespace
    text = re.sub(RE_ASCII, " ", text)

    # # Lower case all text.
    # text = text.lower()
    
    # Remove links.
    text = re.sub(r'http\S+', ' ', text)
    
    # Remove all words less than 2 characters long.
    text = " ".join([token for token in text.split(" ") if len(token)>REMOVE_LESS_THAN])

    # # Remove consecutive hyphens (long or short ones).
    # text = re.sub(r"[-–]+", "-", text)
    
    # Remove stop words.
    text = " ".join([token for token in text.split(" ") if token.lower() not in STOP_WORDS])
    
    return text

In [None]:
%%time
df_pod_clean = df_pod.apply(clean_text)

In [None]:
%%time

# I use multiprocessing for this CPU bound job of speeding up text cleaning.
# multiprocessing.Pool unfortunately doesn't work in iPython. 
# Therefore I use «multiprocess» which is a package that has to be installed separately.
# https://stackoverflow.com/a/65001152/7117003

with mp.Pool(8) as pool:
    df_eps_clean = pool.map(clean_text, df_eps.values)

In [None]:
df_pod_clean = pd.DataFrame(df_pod_clean, columns=["pod_text"])
df_eps_clean = pd.DataFrame(df_eps_clean, columns=["eps_text"])

df_pod_clean.to_parquet("_data/clean_pod.parq")
df_eps_clean.to_parquet("_data/clean_eps.parq")

## Remove names

- From first tests and iterations with LDA I could see that peoples names rank high as topic terms. A person's name in most of the cases I examined was noise, e.g., the creators name is not significant with respect to a topic and does not help the model to generalize to unseen samples. 
- However, names like `Angela Merkel`, `Joe Biden` or `Pablo Picasso` would be salient information in regard to a topic, so I have to make a tradeoff here. 
- I decide to filter out names from the data because most of the names do not seem salient in regard to the task of finding meaningful topic vectors.

In [4]:
df_pod_clean = pd.read_parquet("_data/clean_pod.parq")
df_eps_clean = pd.read_parquet("_data/clean_eps.parq")

In [6]:
# Shorten episode texts to 10_000 characters.
df_eps_clean_short = pd.DataFrame(df_eps_clean.eps_text.apply(lambda x: x[:10_000]))

In [7]:
%%time
names_list = []

# To accommodate for long texts I increase Spacy's text length limit.
nlp.max_length = 10_000_000

# Use nlp.pipe with batches to speed up entity recognition. 
# Note: Increasing n_process does not decrease processing times.
for data in [df_pod_clean.pod_text.values, df_eps_clean_short.eps_text.values]:
    for doc in tqdm(nlp.pipe(data, 
                             batch_size=50, 
                             n_process=1, 
                             disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])):
        names_list.extend([token.text for token in doc.ents if token.label_=="PER"])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

CPU times: user 12min 57s, sys: 5min 32s, total: 18min 30s
Wall time: 2min 43s


We can see that the entity recognition makes errors. E.g. a topic term like `Corona` gets labeled as a person. Also `Jesus`, `Sohn`, `Mann`, `Hass` are salient topical terms rather than names.

In [12]:
with open("_data/_input/stopwords_names_raw.txt", "w") as file:
    file.write("\n".join(names_list))

In [9]:
cnt = Counter(names_list)
display(cnt.most_common(n=20))

[('Corona', 1261),
 ('Jesus', 1056),
 ('Sohn', 358),
 ('Angela Merkel', 305),
 ('Donald Trump', 278),
 ('Bobby Schuller', 276),
 ('Trump', 260),
 ('Mann', 241),
 ('Tobias', 234),
 ('Merkel', 216),
 ('Hass', 210),
 ('Sebastian', 205),
 ('Rainer Zitelmann', 192),
 ('Olaf Scholz', 179),
 ('Joe Biden', 174),
 ('Thomas', 172),
 ('Anna', 169),
 ('Dirk Kreuter', 159),
 ('Armin Laschet', 158),
 ('Teufel', 155)]

In [61]:
# Reduce to most common names of n_min.
n_min = 20
names_list = [x[0] for x in cnt.most_common() if x[1]>=n_min]

# Sort stop word list with names so that first we replace the longer terms and then the shorter ones.
stop_words_names = sorted(names_list, key=lambda x: len(x), reverse=True)
print(f"Names list reduced to {len(stop_words_names)} terms.")

Names list reduced to 1263 terms.


In [80]:
# Use replace instead of regex. 
# This is ~2x faster but less flexible.
def clean_names(data):
    for name in stop_words_names:
        data = data.replace(f" {name} ", " ")
    return re.sub(r"\s+", " ", data)

In [84]:
%%time
df_pod_clean.pod_text = df_pod_clean.pod_text.apply(clean_names)

CPU times: user 2.16 s, sys: 9.44 ms, total: 2.17 s
Wall time: 2.17 s


In [85]:
%%time
tmp = []
for text in tqdm(df_eps_clean.eps_text.values):
    tmp.append(clean_names(text))

df_eps_clean.eps_text = tmp

  0%|          | 0/6636 [00:00<?, ?it/s]

CPU times: user 4min 27s, sys: 806 ms, total: 4min 27s
Wall time: 4min 27s


In [93]:
df_pod_clean.to_parquet("_data/clean_pod_names.parq")
df_eps_clean.to_parquet("_data/clean_eps_names.parq")

## Lemmatize texts

In [94]:
%%time
results_pod = []

# Lemmatize episode texts.
# Use nlp.pipe with batches to speed up lemmatization. 
# Increasing n_process didn't result in shorter processing times.
for doc in nlp.pipe(df_pod_clean.pod_text.values, batch_size=50, n_process=1, disable=['tok2vec', 'tagger', 'morphologizer', 'parser', 'attribute_ruler', 'ner']):
    results_pod.append(" ".join([token.lemma_ for token in doc]))
    
results_pod = [x.lower() for x in results_pod]

CPU times: user 777 ms, sys: 64.2 ms, total: 841 ms
Wall time: 843 ms


In [95]:
%%time
results_eps = []

nlp.max_length = 10_000_000

# Lemmatize podcasts texts.
# Use nlp.pipe with batches to speed up lemmatization. 
# Increasing n_process didn't result in shorter processing times.
for doc in nlp.pipe(df_eps_clean.eps_text.values, batch_size=50, n_process=1, disable=['tok2vec', 'tagger', 'morphologizer', 'parser', 'attribute_ruler', 'ner']):
    results_eps.append(" ".join([token.lemma_ for token in doc]))
    
results_eps = [x.lower() for x in results_eps]

CPU times: user 2min 7s, sys: 954 ms, total: 2min 8s
Wall time: 2min 8s


In [96]:
results_pod = pd.DataFrame(results_pod, columns=["pod_text"])
results_eps = pd.DataFrame(results_eps, columns=["eps_text"])

results_pod.to_parquet("_data/clean_pod_lemma.parq")
results_eps.to_parquet("_data/clean_eps_lemma.parq")

## Statistics of textual content after data cleaning
- **95% of podcasts have 6 words or more describing text on podcast level and 60 words or more on episode level.**
- In next steps I might remove samples with shorter texts (the *other* 5%) from the data set, assuming that these do not contain sufficient information for meaningful modelling.
- **Interestingly, Podcasts from charts or top lists have more than 3 times the word count compared to longtail podcasts.** 
- The volume of text data that describes the episode of a podcast is ~164 times the volume of that, which describes the podcast itself (title, subtitle, summary).

In [97]:
tmp = results_pod.pod_text.apply(lambda x: len(x.split(" ")))
all_pod = tmp.sum()

perc_5 = np.percentile(tmp, 5)
perc_95 = np.percentile(tmp, 95)

print(f"The 5/95% percentiles are {perc_5:,.0f} / {perc_95:,.0f} words in podcast texts.\n")
display(tmp.describe().astype(int))

# df.loc[tmp[tmp<perc_5].index].shape[0]

The 5/95% percentiles are 6 / 88 words in podcast texts.



count    6636
mean       37
std        29
min         1
25%        18
50%        32
75%        51
max       759
Name: pod_text, dtype: int64

In [98]:
tmp = results_eps.eps_text.apply(lambda x: len(x.split(" ")))
all_eps = tmp.sum()

perc_5 = np.percentile(tmp, 5)
perc_95 = np.percentile(tmp, 95)

print(f"The 5/95% percentiles are {perc_5:,.0f} / {perc_95:,.0f} words in episode texts.\n")
display(tmp.describe().astype(int))

The 5/95% percentiles are 60 / 24,305 words in episode texts.



count      6636
mean       6233
std       13937
min           1
25%         585
50%        2027
75%        6303
max      318957
Name: eps_text, dtype: int64

In [102]:
df["text_word_count"] = tmp.values
df["top"]= np.where(df.not_longtail>0, True, False)
tmp_median = df.groupby("top").text_word_count.median()
ratio = 100 / tmp_median[False] * tmp_median[True]
print(f"Podcasts from top lists or charts in this data set have {ratio:.0f}% the textual content of longtail podcasts.")

all_words = all_pod + all_eps
ratio = 100 / all_pod * all_eps
print(f"Textual data for episodes is {ratio/100:.0f} times the text volume for podcasts.")

Podcasts from top lists or charts in this data set have 327% the textual content of longtail podcasts.
Textual data for episodes is 164 times the text volume for podcasts.


## Tokenize and extract n-grams

In [105]:
%%time 
pod_bigrams = []
tmp = [x[0].split(" ") for x  in results_pod.values]

# min_count ==  Ignore all words and bigrams with total collected count lower than this value.
# threshold == Threshold for forming the phrases (higher means fewer phrases).
bigram = Phrases(tmp, min_count=5, threshold=10)

for idx in range(len(tmp)):
    for token in bigram[tmp[idx]]:
        if '_' in token:
            pod_bigrams.append(token)
            tmp[idx].append(token)

results_pod_tokens = tmp
unique_pod_bigrams = sorted(set(pod_bigrams))        
print(f"{len(unique_pod_bigrams)} unique bigrams found for podcast texts.")

906 unique bigrams found for podcast texts.
CPU times: user 424 ms, sys: 9.5 ms, total: 434 ms
Wall time: 432 ms


In [106]:
%%time 
eps_bigrams = []
tmp = [x[0].split(" ") for x  in results_eps.values]

bigram = Phrases(tmp, min_count=5, threshold=10)

for idx in range(len(tmp)):
    for token in bigram[tmp[idx]]:
        if '_' in token:
            eps_bigrams.append(token)
            tmp[idx].append(token)

results_eps_tokens = tmp
unique_eps_bigrams = sorted(set(eps_bigrams))        
print(f"{len(unique_eps_bigrams)} unique bigrams found for episode texts.")

269417 unique bigrams found for episode texts.
CPU times: user 1min 9s, sys: 2.77 s, total: 1min 12s
Wall time: 1min 13s


In [107]:
results_pod_tokens = pd.DataFrame(pd.Series(results_pod_tokens), columns=["pod_text"])
results_eps_tokens = pd.DataFrame(pd.Series(results_eps_tokens), columns=["eps_text"])

results_pod_tokens.to_parquet("_data/clean_pod_token.parq")
results_eps_tokens.to_parquet("_data/clean_eps_token.parq")

# Conclusions

- Again, the data proved to be quite diverse and full of noise (e.g., lots of HTML, links).
- The amount of information I have for each podcast is very diverse. Some podcasts have almost no metadata to work with. Others have very high word counts, mainly due to lots of available episodes and their extensive descriptions.
- From episodes we get ~164 times the textual content than from podcasts. This is an inbalance we have to keep in mind for all further steps.
- Interestingly, podcasts from charts or top lists have on average three times the textual data than those from less popular, long-tail podcasts. If this has statistical significance, we can speculate if either creators of successful podcasts have the resources to generate proper metadata or if podcasts are more likely to be successful if they have proper metadata.