# Imports

In [1]:
import pandas as pd

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

import numpy as np
np.random.seed(42)

from bs4 import BeautifulSoup

import re
import copy
import os
import sys
import json
from collections import Counter
import string
import glob
from tqdm.notebook import tqdm
import pickle

from tabulate import tabulate

import spacy
nlp = spacy.load('de_core_news_lg')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Named entity recognition with Spacy
---

- To improve search, named entities like people's names, organization and locations can be a useful enrichment of metadata. 
- I can imagine using entities like persons or organizations as explicit filter criteria for a search application. Or the NER could be part of an NLP pipeline and used to remove e.g., person's names as stop words.
- [Spacy offers named entity recognition (NER) out of the box](https://spacy.io/usage/linguistic-features#named-entities) and provides three models for German language. 
- In this notebook, I try to assess how good and performant Spacy's entity recognition is.  

# Load and prepare data

In [3]:
df = pd.read_parquet("_data/podcasts_cleaned.parq")

# Remove all podcast that weren't updated after 2018.
df.releaseDate = pd.to_datetime(df.releaseDate)
to_drop = df[df.releaseDate.dt.year<2019].index
df.drop(to_drop, inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape

(6636, 29)

I'll join texts in all relevant feature columns into one single text feature.

In [4]:
def join_text_columns(data):
    text = []
    for row in data.values:
        if type(row) is str:
            text.append(row)
        elif isinstance(row, (np.ndarray, list)):
            text.append(" ".join(row))
        else:
            assert row is None
            text.append("")
            continue
    return " ".join(text) 

In [5]:
podcast_columns = ['artistName', 'title', 'subtitle', 'summary', "tags", 
                   'ep_titles', 'ep_itunes_titles', 'ep_tags', 'primary_genre',
                   'ep_subtitles', 'ep_summaries', 'ep_contents']
df_pod = df[podcast_columns].apply(join_text_columns, axis=1)    

In [6]:
RE_ASCII = re.compile(r"[^A-Za-zÀ-žäüöÄÜÖ.!? ]", re.IGNORECASE)
MAX_WORDS = 10_000
REMOVE_LESS_THAN = 2

def clean_text(text):
    
    # Reduce to MAX_WORDS to speed up processing.
    text = " ".join(text.split(" ")[:MAX_WORDS])
    
    # Strip HTML tags.
    text = BeautifulSoup(text, "lxml").get_text()
    
    # Keep only ASCII + European Chars, some punctuation and whitespace
    text = re.sub(RE_ASCII, " ", text)
   
    # Remove links.
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'www.\S+', ' ', text)
    
    # Remove all words less than 2 digits long.
    text = " ".join([token for token in text.split(" ") if len(token)>REMOVE_LESS_THAN])
       
    return text

In [7]:
%%time
df_pod = df_pod.apply(clean_text)
df["text"] = df_pod

CPU times: user 41.4 s, sys: 1.79 s, total: 43.1 s
Wall time: 44.3 s


# Detect entities

In a first step I retrieve all entities in every text document.

In [72]:
%%time
ents_list = []

# We have long texts in the data and need to increase Spacy's text length limit accordingly.
nlp.max_length = 10_000_000

# Use nlp.pipe with batches to speed up entity recognition. 
# Increasing n_process does not decrease processing times.
for doc in tqdm(nlp.pipe(df.text.values, 
                             batch_size=50, 
                             n_process=1, 
                             disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])):
        ents_list.append([(token.text, token.label_) for token in doc.ents])

0it [00:00, ?it/s]

CPU times: user 47min 52s, sys: 18min 48s, total: 1h 6min 40s
Wall time: 16min 15s


In [132]:
# # Save entity list to disk to avoid repeated time consuming runs of entity extraction.
# with open("_data/entity_list.pkl", "wb+") as file:
#     pickle.dump(ents_list, file)

In [8]:
with open("_data/entity_list.pkl", "rb") as file:
    ents_list = pickle.load(file)

- **Spacy recognized around 316k person names, 76k locations, 143k organisations and 334k miscellaneous entities.**
- By flattening the list, using a counter and printing the 50 most and least common entities, I can see that **the results are promising.**
- However, I notice **a lot of errors** too. **Especially less common entities are noisy.**
- We have to keep in mind, that many podcasts have hundreds of available episodes that we joined into the text document per podcast. In order to find common terms *across* podcasts we have to count differently.

In [157]:
ent_types = ["PER", "LOC", "ORG", "MISC"]
for ent_type in ent_types:
    cnt = Counter([y[0] for x in ents_list for y in x if y[1]==ent_type])
    print("_"*80)
    print(f"Spacy recognized {len(cnt):,.0f} unique entities of type {ent_type}.\n")
    print([x[0] for x in cnt.most_common(50)])
    print()
    print([x[0] for x in cnt.most_common()[::-1][:25]])
    print()

________________________________________________________________________________
Spacy recognized 316,925 unique entities of type PER.

['Instagram', 'Link', 'Jesus', 'Corona', 'Jahren', 'Sebastian', 'Thomas', 'Chris', 'Daniel', 'Christian', 'Sohn', 'Alex', 'http', 'David', 'Trump', 'Michael', 'Andreas', 'Philipp', 'Anna', 'Jan', 'Sascha', 'Max', 'Martin', 'Donald Trump', 'bit.ly', 'Julian', 'Stefan', 'http bit.ly', 'linktr.ee', 'Tobi', 'Matthias', 'Tobias Teichen', 'Julia', 'Angela Merkel', 'Simon', 'Sven', 'Friedrich Nietzsche', 'Lena', 'Merkel', 'Markus', 'Hugo Egon Balder', 'Flo', 'Florian', 'Johannes', 'Felix', 'Tobias', 'Nina', 'Tim', 'Peter', 'Danke']

['Thomas Sachsenmaier Simon Ihlenfeldt', 'Daniela Schröder leisure Was', 'Daniela Schröder Minding', 'konflikt mann', 'beziehungsdrama beziehungskonflikte dauersingle', 'Michaela Beck Free Your Heart Podcast', 'Tobias Maucher', 'Frank Eilers Sebastian Krämer Wie', 'You Can', 'Frank Eilers Sebastian Krämer', 'Peter Schanz Christian

# Filter results
To remove as much errors as possible I filter out all entities that do not occur in at least `n` unique podcasts.

In [2]:
# Separate entities by type.
results = []
for row in ents_list:
    per = [x[0] for x in row if x[1]=="PER"]
    loc = [x[0] for x in row if x[1]=="LOC"]    
    org = [x[0] for x in row if x[1]=="ORG"]
    msc = [x[0] for x in row if x[1]=="MISC"] 
    results.append((per, loc, org, msc))
    
columns = ["person", "location", "organisation", "misc"]
df_ents = pd.DataFrame(results, columns=columns)

# Set cutoff to filter out noisy entities.
# An entity has to occur in min_occurence unique podcasts or more.
min_occurence = 5

ents_filtered = {}

# Create a dictionary with entity types as keys and the corresponding list of filtered entities as the value. 
for column in columns:
    cnt = Counter([y for x in df_ents[column].values for y in set(x)]).most_common()
    ents_filtered[column] = [x[0] for x in cnt if x[1]>=min_occurence]

In [131]:
%%time
# Filter out all likely erroneous entities from the data set, row by row.
for column in tqdm(columns):
    df_ents[column] = df_ents[column].apply(lambda x: [y for y in x if y in ents_filtered[column]])

  0%|          | 0/4 [00:00<?, ?it/s]

CPU times: user 1min 20s, sys: 77.2 ms, total: 1min 20s
Wall time: 1min 20s


In [159]:
# df_ents.to_parquet("_data/entity_list_cleaned.parq")
# df_ents = pd.read_parquet("_data/entity_list_cleaned.parq")

# Analyze results

## Persons
Looking at unique terms and counting in how many podcasts these occur I notice:

- `Corona` (very comprehensibly) ranks very high and occurs in the metadata of more than 1k podcasts. (It's misclassified as a person's name, but nonetheless...)
- Politician's names rank high: `Angela Merkel`, `Donald Trump`, `Joe Biden` etc.
- We get a lot of first names, which are less useful.
- The names of several «usual suspects» like `James Bond`, `Elon Musk` rank high.

In [43]:
cnt = Counter([y.lower() for x in df_ents.person.apply(lambda x: set(x)).values for y in x ])
print(cnt.most_common(100))

[('jahren', 1099), ('corona', 1026), ('link', 984), ('instagram', 884), ('sohn', 550), ('christian', 291), ('angela merkel', 290), ('thomas', 288), ('donald trump', 278), ('jesus', 267), ('michael', 267), ('trump', 262), ('daniel', 254), ('anna', 253), ('alex', 244), ('ehrlich', 234), ('julia', 217), ('andreas', 216), ('http', 214), ('sebastian', 212), ('teufel', 202), ('merkel', 200), ('david', 196), ('martin', 193), ('stefan', 190), ('philipp', 184), ('jan', 178), ('besser', 176), ('danke', 175), ('matthias', 172), ('markus', 164), ('elon musk', 160), ('joe biden', 157), ('peter', 151), ('max', 150), ('olaf scholz', 149), ('armin laschet', 143), ('chris', 142), ('dir', 139), ('james bond', 138), ('simon', 138), ('gott', 137), ('hass', 135), ('patrick', 135), ('felix', 134), ('covid', 132), ('christoph', 131), ('florian', 131), ('sich', 130), ('katharina', 126), ('johannes', 125), ('maria', 125), ('silvester', 125), ('vaters', 123), ('biden', 122), ('paul', 122), ('alexander', 122), (

- If we reduce the printout to just the entities that consists of two or more words, we get more meaningful results. 
- Still, I notice errors like `http bit.ly`, `instagram kanal` etc.

In [45]:
cnt = Counter([y.lower() for x in df_ents.person.apply(lambda x: set(x)).values for y in x if len(y.split())>1])
print([x[0] for x in cnt.most_common(100)])

['angela merkel', 'donald trump', 'elon musk', 'joe biden', 'olaf scholz', 'armin laschet', 'james bond', 'jens spahn', 'annalena baerbock', 'greta thunberg', 'http bit.ly', 'bill gates', 'jeff bezos', 'george floyd', 'karl lauterbach', 'markus söder', 'steve jobs', 'tobias beck', 'christian lindner', 'dirk kreuter', 'gerald hüther', 'jesus christus', 'robert habeck', 'jogi löw', 'dieter bohlen', 'stefanie stahl', 'friedrich merz', 'kevin kühnert', 'laura malina seiler', 'frank thelen', 'helene fischer', 'albert einstein', 'thomas gottschalk', 'christian drosten', 'britney spears', 'sebastian kurz', 'barack obama', 'instagram kanal', 'udo lindenberg', 'günther jauch', 'boris johnson', 'uli hoeneß', 'herbert grönemeyer', 'til schweiger', 'sherlock holmes', 'hansi flick', 'gerhard schröder', 'warren buffett', 'veit lindau', 'sebastian fitzek', 'arnold schwarzenegger', 'micky beisenherz', 'jürgen klopp', 'luisa neubauer', 'joshua kimmich', 'kanye west', 'julien backhaus', 'tony robbins', 

## Locations
- Since we got the podcast data from the German iTunes store it's not suprising that `deutschland` ranks highest. The term occurs in the metadata of more than 2.7k podcasts. 
- A couple of errors are noticeable here too: e.g., `corona`, `veröffentlichung`, `jährig`, `beiden`.

In [47]:
cnt = Counter([y.lower() for x in df_ents.location.apply(lambda x: set(x)).values for y in x])
print(cnt.most_common(100))

[('deutschland', 2721), ('berlin', 1584), ('corona', 1416), ('usa', 1216), ('europa', 1073), ('deutschlands', 961), ('hamburg', 807), ('münchen', 738), ('erde', 721), ('stadt', 644), ('österreich', 618), ('china', 558), ('köln', 529), ('frankreich', 497), ('bayern', 486), ('schweiz', 483), ('italien', 452), ('europas', 432), ('russland', 395), ('wien', 385), ('new york', 380), ('paris', 353), ('japan', 343), ('london', 313), ('spanien', 309), ('indien', 304), ('england', 297), ('afrika', 290), ('australien', 289), ('stuttgart', 288), ('amerika', 287), ('schweden', 286), ('frankfurt', 279), ('düsseldorf', 279), ('leipzig', 267), ('afghanistan', 262), ('hause', 261), ('polen', 254), ('türkei', 252), ('israel', 250), ('essen', 244), ('großbritannien', 240), ('den usa', 226), ('nrw', 217), ('dortmund', 209), ('infos', 209), ('staaten', 199), ('bremen', 198), ('asien', 195), ('griechenland', 191), ('kanada', 190), ('kalifornien', 182), ('hollywood', 181), ('rom', 180), ('dresden', 175), ('p

Selecting only multi word terms in recognized locations reveals more meaningful entities, but more errors too: `home office`, `instagram kanal`, `youtube kanal` etc.

In [48]:
cnt = Counter([y.lower() for x in df_ents.location.apply(lambda x: set(x)).values for y in x if len(y.split())>1])
print(cnt.most_common(100))

[('new york', 380), ('den usa', 226), ('home office', 132), ('corona pandemie', 122), ('der schweiz', 114), ('los angeles', 94), ('silicon valley', 93), ('baden württemberg', 87), ('rheinland pfalz', 76), ('vereinigten staaten', 71), ('st. pauli', 63), ('bundesrepublik deutschland', 61), ('san francisco', 59), ('instagram kanal', 57), ('youtube kanal', 55), ('las vegas', 49), ('new yorker', 48), ('new york city', 48), ('frankfurt main', 48), ('höhle der löwen', 47), ('tel aviv', 44), ('sachsen anhalt', 42), ('schleswig holstein', 38), ('nordrhein westfalen', 37), ('sri lanka', 33), ('mecklenburg vorpommern', 31), ('corona welle', 28), ('costa rica', 28), ('bayern münchen', 27), ('new orleans', 26), ('wall street', 25), ('mount everest', 24), ('berliner mauer', 22), ('new work', 22), ('kölner dom', 21), ('erzbistum köln', 21), ('hong kong', 21), ('saudi arabien', 18), ('weiße haus', 18), ('sand meer', 17), ('corona lockdowns', 17), ('gran canaria', 17), ('st. petersburg', 17), ('tik tok

## Organizations
- `Corona` again ranks very high and again is misclassified.
- I notice a lot of meaningful terms that actually are organisations.
- There are a lot of errors too.

In [51]:
cnt = Counter([y.lower() for x in df_ents.organisation.apply(lambda x: set(x)).values for y in x])
print(cnt.most_common(100))

[('corona', 680), ('instagram', 652), ('corona pandemie', 571), ('social media', 509), ('apple', 444), ('spd', 435), ('cdu', 388), ('deutschland', 384), ('infos', 384), ('kirche', 361), ('bundesliga', 346), ('ard', 341), ('ddr', 332), ('bundestag', 314), ('fdp', 303), ('zdf', 287), ('spiegel', 275), ('wdr', 259), ('afd', 249), ('amazon', 248), ('dfb', 222), ('bundeswehr', 218), ('microsoft', 211), ('partei', 202), ('ndr', 188), ('hört', 176), ('die grünen', 175), ('disney', 172), ('grüne', 171), ('rtl', 167), ('tesla', 164), ('grünen', 162), ('csu', 156), ('zentrum', 155), ('spotify', 140), ('schalke', 133), ('bvb', 123), ('who', 121), ('linkedin', 120), ('fifa', 119), ('pandemie', 118), ('hsv', 117), ('gehst', 117), ('hier', 114), ('swr', 113), ('bmw', 112), ('adac', 112), ('taliban', 111), ('gemeinschaft', 111), ('porsche', 108), ('wir', 107), ('union', 103), ('bayern münchen', 103), ('nato', 101), ('ikea', 100), ('nationalmannschaft', 96), ('sony', 95), ('westdeutscher rundfunk', 95

In [52]:
cnt = Counter([y.lower() for x in df_ents.organisation.apply(lambda x: set(x)).values for y in x if len(y.split())>1])
print(cnt.most_common(100))

[('corona pandemie', 571), ('social media', 509), ('die grünen', 175), ('bayern münchen', 103), ('westdeutscher rundfunk', 95), ('werder bremen', 87), ('borussia dortmund', 83), ('katholischen kirche', 75), ('deutsche bahn', 70), ('europäischen union', 67), ('vfb stuttgart', 66), ('hertha bsc', 63), ('europäische union', 62), ('deutschen bahn', 61), ('deutschen nationalmannschaft', 61), ('new york times', 60), ('union berlin', 60), ('katholische kirche', 59), ('eintracht frankfurt', 57), ('deutschen bundestag', 55), ('start ups', 52), ('new work', 50), ('deutschland österreich', 49), ('frankfurter buchmesse', 48), ('deutsche nationalmannschaft', 48), ('bayerischer rundfunk', 48), ('vereinten nationen', 47), ('weimarer republik', 47), ('corona krise', 45), ('fridays for future', 45), ('vfl wolfsburg', 45), ('covid pandemie', 44), ('wir uns', 44), ('bündnis die grünen', 43), ('borussia mönchengladbach', 42), ('bayer leverkusen', 42), ('mehr infos', 41), ('free agency', 41), ('saudi arabi

## Miscellaneous entities
- As to be expected we get a wide variety of terms for entity type `MISC`.
- We get a lot of terms with pronouns, as well as fragments of sentences: `wir freuen uns`, `viel spaß`, `send voice message anchor.fm` etc.

In [50]:
cnt = Counter([y.lower() for x in df_ents.misc.apply(lambda x: set(x)).values for y in x])
print(cnt.most_common(100))

[('zeit', 3563), ('deutschen', 2263), ('deutsche', 2005), ('youtube', 1871), ('facebook', 1723), ('weihnachten', 1273), ('internet', 1105), ('sozialen', 805), ('pandemie', 778), ('twitter', 747), ('berliner', 739), ('deutscher', 731), ('corona krise', 690), ('hört', 679), ('videos', 650), ('soziale', 649), ('itunes', 528), ('lockdown', 511), ('infos', 491), ('netflix', 466), ('hamburger', 458), ('virus', 439), ('amazon', 433), ('schweizer', 419), ('amerikanischen', 398), ('google', 388), ('europäischen', 380), ('podcast', 377), ('millionen', 374), ('anschluss', 373), ('deutschsprachigen', 365), ('euro', 359), ('twitter.com', 350), ('spiele', 342), ('business', 340), ('deutsch', 333), ('amerikanische', 324), ('bundestagswahl', 317), ('whatsapp', 303), ('talk', 287), ('spotify', 287), ('instagram', 278), ('europäische', 269), ('krieg', 265), ('ostern', 257), ('englischen', 254), ('corona', 247), ('kölner', 247), ('quarantäne', 242), ('bibel', 238), ('einfach', 233), ('englisch', 229), ('

In [53]:
cnt = Counter([y.lower() for x in df_ents.misc.apply(lambda x: set(x)).values for y in x if len(y.split())>1])
print(cnt.most_common(100))

[('corona krise', 690), ('das jahr', 175), ('star wars', 163), ('herzlich willkommen', 159), ('das leben', 155), ('corona virus', 153), ('corona zeit', 151), ('viel spaß', 151), ('der podcast', 149), ('champions league', 147), ('die zeit', 123), ('spotify open.spotify.com', 103), ('dieses jahr', 100), ('zweiten weltkrieg', 96), ('game thrones', 87), ('fake news', 85), ('special guest', 81), ('höchste zeit', 79), ('home office', 77), ('die zukunft', 76), ('facebook instagram', 76), ('new work', 76), ('apple podcast', 75), ('mein name', 72), ('send voice message anchor.fm', 69), ('ein interview', 68), ('olympischen spiele', 67), ('die wahrheit', 67), ('die angst', 66), ('youtube videos', 66), ('super league', 65), ('star trek', 63), ('olympischen spielen', 62), ('fridays for future', 61), ('youtube video', 61), ('feier des tages', 61), ('hör dir', 60), ('wir freuen uns', 59), ('der weg', 56), ('freut euch', 54), ('true crime podcast', 53), ('frohe weihnachten', 52), ('corona lockdown', 5

# Conclusions

- I assess the results as promising. 
- The detected entities in many cases are correct and meaningful. 
- There are a lot of errors and misclassifications too. The resulting collections of terms need further cleaning, e.g. by removing stop words. 
- I assume that a «perfect» cleaning isn't feasible with reasonable effort. The variety of errors seems too wide. Nonetheless, the entities could prove very useful.
- The detection is fast – processing the whole data set just took 15 minutes (Mac mini M1).