#test

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import re


# Set style for better looking plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Load 2024 data files
p3_2024 = pd.read_csv('../data/p3_2024.csv')
p4_2024 = pd.read_csv('../data/p4_2024.csv')
p6_2024 = pd.read_csv('../data/p6_2024.csv')
p3_2025 = pd.read_csv('../data/p3_oct_2025.csv')
p4_2025 = pd.read_csv('../data/p4_oct_2025.csv')
p6_2025 = pd.read_csv('../data/p6_oct_2025.csv')


In [None]:

def add_host_to_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    pattern = r'\b[Vv](?:æ|ae)rt(?:er)?[:,]? +(?![Vv](?:æ|ae)rt(?:er)?)((?:\(? *[A-ZÆØÅ][a-zæøå]*\)?,?)*(?: *(?:og|\&)(?:\(? *[A-ZÆØÅ][a-zæøå]*\)?,?)+)?)'
    df["hosts"] = df["episodeDescription"].str.extract(pattern)
    return df



# Comma cases ## Problem is that some cases have 3 hosts and therefor have a comma so this would destroy cases where that was relevant
# værter, Oliver Seppo og Thomas Bugge, failed
# værter, Thomas Bugge og Oliver Seppo, 
# værter, Andrew Moyo og Mo, failed
# værter, Marie Hobitz og Mo, failed
# værter, Kasper Reippurt og Nynne Givskov, failed


# known cases where this regex fails ## they are all to open ended a regex could take data we are not interested in
# værter Thomas Bugge og Oliver Seppo failed
# værter (Oliver Seppo og Thomas Bugge), failed
# værter Oliver Seppo og Thomas Bugge! failed
# Værter: Nikkie Niyibigira & Laila Aden Hjortshøj
# værter, Oliver Seppo og Thomas Bugge, Chris Anker failed
# Værter: Mathilde Muus og Anton Ringdal failed
# Værter: Liva Manghezi, Lasse Knudsen og Andreas Kousholt failed
# Vært: Julie Rahbek failed
# Vært: Frederik Birch failed
# Værter: Nikkie Niyibigira & Laila Aden Hjortshøj failed




In [None]:
p3_host_2024 = add_host_to_dataframe(p3_2024)


# a manual fix for a spelling mistake in extracted episodeDescription original "TV-Vært og Melodi Grand Prix"
p3_host_2024["hosts"] = p3_host_2024["hosts"].replace("og Melodi Grand Prix", "")


#p3_host_2024.to_csv('../data/p3_2024.csv', index=False)

p4_host_2024 = add_host_to_dataframe(p4_2024)
#p4_host_2024.to_csv('../data/p4_2024.csv', index=False)

p6_host_2024 = add_host_to_dataframe(p6_2024)
#p6_host_2024.to_csv('../data/p6_2024.csv', index=False)



p3_host_2025 = add_host_to_dataframe(p3_2025)
#p3_host_2025.to_csv('../data/p3_oct_2025.csv', index=False)

p4_host_2025 = add_host_to_dataframe(p4_2025)
#p4_host_2025.to_csv('../data/p4_oct_2025.csv', index=False)

p6_host_2025 = add_host_to_dataframe(p6_2025)
#p6_host_2025.to_csv('../data/p6_oct_2025.csv', index=False)






In [6]:
missing_hosts_count = p3_host_2024.loc[p3_host_2024["hosts"].isna(), "episodeDescription"].nunique()
print(f"2024 p3 we have {missing_hosts_count} unique episodeDescriptions with no hosts")

missing_hosts_count = p4_host_2024.loc[p4_host_2024["hosts"].isna(), "episodeDescription"].nunique()
print(f"2024 p4 we have {missing_hosts_count} unique episodeDescriptions with no hosts")

missing_hosts_count = p6_host_2024.loc[p6_host_2024["hosts"].isna(), "episodeDescription"].nunique()
print(f"2024 p6 we have {missing_hosts_count} unique episodeDescriptions with no hosts")

missing_hosts_count = p3_host_2025.loc[p3_host_2025["hosts"].isna(), "episodeDescription"].nunique()
print(f"2025 p3 we have {missing_hosts_count} unique episodeDescriptions with no hosts")

missing_hosts_count = p4_host_2025.loc[p4_host_2025["hosts"].isna(), "episodeDescription"].nunique()
print(f"2025 p4 we have {missing_hosts_count} unique episodeDescriptions with no hosts")

missing_hosts_count = p6_host_2025.loc[p6_host_2025["hosts"].isna(), "episodeDescription"].nunique()
print(f"2025 p6 we have {missing_hosts_count} unique episodeDescriptions with no hosts")



2024 p3 we have 519 unique episodeDescriptions with no hosts
2024 p4 we have 249 unique episodeDescriptions with no hosts
2024 p6 we have 216 unique episodeDescriptions with no hosts
2025 p3 we have 95 unique episodeDescriptions with no hosts
2025 p4 we have 55 unique episodeDescriptions with no hosts
2025 p6 we have 36 unique episodeDescriptions with no hosts


In [7]:
have_hosts_count = p3_host_2024.loc[p3_host_2024["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2024 p3 we have {have_hosts_count} unique episodeDescriptions with hosts")

have_hosts_count = p4_host_2024.loc[p4_host_2024["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2024 p4 we have {have_hosts_count} unique episodeDescriptions with hosts")

have_hosts_count = p6_host_2024.loc[p6_host_2024["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2024 p6 we have {have_hosts_count} unique episodeDescriptions with hosts")

have_hosts_count = p3_host_2025.loc[p3_host_2025["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2025 p3 we have {have_hosts_count} unique episodeDescriptions with hosts")

have_hosts_count = p4_host_2025.loc[p4_host_2025["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2025 p4 we have {have_hosts_count} unique episodeDescriptions with hosts")

have_hosts_count = p6_host_2025.loc[p6_host_2025["hosts"].notnull(), "episodeDescription"].nunique()
print(f"2025 p6 we have {have_hosts_count} unique episodeDescriptions with hosts")


2024 p3 we have 855 unique episodeDescriptions with hosts
2024 p4 we have 252 unique episodeDescriptions with hosts
2024 p6 we have 301 unique episodeDescriptions with hosts
2025 p3 we have 73 unique episodeDescriptions with hosts
2025 p4 we have 34 unique episodeDescriptions with hosts
2025 p6 we have 27 unique episodeDescriptions with hosts


In [14]:
p3_host_2024_non_host = p3_host_2024.loc[p3_host_2024["hosts"].isna()]
p4_host_2024_non_host = p4_host_2024.loc[p4_host_2024["hosts"].isna()]
p6_host_2024_non_host = p6_host_2024.loc[p6_host_2024["hosts"].isna()]
p3_host_2025_non_host = p3_host_2025.loc[p3_host_2025["hosts"].isna()]
p4_host_2025_non_host = p4_host_2025.loc[p4_host_2025["hosts"].isna()]
p6_host_2025_non_host = p6_host_2025.loc[p6_host_2025["hosts"].isna()]


<class 'pandas.core.frame.DataFrame'>
Index: 102078 entries, 0 to 135266
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   localTime           102078 non-null  object
 1   channel             102078 non-null  object
 2   episodeTitle        102078 non-null  object
 3   episodeStartTime    102078 non-null  object
 4   episodeDescription  40887 non-null   object
 5   trackTitle          102062 non-null  object
 6   artistString        102045 non-null  object
 7   gender              94929 non-null   object
 8   hosts               0 non-null       object
dtypes: object(9)
memory usage: 7.8+ MB


In [None]:
# Collect all non-host dataframes
dfs = [
    p3_host_2024_non_host,
    p4_host_2024_non_host,
    p6_host_2024_non_host,
    p3_host_2025_non_host,
    p4_host_2025_non_host,
    p6_host_2025_non_host
]

# Concatenate
combined = pd.concat(dfs, ignore_index=True)

# Keep only episodeDescription + new host column
df_unique = (
    combined[['episodeDescription']]
    .drop_duplicates()
    .assign(host=np.nan)
)

# Save to CSV
df_unique.to_csv("../data/episode_descriptions_for_annotation.csv", index=False)
df_unique.head()

Unnamed: 0,episodeDescription,host
0,,
467,Du smider bare jakken i entréen - der er åbent...,
496,"Forelsk dig i den nyeste musik, og vær med i k...",
675,Dagens drømmehold er sat! Det består af værten...,
797,"I aftenens program diskuterer Laila og Mo, hvo...",
