# Podcast Metadata
This notebook contains the code used to generate a metadata table for each podcast transcript in our data. This will be especially useful for associating podcasts with their publish date to explore temporal dynamics.

## Imports
All necessary imports.

In [None]:
import os # file handling
import re # parsing dates
from datetime import datetime, timedelta # parsing dates
import glob # file handling
import pandas as pd # metadata table construction
from collections import defaultdict

## Parsing Dates
Date tags in filenames are in a slightly jumbled format. The functions below turn these tags into `datetime` objects.

In [None]:
def extract_tag(filepath):
    """Split the trailing "(digits)" tag of a transcript filename into parts.

    Returns a ``(tag, year)`` pair. When the digit string is longer than four
    characters, its last four characters are returned as an integer year and
    the remainder as the tag. Otherwise the whole digit string is the tag and
    the year is ``None``. A filename that does not end in "(<digits>).txt"
    yields ``(None, None)``.
    """
    filename = os.path.basename(filepath)
    # the date tag sits in the parentheses right before the extension
    found = re.search(r'\((\d+)\)\.txt$', filename)
    if found is None:
        return None, None

    digits = found.group(1)
    year_part = digits[-4:]
    has_year = len(digits) > 4 and year_part.isdigit()
    if not has_year:
        # only the day/month portion is present
        return digits, None
    return digits[:-4], int(year_part)

In [None]:
def generate_all_possible_dates(years_back=2, most_recent=datetime(2025,7,1)):
    """Enumerate every (tag, date) pair in the window ending at `most_recent`.

    The window covers ``365 * years_back`` days. Dates are produced newest
    first, starting at `most_recent` itself and stopping before the start of
    the window. Each tag is the month number immediately followed by the day
    number, with no padding or separator.
    """
    one_day = timedelta(days=1)
    cutoff = most_recent - timedelta(days=365 * years_back)

    def _walk_back():
        # step one day at a time from the newest date towards the cutoff
        current = most_recent
        while current > cutoff:
            yield current
            current -= one_day

    return [(f"{day.month}{day.day}", day) for day in _walk_back()]

## Creating Metadata Table

The code below builds a table containing each transcript's filepath, podcast name, episode name, and release date; the table is exported to a CSV file at the end.

### Helper Functions

In [None]:
def extract_podcast_name(path):
    """Return a display name for the podcast from the transcript's parent folder.

    Hyphens in the folder name become spaces and the result is title-cased.
    """
    parent_dir = os.path.dirname(path)
    folder = os.path.basename(parent_dir)
    return " ".join(folder.split("-")).title()

def extract_episode_name(path):
    """Return the episode title encoded in a transcript filename.

    The directory part and the file extension are dropped, then a trailing
    parenthesised group (with any whitespace before it) is removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    # strip the final "(...)" suffix, if the name ends with one
    return re.sub(r'\s*\([^)]*\)$', '', stem)

def extract_date_from_filename(filepath):
    """Look up the date recorded for a transcript, or ``None`` if there is none.

    Reads the module-level ``root`` and ``filename_to_date``; the lookup key
    is `filepath` expressed relative to ``root``.
    """
    relative_key = os.path.relpath(filepath, root)
    return filename_to_date.get(relative_key)

### Generate Table

In [None]:
# Project root; entries in `filename_to_date` are keyed by paths relative to it.
root = r"C:\Users\sfont\podcast-project"
directory = os.path.join(root, "data", "podcasts")
# os.walk yields `directory` itself first; drop it so only sub-folders remain.
pod_dirs = [x[0] for x in os.walk(directory)][1:]

# Candidate (tag, date) pairs, newest first. The list is identical for every
# podcast, so it is built once (and skipped when there are no folders at all).
all_possible_dates = (
    generate_all_possible_dates(years_back=3, most_recent=datetime(2025,6,30))
    if pod_dirs
    else []
)

filename_to_date = {}

for pod in pod_dirs:
    # Transcripts directly inside this folder, most recently modified first.
    filenames = sorted(
        glob.glob(os.path.join(pod, "*.txt")),
        key=os.path.getmtime,
        reverse=True,
    )
    tag_years = [extract_tag(fn) for fn in filenames]

    # tag -> [[date, times_assigned], ...], newest date first; reset for each
    # podcast. The counter records how many files received a date; it is not
    # consulted when choosing one.
    tag_to_dates = defaultdict(list)
    for tag, dt in all_possible_dates:
        tag_to_dates[tag].append([dt, 0])

    assigned_dates = []
    for fn, (tag, year) in zip(filenames, tag_years):
        if tag is None:
            # No "(digits).txt" suffix: leave the date empty without a warning.
            assigned_dates.append(None)
            continue

        candidates = tag_to_dates[tag]
        if year is not None:
            # Explicit year: take the newest candidate falling in that year.
            chosen = next((c for c in candidates if c[0].year == year), None)
        else:
            # No year in the tag: take the newest date carrying this tag.
            chosen = candidates[0] if candidates else None

        if chosen is None:
            assigned_dates.append(None)
            print(f"Could not assign date for tag {tag}{year if year else ''} in file {fn}")
            continue

        assigned_dates.append(chosen[0])
        chosen[1] += 1

    date_dict = {os.path.relpath(f, root): d for f, d in zip(filenames, assigned_dates)}
    filename_to_date.update(date_dict)


# One metadata row per transcript found exactly one folder below `directory`.
files = glob.glob(os.path.join(directory, "*", "*.txt"))
metadata = [
    {
        'filename': file,
        'podcast_name': extract_podcast_name(file),
        'episode_title': extract_episode_name(file),
        'publish_date': extract_date_from_filename(file),
    }
    for file in files
]

# Pin the column order so the table keeps its header even when no files match.
df = pd.DataFrame(
    metadata,
    columns=['filename', 'podcast_name', 'episode_title', 'publish_date'],
)

### Export Table

In [None]:
# Write the metadata table to the working directory, leaving out the index column.
output_csv = "all_podcasts_metadata.csv"
df.to_csv(output_csv, index=False)