# Process Short Clips Transcripts

In [68]:
import os
import re
import pandas as pd

def load_transcripts(transcripts_dir, host="huberman", length="short"):
   
    all_data = []

    # --- Load files ---
    for filename in os.listdir(transcripts_dir):
        if not filename.endswith(".txt"):
            continue
        
        path = os.path.join(transcripts_dir, filename)  # full path here
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        content_lines = []
        durations = []
        for line in lines:
            match = re.match(r"(\d{1,3}:\d{2})\s+(.*)", line)
            if match:
                timestamp = match.group(1)
                text = match.group(2)
                try:
                    mins, secs = map(int, timestamp.split(":"))
                    durations.append(mins * 60 + secs)
                except:
                    pass
                content_lines.append(text)

        clean_text = " ".join(content_lines)
        est_duration = (max(durations) - min(durations)) // 60 if durations else None

        episode_title = os.path.splitext(filename)[0]
        all_data.append({
            "title": episode_title,
            "host": host,
            "transcript": clean_text,
            "duration_min": est_duration or 15
        })

    print(f"Loaded {len(all_data)} transcripts")
    if len(all_data) > 0:
        print(f"Sample keys in first item: {all_data[0].keys()}")

    df = pd.DataFrame(all_data)
    if "title" not in df.columns:
        print("Warning: 'title' column missing!")

    df["length"] = length

    return df


In [69]:
transcripts_dir = "transcripts/huberman_transcripts/short_transcripts/"
df1 = load_transcripts(transcripts_dir)

Loaded 25 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


In [70]:
transcripts_dir = "transcripts/huberman_transcripts/short_transcripts_2/"
df2 = load_transcripts(transcripts_dir)

Loaded 50 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


In [71]:
transcripts_dir = "transcripts/huberman_transcripts/short_transcripts_3/"
df3 = load_transcripts(transcripts_dir)

Loaded 50 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


In [72]:
transcripts_dir = "transcripts/huberman_transcripts/short_transcripts_4/"
df4 = load_transcripts(transcripts_dir)

df4

Loaded 20 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


Unnamed: 0,title,host,transcript,duration_min,length
0,The Truth About Dietary Cholesterol | Dr. Pete...,huberman,- Most people are operating that eating satura...,6,short
1,How to Use Caffeine & Coffee to Improve Focus ...,huberman,- I know many people are caffeine can improve ...,4,short
2,How Nicotine Impacts Your Brain & Enhances Foc...,huberman,- Today it's really simple. Nicotine only bind...,5,short
3,How to Reach Your Goals with a Science-Backed ...,huberman,- What's the link between and how can we lever...,12,short
4,"How to Quit Smoking, Vaping or Dipping Tobacco",huberman,- Vaping is actually harder to quit than cigar...,16,short
5,What to Do & Not Do When Setting Goals | Dr. E...,huberman,"- The simple question is, can I use this finis...",8,short
6,Simple Tool to Make Better Food Choices | Jeff...,huberman,"- How should people - So yeah, I have what and...",10,short
7,A Simple Test for Gauging Recovery & Workout “...,huberman,- If the goal is to challenge muscles and one ...,9,short
8,The Ideal Length of Time for Focused Work,huberman,- The question I often get is how long should ...,5,short
9,How Does Alcohol Impact Your Gut Microbiome & ...,huberman,"- For those of you that I really, I like to sa...",9,short


In [73]:
transcripts_dir = "transcripts/huberman_transcripts/short_clip_transcripts/"
df5 = load_transcripts(transcripts_dir)

df5

Loaded 36 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


Unnamed: 0,title,host,transcript,duration_min,length
0,The Truth About Seed Oils | Dr. Mark Hyman & D...,huberman,what's your view on seed oils oi no just you k...,7,short
1,How to Maximize Muscle Protein Synthesis | Ala...,huberman,If you can give an acrosstheboard recommendati...,15,short
2,Impact of Lockdowns & Vaccine Mandates on Publ...,huberman,big segments of the public feel like they caug...,22,short
3,How to Beat Social Media Addiction | Dr. Anna ...,huberman,"So first of all, social media um how addicting...",9,short
4,Best Tools for Managing Stress & Addiction | R...,huberman,at the heart of addiction but also at the hear...,11,short
5,How to Improve Serotonin Levels for Depression...,huberman,having adequate levels of serotonin is absolut...,10,short
6,"Dopamine Baseline, Impulsivity & Addiction | ...",huberman,i and many listeners of this podcast are obses...,8,short
7,Simple Practice to Lower Stress While Mourning...,huberman,what do we know about the way to let emotion o...,6,short
8,The Science of First World Problems | Michael ...,huberman,life feels hard it's like work is hard uh you ...,11,short
9,How to Make America Healthy Again | Dr. Mark H...,huberman,what do you think it's going to take to make A...,6,short


In [74]:
# Example: combine list of dfs vertically
df_combined = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df_combined

Unnamed: 0,title,host,transcript,duration_min,length
0,Exercises to Improve Your Sprinting & Running ...,huberman,what are some ways of doing exercises in the g...,14,short
1,The Truth About Fasting for Women | Dr. Stacy...,huberman,"Andrew Huberman: Fasting. Stacy Sims: Oh, yes....",8,short
2,8 Pillars for Building Your Immune System | Dr...,huberman,if you were to think about the the major pilla...,4,short
3,Why Men & Women Show Attraction Differently | ...,huberman,what are the data on how females signal um let...,12,short
4,"How & Why to Skip for Speed, Mobility & Health...",huberman,yesterday we didn't Sprint but we did a lot of...,13,short
...,...,...,...,...,...
176,Daily Habits for Increasing Grit & Resilience ...,huberman,i want to distinguish between daily self-induc...,15,short
177,Simple Tool to Prevent Relapse in Recovery | R...,huberman,i think recovery is it's not about finding a l...,9,short
178,How to Change Your Brain & Increase Neuroplast...,huberman,as we get into adulthood most of our neural ma...,10,short
179,The Truth About Fluoride | Dr. Staci Whitman &...,huberman,let's talk about fluoride I've already been ac...,23,short


In [75]:
short_clips_combined_final = df_combined.drop_duplicates(subset=['title'])
short_clips_combined_final

Unnamed: 0,title,host,transcript,duration_min,length
0,Exercises to Improve Your Sprinting & Running ...,huberman,what are some ways of doing exercises in the g...,14,short
1,The Truth About Fasting for Women | Dr. Stacy...,huberman,"Andrew Huberman: Fasting. Stacy Sims: Oh, yes....",8,short
2,8 Pillars for Building Your Immune System | Dr...,huberman,if you were to think about the the major pilla...,4,short
3,Why Men & Women Show Attraction Differently | ...,huberman,what are the data on how females signal um let...,12,short
4,"How & Why to Skip for Speed, Mobility & Health...",huberman,yesterday we didn't Sprint but we did a lot of...,13,short
...,...,...,...,...,...
140,How Does Blue Light & Other Light Affect Your ...,huberman,- So now let's talk about what I'm calling cri...,9,short
141,Testosterone & Testosterone Replacement Therap...,huberman,- Would love to talk a little bit about hormon...,16,short
142,What Alcohol Does to Your Brain | Dr. Andrew H...,huberman,"- Because of the structure of alcohol, it is w...",8,short
143,How Does Alcohol Increase the Risk of Cancer? ...,huberman,- One of the really but that's extremely well ...,9,short


# Load Entire Episodes

In [76]:
transcripts_dir = "transcripts/huberman_transcripts/full_episode_transcripts/"
huberman_full_episodes_df = load_transcripts(transcripts_dir, host="huberman", length="full")

Loaded 304 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


In [77]:
transcripts_dir = "transcripts/peter_attia_transcripts/transcripts/"
peter_attia_full_episodes_df = load_transcripts(transcripts_dir, host="peter_attia", length="full")

Loaded 35 transcripts
Sample keys in first item: dict_keys(['title', 'host', 'transcript', 'duration_min'])


# Concat all

In [78]:
all_combined_final = pd.concat([peter_attia_full_episodes_df, huberman_full_episodes_df, short_clips_combined_final], ignore_index=True)
all_combined_final

Unnamed: 0,title,host,transcript,duration_min,length
0,197 - The science of obesity & how to improve ...,peter_attia,hey everyone welcome to the drive podcast i'm ...,141,full
1,"#114 – Eileen White, Ph.D. Autophagy, fasting,...",peter_attia,hey everyone welcome to the drive podcast I'm ...,118,full
2,"#14 – Robert Lustig, M.D., M.S.L. fructose, pr...",peter_attia,[Music] hey everyone welcome to the Peter Atia...,83,full
3,When time-restricted feeding can be problemati...,peter_attia,i think most of the benefit of time-restricted...,4,full
4,"#59–Jason Fung, M.D Fasting as an antidote to ...",peter_attia,hey everyone welcome to the Peter Atia drive I...,161,full
...,...,...,...,...,...
416,How Does Blue Light & Other Light Affect Your ...,huberman,- So now let's talk about what I'm calling cri...,9,short
417,Testosterone & Testosterone Replacement Therap...,huberman,- Would love to talk a little bit about hormon...,16,short
418,What Alcohol Does to Your Brain | Dr. Andrew H...,huberman,"- Because of the structure of alcohol, it is w...",8,short
419,How Does Alcohol Increase the Risk of Cancer? ...,huberman,- One of the really but that's extremely well ...,9,short


# Scale duration so it doesnt mess with cosine

In [79]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

all_combined_final["length_scaled"] = scaler.fit_transform(all_combined_final[["duration_min"]])

In [80]:
all_combined_final

Unnamed: 0,title,host,transcript,duration_min,length,length_scaled
0,197 - The science of obesity & how to improve ...,peter_attia,hey everyone welcome to the drive podcast i'm ...,141,full,0.501805
1,"#114 – Eileen White, Ph.D. Autophagy, fasting,...",peter_attia,hey everyone welcome to the drive podcast I'm ...,118,full,0.418773
2,"#14 – Robert Lustig, M.D., M.S.L. fructose, pr...",peter_attia,[Music] hey everyone welcome to the Peter Atia...,83,full,0.292419
3,When time-restricted feeding can be problemati...,peter_attia,i think most of the benefit of time-restricted...,4,full,0.007220
4,"#59–Jason Fung, M.D Fasting as an antidote to ...",peter_attia,hey everyone welcome to the Peter Atia drive I...,161,full,0.574007
...,...,...,...,...,...,...
416,How Does Blue Light & Other Light Affect Your ...,huberman,- So now let's talk about what I'm calling cri...,9,short,0.025271
417,Testosterone & Testosterone Replacement Therap...,huberman,- Would love to talk a little bit about hormon...,16,short,0.050542
418,What Alcohol Does to Your Brain | Dr. Andrew H...,huberman,"- Because of the structure of alcohol, it is w...",8,short,0.021661
419,How Does Alcohol Increase the Risk of Cancer? ...,huberman,- One of the really but that's extremely well ...,9,short,0.025271


In [81]:
# import yake

# def extract_keywords(text):
#     kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=10)
#     keywords = kw_extractor.extract_keywords(text)
#     return [kw for kw, score in keywords]



# all_combined_final["keywords"] = all_combined_final["transcript"].apply(extract_keywords)
# all_combined_final

# Add summary

In [82]:
# from transformers import pipeline
# from nltk.tokenize import sent_tokenize
# import nltk
# nltk.download("punkt", quiet=True)

# # Load summarization pipeline (this can take a few seconds)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def summarize_start_end(text, n_sentences=5):
#     """
#     Summarize the first and last N sentences of a transcript using a transformer model.
#     """
#     sentences = sent_tokenize(text)
    
#     if len(sentences) <= 2 * n_sentences:
#         start = " ".join(sentences[:len(sentences)//2])
#         end = " ".join(sentences[len(sentences)//2:])
#     else:
#         start = " ".join(sentences[:n_sentences])
#         end = " ".join(sentences[-n_sentences:])
    
#     snippet = f"{start} {end}"
    
#     # Limit snippet size for model input (max tokens ~1024 for BART)
#     max_input_chars = 1024
#     snippet = snippet[:max_input_chars]

#     try:
#         summary = summarizer(snippet, max_length=100, min_length=25, do_sample=False)[0]["summary_text"]
#     except Exception as e:
#         summary = "Summarization failed"
    
#     return summary

# all_combined_final["summary_edge"] = all_combined_final["transcript"].apply(summarize_start_end)

# Add Tags

In [83]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# import re

# def extract_tags_from_transcript(episode_text, top_n=5):
#     # Return empty list if input is None or empty after stripping whitespace
#     if not episode_text or not episode_text.strip():
#         return []

#     vectorizer = TfidfVectorizer(
#         stop_words='english',
#         max_features=10000,
#         ngram_range=(1, 3)
#     )

#     try:
#         tfidf_matrix = vectorizer.fit_transform([episode_text])
#     except ValueError:
#         # This happens if the document contains only stop words or is empty
#         return []

#     feature_names = vectorizer.get_feature_names_out()
#     tfidf_scores = tfidf_matrix.toarray()[0]

#     filtered_terms_scores = [
#         (term, score) for term, score in zip(feature_names, tfidf_scores)
#         if len(term) > 2 and re.match(r'^[a-zA-Z\s]+$', term)
#     ]

#     # If no terms after filtering, return empty list
#     if not filtered_terms_scores:
#         return []

#     filtered_terms_scores.sort(key=lambda x: x[1], reverse=True)

#     top_terms = [term for term, _ in filtered_terms_scores[:top_n]]
    
#     return top_terms


In [84]:
#all_combined_final["tags"] = all_combined_final['transcript'].apply(lambda t: extract_tags_from_transcript(t, top_n=10))

# Transformer Text embedding

In [85]:
from sentence_transformers import SentenceTransformer
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import normalize

nltk.download('stopwords')
nltk.download('wordnet')

# Load sentence transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

# --- CLEAN TITLE ---
def clean_title(title):
    title = title.lower()
    title = re.sub(r'\d+', '', title)
    title = re.sub(r'[^a-z\s]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

# --- CLEAN TRANSCRIPT ---
def clean_transcript(raw_text: str) -> str:
    if not isinstance(raw_text, str):
        return ""
    
    text = raw_text.lower()

    # Remove timestamps like 00:01, 0:01:23, [00:01], etc.
    text = re.sub(r'\[?\d{1,2}:\d{2}(?::\d{2})?\]?', '', text)

    # Remove repeated words (e.g., "i think i think")
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text, flags=re.IGNORECASE)

    # Remove artifacts like [inaudible], --, extra spaces
    text = re.sub(r'\[.*?\]', '', text)        # Remove [inaudible], [laughs], etc.
    text = re.sub(r'--+', ' ', text)           # Replace long dashes
    text = re.sub(r'\s+', ' ', text)           # Normalize whitespace
    
    # preprocess text
    text = preprocess(text)

    return text.strip()


# --- EMBED METADATA ---
def embed_metadata(row):
    title = clean_title(row['title']) if isinstance(row.get('title'), str) and row['title'].strip() else ''
    host = row['host'].strip() if isinstance(row.get('host'), str) and row['host'].strip() else ''
    tags = ' '.join(row['tags']) if isinstance(row.get('tags'), list) and len(row['tags']) > 0 else ''
    
    combined_parts = []
    if title:
        combined_parts.append(f"Title: {title}")
    if host:
        combined_parts.append(f"Host: {host}")
    if tags:
        combined_parts.append(f"Tags: {tags}")
    
    combined_text = '. '.join(combined_parts)
    return combined_text if combined_text else None


# --- SPLIT TRANSCRIPT INTO CHUNKS---
def split_into_chunks(text, chunk_size=250, overlap=50):
    '''
    Sliding window embeddings create overlapping chunks that capture context across boundaries. 
    The overlap ensures important information isn't lost at chunk edges.
    '''
    cleaned = clean_transcript(text)
    
    words = cleaned.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i+chunk_size]
        if len(chunk) > 10:  # skip too-short fragments
            chunks.append(" ".join(chunk))
    return chunks


# --- EMBED TRANSCRIPT (TITLE + CHUNK + MEAN POOLING) ---
def embed_transcript_mean(row, weighted=False):
    transcript = row.get('transcript')
    title = row.get('title', '')

    if not isinstance(transcript, str) or not transcript.strip():
        return None

    chunks = split_into_chunks(transcript)
    if title:
        chunks = [f"Title: {title}"] + chunks

    try:
        chunk_embeddings = embedding_model.encode(chunks)
    except Exception as e:
        print(f"Embedding failed for row {row.name}: {e}")
        return None
    
    if weighted:
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=10000,
            ngram_range=(1, 3)
        )
        tfidf_matrix = vectorizer.fit_transform(chunks)
        weights = tfidf_matrix.sum(axis=1).A1  # sum of TF-IDF scores per chunk
        weights /= weights.sum()  # normalize to sum 1

        # WEIGHTED MEAN POOLING
        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
        return normalize(weighted_embedding.reshape(1, -1))[0]
    
    else:
        # MEAN POOLING
        mean_embedding = np.mean(chunk_embeddings, axis=0)
        return normalize(mean_embedding.reshape(1, -1))[0]
   
# --- MAIN EXECUTION ---
embedded_df = all_combined_final.copy()

# Embed metadata
embedded_df['metadata_text'] = embedded_df.apply(embed_metadata, axis=1)
embedded_df['metadata_embedding'] = embedded_df['metadata_text'].apply(lambda x: embedding_model.encode(x) if x else None)

embedded_df['transcript_embedding_mean'] = embedded_df.apply(lambda x: embed_transcript_mean(x, weighted=False) if isinstance(x.get('transcript'), str) else None, axis=1)

embedded_df['transcript_embedding_weighted_mean'] = embedded_df.apply(lambda x: embed_transcript_mean(x, weighted=True) if isinstance(x.get('transcript'), str) else None, axis=1)

# Save the model
embedding_model.save("models/embedding_model")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodrigoazevedo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rodrigoazevedo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
def is_valid_embedding(x):
    return isinstance(x, np.ndarray) and x.shape == (384,)

embedded_df = embedded_df[
    embedded_df['transcript_embedding_mean'].apply(is_valid_embedding) &
    embedded_df['transcript_embedding_weighted_mean'].apply(is_valid_embedding)
].copy()

In [87]:
embedded_df

Unnamed: 0,title,host,transcript,duration_min,length,length_scaled,metadata_text,metadata_embedding,transcript_embedding_mean,transcript_embedding_weighted_mean
0,197 - The science of obesity & how to improve ...,peter_attia,hey everyone welcome to the drive podcast i'm ...,141,full,0.501805,Title: the science of obesity how to improve n...,"[-0.022833344, 0.07802306, -0.032226842, 0.068...","[0.017965756, -0.0331852, 0.0027354055, 0.0559...","[0.01925291836452334, -0.03365230841329864, 0...."
1,"#114 – Eileen White, Ph.D. Autophagy, fasting,...",peter_attia,hey everyone welcome to the drive podcast I'm ...,118,full,0.418773,Title: eileen white phd autophagy fasting and ...,"[-0.040437456, 0.002098971, -0.020459592, -0.0...","[-0.037567433, -0.055975065, -0.0055511263, -0...","[-0.0354052775989637, -0.05567652891876326, -0..."
2,"#14 – Robert Lustig, M.D., M.S.L. fructose, pr...",peter_attia,[Music] hey everyone welcome to the Peter Atia...,83,full,0.292419,Title: robert lustig md msl fructose processed...,"[-0.042696442, 0.0070618135, -0.064535595, 0.0...","[-0.0036058724, -0.08237494, -0.021123055, 0.0...","[-0.0021875247440077223, -0.08259342449547517,..."
3,When time-restricted feeding can be problemati...,peter_attia,i think most of the benefit of time-restricted...,4,full,0.007220,Title: when timerestricted feeding can be prob...,"[0.0016957741, -0.038639758, -0.056927297, 0.0...","[0.003614387, -0.029351646, 0.0112810545, 0.06...","[0.005216625573611037, -0.028937832436603056, ..."
4,"#59–Jason Fung, M.D Fasting as an antidote to ...",peter_attia,hey everyone welcome to the Peter Atia drive I...,161,full,0.574007,Title: jason fung md fasting as an antidote to...,"[-0.069303975, 0.06841706, -0.04217706, 0.0588...","[-0.0111459615, -0.021815524, 0.0053058765, 0....","[-0.010397689449145419, -0.022196597107664997,..."
...,...,...,...,...,...,...,...,...,...,...
416,How Does Blue Light & Other Light Affect Your ...,huberman,- So now let's talk about what I'm calling cri...,9,short,0.025271,Title: how does blue light other light affect ...,"[0.009075403, -0.0018783506, 0.04214326, 0.098...","[0.005855187, -0.025002873, 0.05419137, 0.0915...","[0.00378464498835268, -0.03193023803732182, 0...."
417,Testosterone & Testosterone Replacement Therap...,huberman,- Would love to talk a little bit about hormon...,16,short,0.050542,Title: testosterone testosterone replacement t...,"[-0.06041417, 0.04818749, -0.057231482, -0.031...","[-0.0019385928, -0.056293856, -0.02548357, 0.0...","[0.004932264989533421, -0.05950293917152512, -..."
418,What Alcohol Does to Your Brain | Dr. Andrew H...,huberman,"- Because of the structure of alcohol, it is w...",8,short,0.021661,Title: what alcohol does to your brain dr andr...,"[0.058442254, -0.013256706, -0.023135036, -0.0...","[0.086912274, -0.11602823, -0.029870214, 0.036...","[0.08201410616434474, -0.12933055547196115, -0..."
419,How Does Alcohol Increase the Risk of Cancer? ...,huberman,- One of the really but that's extremely well ...,9,short,0.025271,Title: how does alcohol increase the risk of c...,"[0.09300353, 0.029863423, -0.041747954, 0.0032...","[0.07066936, -0.011708548, -0.058251742, 0.013...","[0.05588628606690752, -0.020930629532909712, -..."


In [88]:
import pickle

# Save
with open("transformers_embedded_podcast_data.pkl", "wb") as f:
    pickle.dump(embedded_df, f)

# TF-IDF vectorizer Only

In [96]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing and cleaning functions same as before

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

def clean_title(title):
    title = title.lower()
    title = re.sub(r'\d+', '', title)
    title = re.sub(r'[^a-z\s]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

def clean_transcript(raw_text: str) -> str:
    if not isinstance(raw_text, str):
        return ""
    
    text = raw_text.lower()
    text = re.sub(r'\[?\d{1,2}:\d{2}(?::\d{2})?\]?', '', text)
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text, flags=re.IGNORECASE)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'--+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = preprocess(text)
    return text.strip()

def embed_metadata_text(row):
    title = clean_title(row['title']) if isinstance(row.get('title'), str) and row['title'].strip() else ''
    host = row['host'].strip() if isinstance(row.get('host'), str) and row['host'].strip() else ''
    tags = ' '.join(row['tags']) if isinstance(row.get('tags'), list) and row['tags'] else ''
    combined = []
    if title:
        combined.append(f"Title: {title}")
    if host:
        combined.append(f"Host: {host}")
    if tags:
        combined.append(f"Tags: {tags}")
    return '. '.join(combined) if combined else ''

def split_into_chunks(text, chunk_size=250, overlap=50):
    cleaned = clean_transcript(text)
    words = cleaned.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        if len(chunk) > 10:
            chunks.append(" ".join(chunk))
    return chunks

# --- FIT vectorizers once on the whole dataset ---

# Fit metadata vectorizer
metadata_texts = embedded_df.apply(embed_metadata_text, axis=1).fillna("")
metadata_vectorizer = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1,3))
metadata_vectorizer.fit(metadata_texts)

# Fit chunk vectorizer on all transcript chunks
all_chunks = []
for transcript in embedded_df['transcript'].dropna():
    all_chunks.extend(split_into_chunks(transcript))
chunk_vectorizer = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1,3))
chunk_vectorizer.fit(all_chunks)

# --- EMBED functions using TF-IDF only ---

def embed_metadata_tfidf(text):
    if not text:
        return None
    vec = metadata_vectorizer.transform([text])
    return normalize(vec).toarray()[0]

def embed_transcript_tfidf_mean(row, weighted=False):
    transcript = row.get('transcript')
    title = row.get('title', '')

    if not isinstance(transcript, str) or not transcript.strip():
        return None

    chunks = split_into_chunks(transcript)
    if title:
        chunks = [f"Title: {title}"] + chunks

    tfidf_matrix = chunk_vectorizer.transform(chunks)

    if weighted:
        weights = tfidf_matrix.sum(axis=1).A1
        if weights.sum() == 0:
            weights = np.ones_like(weights) / len(weights)
        else:
            weights /= weights.sum()
        weighted_vec = np.average(tfidf_matrix.toarray(), axis=0, weights=weights)
        return normalize(weighted_vec.reshape(1, -1))[0]
    else:
        mean_vec = tfidf_matrix.mean(axis=0).A1
        return normalize(mean_vec.reshape(1, -1))[0]

# --- MAIN EXECUTION ---

embedded_df['metadata_text2'] = embedded_df.apply(embed_metadata_text, axis=1)
embedded_df['metadata_embedding_TfidfVectorizer'] = embedded_df['metadata_text2'].apply(embed_metadata_tfidf)

embedded_df['transcript_embedding_mean_TfidfVectorizer'] = embedded_df.apply(
    lambda x: embed_transcript_tfidf_mean(x, weighted=False) if isinstance(x.get('transcript'), str) else None,
    axis=1
)

embedded_df['transcript_embedding_weighted_mean_TfidfVectorizer'] = embedded_df.apply(
    lambda x: embed_transcript_tfidf_mean(x, weighted=True) if isinstance(x.get('transcript'), str) else None,
    axis=1
)

embedded_df = embedded_df.drop('metadata_text2', axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodrigoazevedo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rodrigoazevedo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [97]:
import pickle

# Save vectorizer
with open('vectorizers/metadata_vectorizer.pkl', 'wb') as f:
    pickle.dump(metadata_vectorizer, f)

with open('vectorizers/chunk_vectorizer.pkl', 'wb') as f:
    pickle.dump(chunk_vectorizer, f)


# Save File

In [91]:
import pickle

# Save
with open("embedded_podcast_data.pkl", "wb") as f:
    pickle.dump(embedded_df, f)

In [98]:
embedded_df

Unnamed: 0,title,host,transcript,duration_min,length,length_scaled,metadata_text,metadata_embedding,transcript_embedding_mean,transcript_embedding_weighted_mean,metadata_embedding_TfidfVectorizer,transcript_embedding_mean_TfidfVectorizer,transcript_embedding_weighted_mean_TfidfVectorizer
0,197 - The science of obesity & how to improve ...,peter_attia,hey everyone welcome to the drive podcast i'm ...,141,full,0.501805,Title: the science of obesity how to improve n...,"[-0.022833344, 0.07802306, -0.032226842, 0.068...","[0.017965756, -0.0331852, 0.0027354055, 0.0559...","[0.01925291836452334, -0.03365230841329864, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.004160176688622932, 0.01188641823912653, 0....","[0.004096056364292649, 0.012722903600817069, 0..."
1,"#114 – Eileen White, Ph.D. Autophagy, fasting,...",peter_attia,hey everyone welcome to the drive podcast I'm ...,118,full,0.418773,Title: eileen white phd autophagy fasting and ...,"[-0.040437456, 0.002098971, -0.020459592, -0.0...","[-0.037567433, -0.055975065, -0.0055511263, -0...","[-0.0354052775989637, -0.05567652891876326, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.007601766887527106, 0.03728617366450731, 0....","[0.009293147441094505, 0.039956359186599864, 0..."
2,"#14 – Robert Lustig, M.D., M.S.L. fructose, pr...",peter_attia,[Music] hey everyone welcome to the Peter Atia...,83,full,0.292419,Title: robert lustig md msl fructose processed...,"[-0.042696442, 0.0070618135, -0.064535595, 0.0...","[-0.0036058724, -0.08237494, -0.021123055, 0.0...","[-0.0021875247440077223, -0.08259342449547517,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.006325479031395795, 0.01924090337618289, 0....","[0.006475382630851746, 0.019900861297508902, 0..."
3,When time-restricted feeding can be problemati...,peter_attia,i think most of the benefit of time-restricted...,4,full,0.007220,Title: when timerestricted feeding can be prob...,"[0.0016957741, -0.038639758, -0.056927297, 0.0...","[0.003614387, -0.029351646, 0.0112810545, 0.06...","[0.005216625573611037, -0.028937832436603056, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"#59–Jason Fung, M.D Fasting as an antidote to ...",peter_attia,hey everyone welcome to the Peter Atia drive I...,161,full,0.574007,Title: jason fung md fasting as an antidote to...,"[-0.069303975, 0.06841706, -0.04217706, 0.0588...","[-0.0111459615, -0.021815524, 0.0053058765, 0....","[-0.010397689449145419, -0.022196597107664997,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0018862534221210023, 0.005619946629528033, ...","[0.0013883944818373077, 0.006085805603589657, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,How Does Blue Light & Other Light Affect Your ...,huberman,- So now let's talk about what I'm calling cri...,9,short,0.025271,Title: how does blue light other light affect ...,"[0.009075403, -0.0018783506, 0.04214326, 0.098...","[0.005855187, -0.025002873, 0.05419137, 0.0915...","[0.00378464498835268, -0.03193023803732182, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.011773254225196818, 0.029801260942695872, 0...","[0.013280903204594352, 0.03389155447567327, 0...."
417,Testosterone & Testosterone Replacement Therap...,huberman,- Would love to talk a little bit about hormon...,16,short,0.050542,Title: testosterone testosterone replacement t...,"[-0.06041417, 0.04818749, -0.057231482, -0.031...","[-0.0019385928, -0.056293856, -0.02548357, 0.0...","[0.004932264989533421, -0.05950293917152512, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.008956614061720571, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.009041282265669956, 0.0, 0.0, 0.0, 0.0, 0.0..."
418,What Alcohol Does to Your Brain | Dr. Andrew H...,huberman,"- Because of the structure of alcohol, it is w...",8,short,0.021661,Title: what alcohol does to your brain dr andr...,"[0.058442254, -0.013256706, -0.023135036, -0.0...","[0.086912274, -0.11602823, -0.029870214, 0.036...","[0.08201410616434474, -0.12933055547196115, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.02955548117737999, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.034003542350124864, 0.0, 0.0, 0.0, 0.0..."
419,How Does Alcohol Increase the Risk of Cancer? ...,huberman,- One of the really but that's extremely well ...,9,short,0.025271,Title: how does alcohol increase the risk of c...,"[0.09300353, 0.029863423, -0.041747954, 0.0032...","[0.07066936, -0.011708548, -0.058251742, 0.013...","[0.05588628606690752, -0.020930629532909712, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [93]:
embedded_df.columns

Index(['title', 'host', 'transcript', 'duration_min', 'length',
       'length_scaled', 'metadata_text', 'metadata_embedding',
       'transcript_embedding_mean', 'transcript_embedding_weighted_mean',
       'metadata_embedding_TfidfVectorizer',
       'transcript_embedding_mean_TfidfVectorizer',
       'transcript_embedding_weighted_mean_TfidfVectorizer'],
      dtype='object')

In [101]:
TfidfVectorizer_embedded_df = embedded_df[['title', 'host', 'transcript', 'duration_min', 'length',
       'length_scaled', 'metadata_text', 
       'metadata_embedding_TfidfVectorizer',
       'transcript_embedding_weighted_mean_TfidfVectorizer']]

In [102]:
import pickle

# Save
with open("TfidfVectorizer_embedded_podcast_data.pkl", "wb") as f:
    pickle.dump(TfidfVectorizer_embedded_df, f)