In [4]:
import pandas as pd

# Read the raw TED talks export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="ted_talks_en.csv")

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world’...,3523392,2006-02-25,2006-06-27,TED2006,en,"['ar', 'bg', 'cs', 'de', 'el', 'en', 'es', 'fa...",272.0,977,"['alternative energy', 'cars', 'climate change...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
1,92,The best stats you've ever seen,Hans Rosling,{0: 'Hans Rosling'},{0: ['global health expert; data visionary']},"{0: 'In Hans Rosling’s hands, data sings. Glob...",14501685,2006-02-22,2006-06-27,TED2006,en,"['ar', 'az', 'bg', 'bn', 'bs', 'cs', 'da', 'de...",628.0,1190,"['Africa', 'Asia', 'Google', 'demo', 'economic...","{2056: ""Own your body's data"", 2296: 'A visual...",https://www.ted.com/talks/hans_rosling_the_bes...,You've never seen data presented like this. Wi...,"About 10 years ago, I took on the task to teac..."
2,7,Simplicity sells,David Pogue,{0: 'David Pogue'},{0: ['technology columnist']},{0: 'David Pogue is the personal technology co...,1920832,2006-02-24,2006-06-27,TED2006,en,"['ar', 'bg', 'de', 'el', 'en', 'es', 'fa', 'fr...",124.0,1286,"['computers', 'entertainment', 'interface desi...","{1725: '10 top time-saving tech tips', 2274: '...",https://www.ted.com/talks/david_pogue_simplici...,New York Times columnist David Pogue takes aim...,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,53,Greening the ghetto,Majora Carter,{0: 'Majora Carter'},{0: ['activist for environmental justice']},{0: 'Majora Carter redefined the field of envi...,2664069,2006-02-26,2006-06-27,TED2006,en,"['ar', 'bg', 'bn', 'ca', 'cs', 'de', 'en', 'es...",219.0,1116,"['MacArthur grant', 'activism', 'business', 'c...",{1041: '3 stories of local eco-entrepreneurshi...,https://www.ted.com/talks/majora_carter_greeni...,"In an emotionally charged talk, MacArthur-winn...",If you're here today — and I'm very happy that...
4,66,Do schools kill creativity?,Sir Ken Robinson,{0: 'Sir Ken Robinson'},"{0: ['author', 'educator']}","{0: ""Creativity expert Sir Ken Robinson challe...",65051954,2006-02-25,2006-06-27,TED2006,en,"['af', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs...",4931.0,1164,"['children', 'creativity', 'culture', 'dance',...","{865: 'Bring on the learning revolution!', 173...",https://www.ted.com/talks/sir_ken_robinson_do_...,Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...


In [5]:

# Display the column labels of the loaded frame.
df.columns


Index(['talk_id', 'title', 'speaker_1', 'all_speakers', 'occupations',
       'about_speakers', 'views', 'recorded_date', 'published_date', 'event',
       'native_lang', 'available_lang', 'comments', 'duration', 'topics',
       'related_talks', 'url', 'description', 'transcript'],
      dtype='object')

In [6]:


def _strip_html(text: str) -> str:
    # Removing anything between < and > (simple tag stripper)
    out = []
    in_tag = False
    for ch in str(text):
        if ch == "<":
            in_tag = True
            continue
        if ch == ">" and in_tag:
            in_tag = False
            continue
        if not in_tag:
            out.append(ch)
    return "".join(out)

def _remove_square_brackets_content(text: str) -> str:
    # Remove content inside [ ... ] including nested levels
    out = []
    depth = 0
    for ch in text:
        if ch == "[":
            depth += 1
            continue
        if ch == "]" and depth > 0:
            depth -= 1
            continue
        if depth == 0:
            out.append(ch)
    return "".join(out)

def clean_text(s: str) -> str:
    """Return a cleaned, single-spaced version of ``s``.

    Missing values (as detected by ``pd.isna``) become the empty string;
    anything else is converted with ``str()``. The text is then passed
    through ``_strip_html`` and ``_remove_square_brackets_content``, and all
    runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(s):
        raw = ""
    else:
        raw = str(s)

    without_tags = _strip_html(raw)
    without_brackets = _remove_square_brackets_content(without_tags)

    # split() with no arguments breaks on any whitespace run and drops
    # leading/trailing whitespace, so the join yields single-spaced text.
    collapsed = " ".join(without_brackets.split())
    return collapsed.strip()


In [8]:
transcript_col = "transcript"
language_col = "native_lang"

# Apply cleaning to every transcript.
# The values are passed to clean_text as-is: it maps missing values to ""
# and str()-converts everything else itself. Casting with astype(str) first
# can turn NaN into the literal text "nan", which would bypass that guard.
df[transcript_col] = df[transcript_col].apply(clean_text)



In [9]:
# Keep only rows whose transcript is longer than 5 characters
# (removes empty and near-empty transcripts).
df = df.loc[df[transcript_col].str.len().gt(5)].copy()

# Restrict to the columns of interest, skipping any that are not present.
keep_cols = [
    name
    for name in ("title", "speaker_1", "url", transcript_col)
    if name in df.columns
]
df_clean = df[keep_cols].copy()


In [10]:
# Standardize column names to "transcript" and "language".
# rename() returns a new frame, so the result is reassigned instead of using
# inplace=True. The two renames stay sequential so the membership check
# below sees the columns as they are after the first rename.
if transcript_col != "transcript":
    df_clean = df_clean.rename(columns={transcript_col: "transcript"})

# The language column is renamed only when it is actually present in df_clean.
if language_col and language_col != "language" and language_col in df_clean.columns:
    df_clean = df_clean.rename(columns={language_col: "language"})


In [12]:
# Preview the first rows of the cleaned frame.
df_clean.head()

Unnamed: 0,title,speaker_1,url,transcript
0,Averting the climate crisis,Al Gore,https://www.ted.com/talks/al_gore_averting_the...,"Thank you so much, Chris. And it's truly a gre..."
1,The best stats you've ever seen,Hans Rosling,https://www.ted.com/talks/hans_rosling_the_bes...,"About 10 years ago, I took on the task to teac..."
2,Simplicity sells,David Pogue,https://www.ted.com/talks/david_pogue_simplici...,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,Greening the ghetto,Majora Carter,https://www.ted.com/talks/majora_carter_greeni...,If you're here today — and I'm very happy that...
4,Do schools kill creativity?,Sir Ken Robinson,https://www.ted.com/talks/sir_ken_robinson_do_...,Good morning. How are you? (Audience) Good. It...


In [13]:
# Write the cleaned frame to disk as UTF-8 CSV, without the index column,
# then report how many rows it contains.
df_clean.to_csv(
    path_or_buf="ted_talks_clean.csv",
    index=False,
    encoding="utf-8",
)
print(f"Saved cleaned dataset with {len(df_clean)} rows.")

Saved cleaned dataset with 4005 rows.
