In [1]:
import os
import pandas as pd
import webvtt
import re
from io import StringIO
import csv

In [2]:
vtt_folder = "vtt"

In [3]:
def read_vtt_file(filepath):
    for encoding in ['utf-8', 'latin-1', 'ISO-8859-1', 'cp1252']:
        try:
            with open(filepath, encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Unable to decode file {filepath} with any of the tested encodings.")


In [5]:
results = []


for filename in os.listdir(vtt_folder):
    if filename.endswith(".vtt"):
        vtt_path = os.path.join(vtt_folder, filename)

        try:
            # read vtt file content
            vtt_content = read_vtt_file(vtt_path)

            vtt_io = StringIO(vtt_content)
            captions = webvtt.read_buffer(vtt_io)
            data = []
            for caption in captions:
                start_time = caption.start
                end_time = caption.end
                text = caption.text.replace('\n', ' ')
                data.append([start_time, end_time, text])

            # create df from data
            vtt_df = pd.DataFrame(data, columns=['StartTime', 'EndTime', 'Text'])

            # concatenate and clean transcript text
            transcript = ' '.join(vtt_df['Text'])
            transcript = re.sub(' +', ' ', transcript).strip()

            # Calculate duration
            start_time = pd.to_timedelta(vtt_df['StartTime'].iloc[0])
            end_time = pd.to_timedelta(vtt_df['EndTime'].iloc[-1])
            duration = end_time - start_time
            hours, remainder = divmod(duration.seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            formatted_duration = "{:02}:{:02}:{:02}".format(hours, minutes, seconds)

            results.append({
                "file": filename,
                "transcript": transcript,
                "duration": formatted_duration
            })

            results_df = pd.DataFrame(results)

        except ValueError as e:
            print(e)
            continue



In [6]:
results_df.to_csv("tmp.csv")

In [7]:
pattern = r'episode_(\d+)'

results_df['episode number'] = results_df['file'].apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else None)

results_df['episode number'] = results_df['episode number'].astype(int)

results_df

Unnamed: 0,file,transcript,duration,episode number
0,episode_311_large.vtt,you have to have the free markets in order to ...,03:33:11,311
1,episode_319_small.vtt,"I mean, I've definitely experienced moments wh...",02:35:48,319
2,episode_297_small.vtt,We have two tubes that are right next to each ...,02:43:50,297
3,episode_274_small.vtt,Where are the darkest places you've ever gone ...,03:21:41,274
4,episode_077_small.vtt,The following is a conversation with Alex Garl...,01:10:59,77
...,...,...,...,...
631,episode_169_small.vtt,The following is a conversation with Ryan Hall...,02:53:38,169
632,episode_076_large.vtt,The following is a conversation with John Hopf...,01:12:40,76
633,episode_103_small.vtt,The following is a conversation with Ben Gertz...,04:09:13,103
634,episode_306_small.vtt,at which point is the neural network a being v...,02:10:17,306


In [None]:
# title_df = pd.read_excel('title.xlsx', header=None)

In [8]:
title_df = pd.read_table('./title.xlsx', names=['title'])

title_df = title_df.to_numpy()

print(title_df)

[['1 Max Tegmark: Life 3.0 | Lex Fridman Podcast #1']
 ['2 Christof Koch: Consciousness | Lex Fridman Podcast #2']
 ['3 Steven Pinker: AI in the Age of Reason | Lex Fridman Podcast #3']
 ['4 Yoshua Bengio: Deep Learning | Lex Fridman Podcast #4']
 ['5 Vladimir Vapnik: Statistical Learning | Lex Fridman Podcast #5']
 ['6 Guido van Rossum: Python | Lex Fridman Podcast #6']
 ['7 Jeff Atwood: Stack Overflow and Coding Horror | Lex Fridman Podcast #7']
 ['8 Eric Schmidt: Google | Lex Fridman Podcast #8']
 ['9 Stuart Russell: Long-Term Future of Artificial Intelligence | Lex Fridman Podcast #9']
 ['10 Pieter Abbeel: Deep Reinforcement Learning | Lex Fridman Podcast #10']
 ['11 Juergen Schmidhuber: Godel Machines, Meta-Learning, and LSTMs | Lex Fridman Podcast #11']
 ['12 Tuomas Sandholm: Poker and Game Theory | Lex Fridman Podcast #12']
 ['13 Tomaso Poggio: Brains, Minds, and Machines | Lex Fridman Podcast #13']
 ['14 Kyle Vogt: Cruise Automation | Lex Fridman Podcast #14']
 ['15 Leslie Kael

In [9]:
episode_indices = []
guests = []
episode_names = []
host_names = []
episode_numbers = []


pattern = r'(\d+)\s(.*?):\s(.*?)\|\s(Lex Fridman)(?:\sPodcast)?\s#?(\d*)'


# Parse each row in the df
# for title in title_df[0]:
for title in title_df:
    title = title[0]
    print(title)
    
    match = re.match(pattern, title)
    if match:
        episode_index = int(match.group(1))
        guest = match.group(2).strip()
        episode_name = match.group(3).strip()
        host_name = match.group(4).strip()  # Excludes "Podcast"
        episode_number = int(match.group(5)) if match.group(5) else episode_index

        episode_indices.append(episode_index)
        guests.append(guest)
        episode_names.append(episode_name)
        host_names.append(host_name)
        episode_numbers.append(episode_number)

parsed_title_df = pd.DataFrame({
    'episode index': episode_indices,
    'guest': guests,
    'episode name': episode_names,
    'host name': host_names,
    'episode number': episode_numbers
})

1 Max Tegmark: Life 3.0 | Lex Fridman Podcast #1
2 Christof Koch: Consciousness | Lex Fridman Podcast #2
3 Steven Pinker: AI in the Age of Reason | Lex Fridman Podcast #3
4 Yoshua Bengio: Deep Learning | Lex Fridman Podcast #4
5 Vladimir Vapnik: Statistical Learning | Lex Fridman Podcast #5
6 Guido van Rossum: Python | Lex Fridman Podcast #6
7 Jeff Atwood: Stack Overflow and Coding Horror | Lex Fridman Podcast #7
8 Eric Schmidt: Google | Lex Fridman Podcast #8
9 Stuart Russell: Long-Term Future of Artificial Intelligence | Lex Fridman Podcast #9
10 Pieter Abbeel: Deep Reinforcement Learning | Lex Fridman Podcast #10
11 Juergen Schmidhuber: Godel Machines, Meta-Learning, and LSTMs | Lex Fridman Podcast #11
12 Tuomas Sandholm: Poker and Game Theory | Lex Fridman Podcast #12
13 Tomaso Poggio: Brains, Minds, and Machines | Lex Fridman Podcast #13
14 Kyle Vogt: Cruise Automation | Lex Fridman Podcast #14
15 Leslie Kaelbling: Reinforcement Learning, Planning, and Robotics | Lex Fridman Podca

In [10]:
parsed_title_df.dtypes

episode index      int64
guest             object
episode name      object
host name         object
episode number     int64
dtype: object

In [11]:
parsed_title_df

# 100, 268, 282, 283, 287, 291 missing,
# 165 missing Podcast AFTER Lex Fridman

Unnamed: 0,episode index,guest,episode name,host name,episode number
0,1,Max Tegmark,Life 3.0,Lex Fridman,1
1,2,Christof Koch,Consciousness,Lex Fridman,2
2,3,Steven Pinker,AI in the Age of Reason,Lex Fridman,3
3,4,Yoshua Bengio,Deep Learning,Lex Fridman,4
4,5,Vladimir Vapnik,Statistical Learning,Lex Fridman,5
...,...,...,...,...,...
314,321,Ray Kurzweil,"Singularity, Superintelligence, and Immortality",Lex Fridman,321
315,322,Rana el Kaliouby,"Emotion AI, Social Robots, and Self-Driving Cars",Lex Fridman,322
316,323,Will Sasso,"Comedy, MADtv, AI, Friendship, Madness, and Pr...",Lex Fridman,323
317,324,Daniel Negreanu,Poker,Lex Fridman,324


In [12]:
final_df = pd.merge(parsed_title_df, results_df, on='episode number')

final_df = final_df[['episode index', 'guest', 'episode name', 'host name', 'episode number', 'file', 'transcript', 'duration']]
final_df

Unnamed: 0,episode index,guest,episode name,host name,episode number,file,transcript,duration
0,1,Max Tegmark,Life 3.0,Lex Fridman,1,episode_001_large.vtt,"As part of MIT course 6S099, Artificial Genera...",01:22:40
1,1,Max Tegmark,Life 3.0,Lex Fridman,1,episode_001_small.vtt,As part of MIT course 6S 099 Artificial Genera...,01:23:00
2,2,Christof Koch,Consciousness,Lex Fridman,2,episode_002_large.vtt,As part of MIT course 6S099 on artificial gene...,00:57:55
3,2,Christof Koch,Consciousness,Lex Fridman,2,episode_002_small.vtt,As part of MIT course 6S099 on artificial gene...,00:57:56
4,3,Steven Pinker,AI in the Age of Reason,Lex Fridman,3,episode_003_large.vtt,"You've studied the human mind, cognition, lang...",00:37:33
...,...,...,...,...,...,...,...,...
631,322,Rana el Kaliouby,"Emotion AI, Social Robots, and Self-Driving Cars",Lex Fridman,322,episode_322_small.vtt,"there's a broader question here, right? As we ...",02:36:23
632,323,Will Sasso,"Comedy, MADtv, AI, Friendship, Madness, and Pr...",Lex Fridman,323,episode_323_small.vtt,Once this whole thing falls apart and we are c...,02:21:49
633,323,Will Sasso,"Comedy, MADtv, AI, Friendship, Madness, and Pr...",Lex Fridman,323,episode_323_large.vtt,Once this whole thing falls apart and we are c...,02:22:02
634,324,Daniel Negreanu,Poker,Lex Fridman,324,episode_324_large.vtt,you could be the seventh best player in the wh...,02:21:40


In [13]:
final_df.to_csv('vtt_data.csv', sep="|", index=False, quoting=csv.QUOTE_ALL)

In [None]:
# output_file = 'vtt_data.txt'

# with open(output_file, 'w', encoding='utf-8') as file:
#     # Write the columns
#     file.write('\t'.join(final_df.columns) + '\n')

#     # Write the rows
#     for index, row in final_df.iterrows():
#         file.write('\t'.join(str(value) for value in row) + '\n')
