In [1]:
import os
import re

## Data Cleaning

In [2]:
NO_TIMESTAMP_FILES = ['episode1', 'episode2', 'episode3']

In [3]:
# Read a file from the data/transcripts directory which is one level up from the current directory and return a list of lines
def read_file_lines(filename):
    with open(os.path.join('..', 'data', 'transcripts', filename), 'r') as f:
        return f.readlines()

In [4]:
def extract_episode_info(lines):
    # Extract the episode number and title from the 13th line of the transcript
    lines[12] = lines[12].replace(' – ', ' - ')
    episode_info = lines[12].split(' - ')
    episode_number = episode_info[0].split(' | ')[-1].strip()
    episode_title = episode_info[1].strip()

    return episode_number, episode_title

In [5]:
def extract_line_metadata(line):
    # Extract the speaker and timestamp from the line
    # Check if the timestamp is present like [00:00:00]
    if not re.search(r'\[\d{2}:\d{2}:\d{2}\]', line):
        line_metadata = line.split(':')
        speaker = line_metadata[0].strip()
        timestamp = None
        text = line_metadata[1].strip()
    else:
        # Check if line starts with a timestamp like [timestamp] speaker text
        if re.search(r'^\[\d{2}:\d{2}:\d{2}\]', line):
            line_metadata = line.split(']')
            timestamp = line_metadata[0].strip().replace('[', '')
            # First word of the line is the speaker
            speaker = line_metadata[1].strip().split(' ')[0]
            # Rest of the line is the text
            text = ' '.join(line_metadata[1].strip().split(' ')[1:])
        else:
            # Example line: speaker [timestamp] text
            line_metadata = line.split('[')
            speaker = line_metadata[0].strip()
            timestamp = line_metadata[1].split(']')[0].strip()
            text = line_metadata[1].split(']')[1].strip()

    return speaker, timestamp, text

In [6]:
def clean_and_extract_data(filename):
    lines = read_file_lines(filename)

    # Extract the episode number and title 
    episode_number, episode_title = extract_episode_info(lines)

    # Remove the first 12 lines which are not part of the transcript
    lines = lines[12:]

    # Remove the new line character from each line and remove empty lines
    lines = [line.strip() for line in lines if line.strip()]

    # Remove lines that are "COMMERCIAL BREAK" or "BREAK" or "-"
    lines = [line for line in lines if line != 'COMMERCIAL BREAK' and line != 'BREAK' and line != '-']
    
    # Find the index of the first line that contains the phrase "We're the Office Ladies" as a substring
    for i, line in enumerate(lines):
        if "we're the office ladies" in line.replace('"', '').lower():
            index = i
            break

    # Remove all lines before and including the line that contains the phrase "We're the Office Ladies"
    lines = lines[index + 1:]

    # Find the index of the last line that contains the phrase "Thank you for listening to Office Ladies" as a substring and remove all lines after that line (including the line that contains the phrase)
    for i, line in enumerate(lines):
        if "thank you for listening to office ladies" in line.replace('"', '').lower():
            index = i

    lines = lines[:index]

    # Extract the speaker, timestamp, and text from each line and store the data in a list of disctionaries
    lines_data = []
    for line in lines:
        speaker, timestamp, text = extract_line_metadata(line)
        lines_data.append({
            'speaker': speaker,
            'timestamp': timestamp,
            'text': text
        })

    # Create a dictionary with the episode number, title, and lines data
    episode_data = {
        'episode_number': episode_number,
        'episode_title': episode_title,
        'lines': lines_data
    }

    return episode_data

In [7]:
all_episodes_data = []
for i in range(1, 161):
    filename = f'episode{i}'
    print(filename)
    episode_data = clean_and_extract_data(filename)
    all_episodes_data.append(episode_data)

episode1
episode2
episode3
episode4
episode5
episode6
episode7
episode8
episode9
episode10
episode11
episode12
episode13
episode14
episode15
episode16
episode17
episode18
episode19
episode20
episode21
episode22
episode23
episode24
episode25
episode26
episode27
episode28
episode29
episode30
episode31
episode32
episode33
episode34
episode35
episode36
episode37
episode38
episode39
episode40
episode41
episode42
episode43
episode44
episode45
episode46
episode47
episode48
episode49
episode50
episode51
episode52
episode53
episode54
episode55
episode56
episode57
episode58
episode59
episode60
episode61
episode62
episode63
episode64
episode65
episode66
episode67
episode68
episode69
episode70
episode71
episode72
episode73
episode74
episode75
episode76
episode77
episode78
episode79
episode80
episode81
episode82
episode83
episode84
episode85
episode86
episode87
episode88
episode89
episode90
episode91
episode92
episode93
episode94
episode95
episode96
episode97
episode98
episode99
episode100
episode1

In [8]:
all_episodes_data[65]

{'episode_number': 'EPISODE 66',
 'episode_title': 'BRANCH WARS',
 'lines': [{'speaker': 'Angela',
   'timestamp': '00:00:30',
   'text': 'Fakey Stache Day on Office Ladies.'},
  {'speaker': 'Jenna',
   'timestamp': '00:00:33',
   'text': "It's Season four, Episode 10, written by Mindy Kaling and directed by Joss Whedon. There's so much good fake stache. Stache is lingo for Mustache."},
  {'speaker': 'Angela',
   'timestamp': '00:00:46',
   'text': 'Oh Jenna, thank you. Thank you for breaking that down for us.'},
  {'speaker': 'Jenna',
   'timestamp': '00:00:48',
   'text': "I wanted to break it down. I break down everything on Office Ladies. It's what we do. I'm going to hit you with a summary. Guys, Karen is back. She is now the regional manager of the Utica branch and she offers Stanley a higher paying job to leave Scranton. Ooh."},
  {'speaker': 'Angela', 'timestamp': '00:01:06', 'text': 'Oof.'},
  {'speaker': 'Jenna',
   'timestamp': '00:01:07',
   'text': "Guess what Michael's re

In [9]:
all_episodes_data[0]

{'episode_number': 'EPISODE 1',
 'episode_title': 'THE PILOT',
 'lines': [{'speaker': 'ANGELA',
   'timestamp': None,
   'text': 'Hi, this is Angela Kinsey.'},
  {'speaker': 'JENNA',
   'timestamp': None,
   'text': 'And this is our very first Office Ladies podcast.'},
  {'speaker': 'ANGELA', 'timestamp': None, 'text': 'I am so excited, Jenna!'},
  {'speaker': 'JENNA',
   'timestamp': None,
   'text': "I am so excited. I'm a little bit freaking out."},
  {'speaker': 'ANGELA',
   'timestamp': None,
   'text': "I'm freaking out. We hope you guys like it."},
  {'speaker': 'JENNA',
   'timestamp': None,
   'text': "We have been working really hard. This is a dream we've had for years of getting to work together again."},
  {'speaker': 'ANGELA', 'timestamp': None, 'text': 'Yes.'},
  {'speaker': 'JENNA',
   'timestamp': None,
   'text': 'Because we became best friends while working on The Office together.'},
  {'speaker': 'ANGELA',
   'timestamp': None,
   'text': 'We did. We ate lunch toget

## Embedding Data

In [10]:
from tqdm.auto import tqdm

In [11]:
def create_segments(episode_data):
    window = 6  # number of lines to combine
    stride = 3  # number of lines to stride over, used to create overlap

    lines = episode_data['lines']

    transcript_segments = []
    for i in tqdm(range(0, len(lines), stride)):
        i_end = min(i + window, len(lines)-1)
        text = ' '.join(line['text'] for line in lines[i:i_end+1])

        start = lines[i]['timestamp']
        end = lines[i_end]['timestamp']
        
        transcript_segments.append({
            'start_time': start,
            'end_time': end,
            'episode_title': episode_data['episode_title'],
            'episode_number': episode_data['episode_number'],
            'text': text,
            'id': f"{episode_data['episode_number'].replace(' ', '_')}-{i}-{i_end}"
        })

    return transcript_segments

In [12]:
all_episodes_segments = []
for episode_data in all_episodes_data:
    episode_segments = create_segments(episode_data)
    all_episodes_segments.append(episode_segments)

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/183 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/128 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/287 [00:00<?, ?it/s]

  0%|          | 0/207 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/227 [00:00<?, ?it/s]

  0%|          | 0/298 [00:00<?, ?it/s]

  0%|          | 0/317 [00:00<?, ?it/s]

  0%|          | 0/268 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/445 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/309 [00:00<?, ?it/s]

  0%|          | 0/287 [00:00<?, ?it/s]

  0%|          | 0/365 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/354 [00:00<?, ?it/s]

  0%|          | 0/174 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

  0%|          | 0/170 [00:00<?, ?it/s]

  0%|          | 0/216 [00:00<?, ?it/s]

  0%|          | 0/190 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/248 [00:00<?, ?it/s]

  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/249 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

  0%|          | 0/217 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/261 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

  0%|          | 0/147 [00:00<?, ?it/s]

  0%|          | 0/197 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/121 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

  0%|          | 0/123 [00:00<?, ?it/s]

  0%|          | 0/198 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

  0%|          | 0/213 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/191 [00:00<?, ?it/s]

  0%|          | 0/158 [00:00<?, ?it/s]

  0%|          | 0/234 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/172 [00:00<?, ?it/s]

  0%|          | 0/178 [00:00<?, ?it/s]

  0%|          | 0/172 [00:00<?, ?it/s]

  0%|          | 0/232 [00:00<?, ?it/s]

  0%|          | 0/279 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/174 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/172 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

  0%|          | 0/178 [00:00<?, ?it/s]

  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/268 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/124 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/246 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/197 [00:00<?, ?it/s]

  0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

  0%|          | 0/283 [00:00<?, ?it/s]

  0%|          | 0/232 [00:00<?, ?it/s]

  0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

  0%|          | 0/248 [00:00<?, ?it/s]

  0%|          | 0/230 [00:00<?, ?it/s]

  0%|          | 0/183 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/217 [00:00<?, ?it/s]

  0%|          | 0/305 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/198 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/186 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

In [13]:
all_episodes_segments[0]

[{'start_time': None,
  'end_time': None,
  'episode_title': 'THE PILOT',
  'episode_number': 'EPISODE 1',
  'text': "Hi, this is Angela Kinsey. And this is our very first Office Ladies podcast. I am so excited, Jenna! I am so excited. I'm a little bit freaking out. I'm freaking out. We hope you guys like it. We have been working really hard. This is a dream we've had for years of getting to work together again. Yes.",
  'id': 'EPISODE_1-0-6'},
 {'start_time': None,
  'end_time': None,
  'episode_title': 'THE PILOT',
  'episode_number': 'EPISODE 1',
  'text': "I am so excited. I'm a little bit freaking out. I'm freaking out. We hope you guys like it. We have been working really hard. This is a dream we've had for years of getting to work together again. Yes. Because we became best friends while working on The Office together. We did. We ate lunch together every single day. Every single day. And I think the hardest part for me about the show ending was not getting to see you everyday, A

In [14]:
all_episodes_segments[100]

[{'start_time': '00:00:25',
  'end_time': '00:00:43',
  'episode_title': 'BROKE',
  'episode_number': 'EPISODE 101',
  'text': "Good morning, everyone. I always say good morning. It's cuz we're here in the morning. Cuz we're here in the morning. Every time I do it, Jenna gives me this look like, Angela, it might be morning for everyone else. Don't stop. Don't stop saying good morning. Should we talk about the episode we're going to be breaking down today? I would love nothing more. I found this episode hilarious.",
  'id': 'EPISODE_101-0-6'},
 {'start_time': '00:00:37',
  'end_time': '00:00:51',
  'episode_title': 'BROKE',
  'episode_number': 'EPISODE 101',
  'text': "Don't stop. Don't stop saying good morning. Should we talk about the episode we're going to be breaking down today? I would love nothing more. I found this episode hilarious. This snuck up on me. Yes, it was a sneaker upper. I absolutely loved it. You know, you look back and you think about The Office and all the like sor

In [15]:
from sentence_transformers import SentenceTransformer

In [16]:
model_id = "multi-qa-MiniLM-L6-cos-v1"
sentence_transformer_model = SentenceTransformer(model_id)

In [17]:
sentence_transformer_model.get_sentence_embedding_dimension()

384

In [18]:
batch_text = [segment['text'] for segment in all_episodes_segments[0]]

In [19]:
len(batch_text)

154

In [20]:
batch_text[0]

"Hi, this is Angela Kinsey. And this is our very first Office Ladies podcast. I am so excited, Jenna! I am so excited. I'm a little bit freaking out. I'm freaking out. We hope you guys like it. We have been working really hard. This is a dream we've had for years of getting to work together again. Yes."

In [21]:
batch_embeddings = sentence_transformer_model.encode(batch_text, show_progress_bar=True).tolist()

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
len(batch_embeddings)

154

In [23]:
len(batch_embeddings[0])

384