<a href="https://colab.research.google.com/github/shaoyinguo-portfolio/CorpGenie-exp/blob/main/MeetingTranscripting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Audio File Download

In [1]:
import cv2
from google.colab import drive
from matplotlib import pyplot as plt
import numpy as np
# from google.colab.patches import cv2_imshow
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count
from pathlib import Path
from time import time

try:
    import gdown
except:
    !pip install gdown
    import gdown

In [2]:
# Prefer Google Drive for storage; fall back to a local ./data directory
# when the drive cannot be mounted.
try:
    drive.mount('/content/drive')
    data_path = Path('/content/drive/MyDrive/Colab Notebooks/data')
    print('Mounted Google Drive')
except Exception:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still stop the cell.
    data_path = Path('./data')
    print('Mounted local drive')

# parents=True creates missing intermediate folders (e.g. "Colab Notebooks");
# exist_ok=True makes the cell safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

# Locations of the inputs and outputs used by this notebook (kept as strings).
VIDEO_PATH = f'{data_path}/video.data'
AUDIO_PATH = f'{data_path}/audio.data'
TRANSCRIPT_PATH = f'{data_path}/transcripts.txt'
KEYFRAME_PATH = f'{data_path}/key_frames'

# gdown.download(url='https://drive.google.com/uc?id=1XfDxDUFQ2bSOCO0DzQtyH2Yzg9Ff0S-F', output=VIDEO_PATH, quiet=False)
# Fetch the audio file; as the last expression, the call's result is shown as the cell output.
gdown.download(url='https://drive.google.com/uc?id=1EhqRX_hnPeyc13Zimh11gdUnUIyCEhN6', output=AUDIO_PATH, quiet=False)

Mounted at /content/drive
Mounted Google Drive


Downloading...
From: https://drive.google.com/uc?id=1EhqRX_hnPeyc13Zimh11gdUnUIyCEhN6
To: /content/drive/MyDrive/Colab Notebooks/data/audio.data
100%|██████████| 22.2M/22.2M [00:02<00:00, 11.0MB/s]


'/content/drive/MyDrive/Colab Notebooks/data/audio.data'

## Transcribe Using OpenAI Whisper Locally

- Use a T4 GPU runtime to accelerate transcription.
- Whisper may get a lot of the terminology wrong, but those errors will be corrected later using the key frames and more powerful LLMs.

In [3]:
try:
    import whisper
except:
    !pip install -q openai-whisper
    import whisper


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [4]:
def transcript_audio(audio_path, transcript_path, model_name="small.en"):
    """Transcribe an audio file with Whisper and save one line per segment.

    Every segment returned by the model is written to ``transcript_path`` as
    ``[<start>] <text>``, where the start value is formatted with two decimals
    and the text has surrounding whitespace removed.

    Args:
        audio_path: Audio input, passed unchanged to ``model.transcribe``.
        transcript_path: Destination text file; it is overwritten on each call.
        model_name: Name of the Whisper model to load. Defaults to
            ``"small.en"``; switch to ``"base"`` if ``"small"`` causes memory
            issues.

    Returns:
        A list of ``(start, text)`` tuples, one per segment, in the order the
        model returned them. ``start`` is a float and ``text`` is stripped.
    """
    # The timer starts before the model is loaded, so the reported duration
    # covers loading, transcription and writing the file.
    started = time()
    model = whisper.load_model(model_name)
    print(f"Whisper model is loaded onto device: {model.device}. Start transcripting...")
    result = model.transcribe(audio_path, word_timestamps=True)

    all_lines = []
    # Write UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default locale encoding.
    with open(transcript_path, "w", encoding="utf-8") as f:
        for segment in result['segments']:
            text = segment['text'].strip()
            # Only the segment start is recorded: "[start] text".
            f.write(f"[{segment['start']:.2f}] {text}\n")
            all_lines.append((float(segment['start']), text))

    print(f"\nSaving complete. Took {int(time() - started)} seconds.")

    return all_lines

In [5]:
# Transcribe the downloaded audio and write the transcript file.
# Note: this is the expensive step; the recorded run was on CPU and took 4072 seconds.
all_lines = transcript_audio(AUDIO_PATH, TRANSCRIPT_PATH)
# Preview the first ten (start, text) entries as the cell output.
all_lines[:10]


100%|███████████████████████████████████████| 461M/461M [00:22<00:00, 21.7MiB/s]


Whisper model is loaded onto device: cpu. Start transcripting...





Saving complete. Took 4072 seconds.


[(8.959999999999997,
  "Today, I'm going to talk about TSMC and Intel, COVOS, EMIP, Furbils and Chiplets."),
 (17.54,
  'This is part of my class about introduction to packaging process technologies.'),
 (23.28,
  'As you know, TSMC is currently number one in semiconductor manufacturing.'),
 (31.08,
  'Their strength, not just in a high yield of 3 nanometers or even 2 nanometers,'),
 (38.38,
  'their packaging is also not just one of the best, but actually the best.'),
 (46.62, 'Among the three giants, TSMC, Samsung, and Intel.'),
 (52.04, 'And their process, of course, is more than COVOS.'),
 (56.08, 'They have their 3D fabrics and everything else.'),
 (61.02, "But today, I'm going to focus pretty much on COVOS only."),
 (64.96, "And then we'll explore the other arena when ready.")]