In [177]:
from openai import OpenAI
import os
import math
import base64
from pydub import AudioSegment

from dotenv import load_dotenv

# Load variables from a .env file into the process environment before reading the key.
load_dotenv()
# The key is read from the environment rather than hard-coded; os.getenv yields None when API_KEY is unset.
api_key = os.getenv("API_KEY")
# Single shared client; the transcription and chat-completion cells below all go through it.
client = OpenAI(api_key=api_key)

In [178]:
# Notebook configuration: everything a reader might want to tune lives here.

# Upper bound on the size of each chunk file, in bytes (20 MiB).
# NOTE(review): presumably chosen to stay under the transcription API's upload limit - confirm.
chunk_size_bytes = 20 * 1024 ** 2

# Source audio file to split and transcribe.
file_path = "tedx.mp3"

# Directory that receives the chunk files.
output_folder = "chunks"

# Placeholder until the chunking cell runs and stores the real count.
num_chunks = -1

In [179]:
def chunk_file_by_size(file_name, chunk_size_bytes, output_folder):
    """Split a file into consecutive fixed-size byte chunks on disk.

    Chunks are written to ``output_folder`` as ``<stem>_chunk<i>.mp3``, where
    ``<stem>`` is the base name of ``file_name`` without its extension and
    ``i`` counts up from 0. Every chunk holds ``chunk_size_bytes`` bytes except
    possibly the last one, which holds whatever remains.

    NOTE(review): the split is made on raw byte offsets, so a cut can fall at
    any position in the audio data. Confirm the downstream consumer tolerates
    chunks that do not start on an audio frame boundary.

    Args:
        file_name: Path of the file to split.
        chunk_size_bytes: Maximum size of each chunk in bytes; must be > 0.
        output_folder: Directory for the chunk files; created if missing.

    Returns:
        The number of chunk files written (0 for an empty input file).

    Raises:
        ValueError: If ``chunk_size_bytes`` is not positive. Previously a
            negative size silently returned a negative count and zero raised
            ZeroDivisionError.
    """
    if chunk_size_bytes <= 0:
        raise ValueError(
            f"chunk_size_bytes must be a positive number of bytes, got {chunk_size_bytes!r}"
        )

    os.makedirs(output_folder, exist_ok=True)

    # The stem does not change between chunks, so compute it once.
    stem = os.path.splitext(os.path.basename(file_name))[0]

    num_chunks = 0
    with open(file_name, 'rb') as file:
        while True:
            chunk = file.read(chunk_size_bytes)
            if not chunk:
                # End of file: the count reflects exactly what was written.
                break
            chunk_file_name = os.path.join(output_folder, f"{stem}_chunk{num_chunks}.mp3")
            with open(chunk_file_name, 'wb') as chunk_file:
                chunk_file.write(chunk)
            num_chunks += 1

    return num_chunks

In [180]:
# Split the source audio into size-bounded pieces and keep the count for the transcription loop.
num_chunks = chunk_file_by_size(
    file_name=file_path,
    chunk_size_bytes=chunk_size_bytes,
    output_folder=output_folder,
)

In [181]:
def transcribe_chunk(chunk_file):
    """Upload one audio chunk for transcription and return the API response.

    Args:
        chunk_file: Path of the audio file to send; it is opened in binary mode.

    Returns:
        Whatever ``client.audio.transcriptions.create`` returns for a
        ``verbose_json`` request with word-level timestamp granularity.
    """
    request_options = {
        "model": "whisper-1",
        "response_format": "verbose_json",
        "timestamp_granularities": ["word"],
    }
    # The file handle only needs to live for the duration of the upload call.
    with open(chunk_file, "rb") as audio_file:
        return client.audio.transcriptions.create(file=audio_file, **request_options)

In [182]:
def get_transcription(file_path, num_chunks, chunks_folder=None):
    """Transcribe every chunk of ``file_path`` in order and clean up the chunk files.

    Chunk files are expected at ``<cwd>/<chunks_folder>/<stem>_chunk<i>.mp3``
    for ``i`` in ``range(num_chunks)``, where ``<stem>`` is the base name of
    ``file_path`` without its extension.

    Args:
        file_path: Path of the original (unsplit) file; only its base name is
            used, to rebuild the chunk file names.
        num_chunks: Number of chunk files to process. A value <= 0 processes
            nothing and returns an empty list.
        chunks_folder: Directory holding the chunk files. Defaults to the
            notebook-level ``output_folder`` so existing two-argument calls
            behave as before.

    Returns:
        A list with one ``transcribe_chunk`` result per chunk, in chunk order.
    """
    if chunks_folder is None:
        # Fall back to the notebook global only when the caller did not say where the chunks are.
        chunks_folder = output_folder

    stem = os.path.splitext(os.path.basename(file_path))[0]
    chunk_paths = [
        os.path.join(os.getcwd(), chunks_folder, f"{stem}_chunk{i}.mp3")
        for i in range(num_chunks)
    ]

    transcriptions = [transcribe_chunk(path) for path in chunk_paths]

    # Remove the chunks only after every one was transcribed, so a failure
    # part-way through leaves all chunk files in place and the call can be retried.
    for path in chunk_paths:
        os.remove(path)

    return transcriptions

In [183]:
import time
# Time the full transcription pass; chunks are processed one after another inside get_transcription.
start_time = time.perf_counter()

# One entry per chunk, in chunk order. Later cells read transcriptions[0].
transcriptions = get_transcription(file_path, num_chunks)

end_time = time.perf_counter()

# Elapsed seconds between the two perf_counter readings.
time_spent = end_time - start_time
print(f"The line of code took {time_spent:.6f} seconds to execute.")

The line of code took 42.955328 seconds to execute.


In [184]:
# Rough word count of the first chunk's transcript: splits on single spaces only, so other whitespace does not separate words.
print(len(transcriptions[0].text.split(" ")))

2104


In [185]:
# from moviepy.editor import VideoFileClip

# def convert_mp4_to_mp3(mp4_file_path, mp3_file_path):
#     video_clip = VideoFileClip(mp4_file_path)
#     audio_clip = video_clip.audio
#     audio_clip.write_audiofile(mp3_file_path)


In [186]:
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` encodes to under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        The length of the encoded token sequence.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

# Token count of the first chunk's transcript under cl100k_base; shown as the cell's output.
num_tokens_from_string(transcriptions[0].text, "cl100k_base")


2556

In [187]:
# transcriptions[0].text

In [188]:
# Ask the chat model to summarize the first chunk's transcript.
# NOTE(review): the system prompt asks for JSON, but JSON mode (response_format)
# is left disabled below, so the reply's shape depends on the prompt alone.
response = client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    # response_format={"type":"json_object"},
    messages=[
        {"role":"system", "content":"You are a helpful assistant designed to output json."},
        # Blank line between the instruction and the transcript so the two do not run together.
        {"role":"user", "content":"Summarize the following transcription.\n\n" + transcriptions[0].text},
    ]
)

In [189]:
# Show the text of the first choice in the reply.
print(response.choices[0].message.content)

{
  "summary": "The transcription is a recount of a conversation with a student who learned a valuable lesson at a Vietnamese monastery about finding meaning in work. It touches on the modern challenge of finding purpose in work and proposes a new approach to understanding and cultivating purpose. It suggests that purpose is not a fixed concept but a dynamic intersection of personal interests, skills, and societal demands. The importance of continuous learning, discovery, and impact in work is emphasized as key components of fulfilling one's purpose.",
  "main_ideas": [
    "Learning from a story at a Vietnamese monastery about finding meaning in work",
    "Modern challenge of finding purpose in work",
    "Proposing a new approach to understanding and cultivating purpose",
    "Purpose as a dynamic intersection of personal interests, skills, and societal demands",
    "Importance of continuous learning, discovery, and impact in work"
  ]
}


In [192]:
# Second attempt: a plain-text summary with an explicit length target instead of JSON.
summary_messages = [
    {"role": "system", "content": "You are a helpful text summarizer assistant."},
    {"role": "user", "content": "Summarize this in about 30 lines: " + transcriptions[0].text},
]
response = client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    # response_format={ "type": "json_object" },
    messages=summary_messages,
)
# Show the text of the first choice in the reply.
print(response.choices[0].message.content)

At a party in San Francisco, a student shares a Zen story about moving dirt with no clear purpose at a Vietnamese monastery. Feeling disillusioned with his own work, he learns from a fellow apprentice the value of finding meaning in seemingly mundane tasks. The student's realization sparks a journey to help others find purpose in their work. Despite societal emphasis on purpose-driven work, many people still feel disconnected and unfulfilled in their jobs. Traditional approaches like one-day workshops based on bestselling books fail to address individual complexities and changing circumstances. A new perspective emphasizes self-impact fit, aligning personal desires, abilities, market demands, and external opportunities. Recognizing the human need for impact and contribution, the narrative explores how individuals can find fulfillment by embracing curiosity, developing skills, and making small discoveries in their work. By shifting focus from grand achievements to incremental progress a