# Imports & Globals

In [94]:
import os
import replicate
import string
import random
import json
import instructor
from pytube import YouTube
from typing_extensions import Annotated
from typing import List, Optional, Dict
from pydantic import Field, BaseModel, AfterValidator
from google.cloud import storage
from openai import OpenAI
from dotenv import load_dotenv
from nltk import tokenize

# Load settings from a .env file into the process environment
# (see .env.example for the variables this notebook expects).
load_dotenv() 
# Google Cloud Storage client, used by upload_to_bucket.
client = storage.Client()
# OpenAI client patched by instructor; get_quotes calls it with a response_model.
o_client = instructor.patch(OpenAI())
# Directory (relative path) that transcript, audio and video files are written to.
data_dir = "data"

# Functions

In [82]:
alphabet = string.ascii_lowercase + string.digits


def random_choice():
    '''
    Build a random 8-character string drawn (with replacement) from
    lowercase ASCII letters and digits.
    '''
    picked = random.choices(alphabet, k=8)
    return ''.join(picked)

In [83]:
def upload_to_bucket(blob_name, file_path):
    '''
    Push a local file to the Google Cloud Storage bucket named by the
    GCS_BUCKET environment variable.
        Input: blob_name: String, name to give the Blob on GCS
               file_path: String, path of the local file to upload
        Output: String, the public URL of the uploaded blob
    '''
    bucket_name = os.getenv('GCS_BUCKET')
    target = client.get_bucket(bucket_name).blob(blob_name)
    target.upload_from_filename(file_path)
    return target.public_url

In [84]:
def get_transcript(url, fname):
    '''
    Replicate Deployment call to fetch transcript.
    Writes out the transcript JSON and text to files under data_dir.
        Input: url: String, of the uploaded blob
               fname: String, base filename (no extension); the outputs are
                      "<fname>.json" and "<fname>.txt"
        Output: out: the value returned by replicate.run; it is iterated as a
                     sequence of dicts that each carry a 'text' key
                concatenated_text: String, every segment's text joined by spaces
    '''
    out = replicate.run(
        "daanelson/whisperx:9aa6ecadd30610b81119fc1b6807302fd18ca6cbb39b3216f430dcf23618cedd",
        input={"audio": url, "batch_size": 16, "align_output": True}
    )

    # Write out the JSON transcript. The context manager flushes and closes
    # the file even if the write raises.
    with open(f"{data_dir}/{fname}.json", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(out))

    # Write out the text transcript. UTF-8 is explicit so non-ASCII transcript
    # text does not depend on the platform's default encoding.
    concatenated_text = " ".join(item['text'] for item in out)
    with open(f"{data_dir}/{fname}.txt", "w", encoding="utf-8") as fp:
        fp.write(concatenated_text)

    return out, concatenated_text

In [85]:
def get_youtube(yt_link, fname):
    '''
    Download a Youtube Video and write out a unique mp4 and mp3 file
    Input: yt_link: String, URL of youtube video
               fname: String, base filename (no extension) used for both the
                      video and the audio file inside data_dir
    Output: out: audio_path: String path to MP3 audio file
                 video_path: String path to MP4 video file, as returned by
                             the pytube download call
    Raises: ValueError if the video has no progressive mp4 stream
            subprocess.CalledProcessError if ffmpeg exits with an error
            FileNotFoundError if the ffmpeg executable is not available
    '''
    # Local import so this function does not depend on a file-level import.
    import subprocess

    target = f"{data_dir}/{fname}.mp4"
    # Highest-resolution progressive (audio + video in one file) mp4 stream.
    stream = (
        YouTube(yt_link).streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        raise ValueError(f"No progressive mp4 stream found for {yt_link}")
    video_path = stream.download(filename=target)

    # Swap only the extension; str.replace would also rewrite any ".mp4"
    # that happens to appear earlier in the path.
    audio_path = os.path.splitext(video_path)[0] + ".mp3"

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters reach ffmpeg unchanged. -y overwrites an existing
    # mp3 instead of stopping at an interactive prompt; -vn drops the video.
    subprocess.run(
        ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
         "-i", video_path, "-vn", audio_path],
        check=True,
    )
    return audio_path, video_path

In [86]:
def chunk_text(text, chunk_size, overlap):
    """
    Splits a text into chunks with a specified size and overlap.

    The text is split on whitespace. Each chunk holds up to `chunk_size`
    words, and each new chunk starts `chunk_size - overlap` words after the
    previous one. Chunking stops as soon as a chunk reaches the end of the
    text.

    Input:
    text (str): The text to be chunked.
    chunk_size (int): The size of each chunk in words.
    overlap (int): The number of words to overlap between consecutive chunks.

    Output:
    list: A list of text chunks (empty if the text has no words).

    Raises:
    ValueError: If more than one chunk is needed but `overlap` is not smaller
                than `chunk_size`, since the window could then never advance.
    """
    words = text.split()
    step = chunk_size - overlap

    # With a non-positive step the window never moves forward. That is only
    # harmless when the first chunk already covers the whole text; otherwise
    # the loop below would never terminate.
    if words and chunk_size < len(words) and step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size "
            f"({chunk_size}) to split {len(words)} words"
        )

    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))

        # Stop once this chunk has reached the end of the text
        if end >= len(words):
            break
        start += step  # next chunk re-uses the last `overlap` words

    return chunks

In [87]:
def find_timestamps(json_data, sentence):
    """
    Finds the start and end timestamps of a specific sentence within a given JSON data structure.

    The function looks for the first 20 characters of the sentence and for its last 20 characters
    inside the 'text' field of each item. Whenever the opening characters are found, the item's
    'start' timestamp is recorded (a later match replaces an earlier one until the end is found).
    Once a start is known, the first item whose text contains the closing characters is scanned
    word by word, and the 'end' timestamp of the first word equal to the sentence's final word
    (punctuation removed) is recorded. All comparisons are case-insensitive.

    Parameters:
    json_data (list of dicts): A list of dictionaries, each containing 'text', 'start' and 'words'
                               fields, where 'words' is a list of dicts with 'word' and 'end' fields.
    sentence (str): The sentence for which the start and end timestamps are to be found.

    Returns:
    tuple: A tuple (start_time, end_time) where both are timestamps. 'start_time' is the timestamp
           when the sentence starts, and 'end_time' is the timestamp when the sentence ends.
           Either value is None when the corresponding part of the sentence is not found.
    """
    start_time = None
    end_time = None
    start = sentence[:20].lower()
    end = sentence[-20:].lower()

    # Loop-invariant: punctuation-stripping table and the sentence's final word.
    strip_punct = str.maketrans('', '', string.punctuation)
    end_w = end.split(" ")[-1].translate(strip_punct)

    for item in json_data:
        text = item['text'].lower()
        if start in text:
            start_time = item['start']
        # Compare with None explicitly: a sentence that starts at 0.0 has a
        # valid but falsy timestamp.
        if start_time is not None and end in text:
            for i_w in item['words']:
                ip_w = i_w['word'].translate(strip_punct).lower()
                # NOTE(review): word entries presumably can lack an 'end' key
                # (e.g. tokens the aligner could not time) - confirm; such
                # entries are skipped instead of raising KeyError.
                if ip_w == end_w and 'end' in i_w:
                    end_time = i_w['end']
                    break
        if start_time is not None and end_time is not None:
            break

    return start_time, end_time

In [126]:
# Pydantic schema for one extracted quote; Section.quotes is a list of these.
# NOTE(review): the class docstring presumably ends up in the schema that
# instructor sends to the model, so its wording acts as prompt text - confirm
# before editing it.
class Quote(BaseModel):
    '''
    A quote is a sentence or few sentences extracted verbatim from a long transcript.
    Quotes are interesting because they may be insightful or funny or contrarian or a punchline.
    Quotes are valuable because they create great social media engagement.
    A quote is also independent & standalone and doesnt need context defined before it to be consumed.
    They are entites that can be understood without needing the transcript
    '''
    # The quote text.
    quote: str

# Pydantic schema for one section of the transcript together with its quotes;
# AllSections.sections is a list of these.
# NOTE(review): the "no more than 10" quote limit is stated only in the
# docstring; nothing in this class validates the length of `quotes`.
class Section(BaseModel):
    '''
    A transcript string is long and can contain many topics spoken at length in sequence.
    A section represents the summary of a subset of the transcript
    We have Top 10 Quotes that can be part of a single Section. There can be fewer than 10 but no more than 10
    There are usually many Sections in a transcript.
    '''
    # Section text (described in the docstring as a summary of part of the transcript).
    section: str
    # Quotes belonging to this section.
    quotes: List[Quote]

# Top-level response schema: get_quotes passes this class as response_model
# and serializes the resulting instance with model_dump_json().
class AllSections(BaseModel):
    '''
    All Sections found are part of a List
    '''
    # Every Section extracted from the transcript.
    sections : List[Section]
    
# class TranscriptClip(BaseModel):
#     '''
#     A Clip is a small sequential monologue or dialogue that is extracted from a long transcript which may be an interview or podcast.
#     A Clip can qualify for short form standalone engaging content.
#     '''
#     #clip : Annotated[str, AfterValidator(clip_length_check)] = Field(description="An engaging clip extracted verbatim from a long transcript")
#     clip : str = Field(description="An engaging clip extracted verbatim from a long transcript")
    
# class AllClips(BaseModel):
#     '''
#     A List of TranscriptClips
#     '''
#     clips: List[TranscriptClip]

def get_quotes(transcript):
    """
    Ask the OpenAI chat model to split a transcript into sections with quotes.

    The transcript is sent as the only user message; the shape of the reply
    is driven by the AllSections response model given to the patched client.

    Parameters:
    transcript (str): The long text input, such as a transcript of a talk, interview, or podcast.

    Returns:
    str: The AllSections result serialized to a JSON string (model_dump_json).
    """
    request = {
        "model": "gpt-4-0125-preview",
        "response_model": AllSections,
        "messages": [{"role": "user", "content": transcript}],
        "temperature": 0.0,
        "max_tokens": 4095,
        "max_retries": 3,
    }
    parsed = o_client.chat.completions.create(**request)
    return parsed.model_dump_json()

# Start Here

In [127]:
# Step 1: Generate a random 8-character base name, then download the YouTube
# video as an MP4 and extract its audio to an MP3, both named after it in data_dir.
fname = random_choice()
audio_path, video_path = get_youtube("https://youtu.be/3EJHMoh3Q1k?si=bjug0g2Ynis3rGh4", fname)

In [128]:
# Step 2: Upload the MP3 file to the storage bucket and transcribe it from the returned URL.
# `out` is the raw transcript output; `concatenated_text` is the full transcript as one string.
url = upload_to_bucket(f"{fname}.mp3", audio_path)
out, concatenated_text = get_transcript(url, fname)

In [129]:
# Step 3: Get sections and quotes from the transcript (get_quotes returns a JSON string)
oai_response = get_quotes(concatenated_text)

In [130]:
# Show the raw JSON string returned by get_quotes.
print(oai_response)

{"sections":[{"section":"Introduction","quotes":[{"quote":"Today, a successful man and a thorough contrarian. Why everything you thought you knew about business may very well be wrong. Peter Thiel, Uncommon Knowledge, now."},{"quote":"Welcome to Uncommon Knowledge. I'm Peter Robinson."},{"quote":"A co-founder of PayPal, a co-founder of Palantir, the president of Clarium Capital, a managing partner in the Founders Fund, and the first outside investor in Facebook, Peter Thiel is one of Silicon Valley's leading investors, one of its leading thinkers, and since finding himself portrayed in the movie The Social Network, one of its leading celebrities."},{"quote":"You can't escape it now. The son of German immigrants, Mr. Thiel grew up just up the road in Foster City, California. After majoring in philosophy as a Stanford undergraduate, he attended Stanford Law School, which is where he and I first became friends. Before returning to California to begin his career in business, Mr. Thiel prac

In [None]:
# Step 4: Extract the clips
clip_dict = json.loads(oai_response)
clip_list = clip_dict["clips"]

In [None]:
len(clip_list)

In [None]:
# Step 5: Clip it!
count = 0
for c in clip_list:
    s_t , e_t = find_timestamps(out, c['clip'])
    #print(f"found {s_t} and {e_t}\n\n")
    if s_t and e_t:
        cmd = f"ffmpeg -y -ss {s_t} -to {e_t} -i {video_path}  -c copy {data_dir}/{fname}_clipped_{count}.mp4 -hide_banner -loglevel error"
        os.system(cmd)
        count = count+1
    else:
        print(f"Couldnt find {c['title']}")