# Imports & Globals

In [None]:
import json
import os
import random
import string
import subprocess

import replicate
from dotenv import load_dotenv
from google.cloud import storage
from openai import OpenAI
from pytube import YouTube

# Set your environment variables. see .env.example
# load_dotenv() runs first so the values are in the process environment before
# the clients below are constructed (GCS_BUCKET is read later by upload_to_bucket).
load_dotenv() 
# Google Cloud Storage client, used by upload_to_bucket.
client = storage.Client()
# OpenAI client, used by get_quotes.
o_client = OpenAI()
# Directory where downloaded media, transcripts and clips are written.
data_dir = "data"

# Functions

In [None]:
# Characters a generated name may contain: lowercase ASCII letters and digits.
alphabet = string.ascii_lowercase + string.digits
def random_choice(length=8):
    '''
    Return a random string of lowercase letters and digits.
    Used to build a unique base filename for one run; it is not meant to be
    a secret (it relies on the non-cryptographic `random` module).
        Input: length: int, number of characters to generate (default 8)
        Output: String of `length` characters drawn from `alphabet`
    '''
    return ''.join(random.choices(alphabet, k=length))

In [None]:
def upload_to_bucket(blob_name, file_path):
    '''
    Push a local file into the Google Cloud Storage bucket named by the
    GCS_BUCKET environment variable.
        Input: blob_name: String, object name to create in the bucket
               file_path: String, local path of the file to send
        Output: String, the public URL of the uploaded blob
    '''
    bucket_name = os.getenv('GCS_BUCKET')
    target = client.get_bucket(bucket_name).blob(blob_name)
    target.upload_from_filename(file_path)
    return target.public_url

In [None]:
def get_transcript(url, fname):
    '''
    Replicate Deployment call to fetch transcript.
    Writes out the transcript JSON and text to files
    ({data_dir}/{fname}.json and {data_dir}/{fname}.txt).
        Input: url: String, of the uploaded blob
               fname: String, filename to write out JSON and text files
        Output: out: the prediction output; it is iterated as a sequence of
                     items that each have a 'text' key
                concatenated_text: String, full transcript text (the 'text'
                     of every item joined with single spaces)
        Raises: RuntimeError if the prediction finished without any output
    '''
    deployment = replicate.deployments.get("shrihacker/miniseconds-whisperx")
    prediction = deployment.predictions.create(
        input={"audio": url, "batch_size": 16, "align_output": True}
    )
    # Block until the prediction has finished.
    prediction.wait()
    out = prediction.output
    if out is None:
        # Fail with a clear message instead of a TypeError when iterating None below.
        raise RuntimeError(
            f"Transcription returned no output (status: {prediction.status})"
        )

    # Make sure the output directory exists before writing into it.
    os.makedirs(data_dir, exist_ok=True)

    # Write out the JSON transcript; the context manager flushes and closes the file.
    with open(f"{data_dir}/{fname}.json", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(out))

    # Write out the text transcript; UTF-8 so non-ASCII speech text is
    # written the same way on every platform.
    concatenated_text = " ".join(item['text'] for item in out)
    with open(f"{data_dir}/{fname}.txt", "w", encoding="utf-8") as fp:
        fp.write(concatenated_text)

    return out, concatenated_text

In [None]:
def get_youtube(yt_link, fname):
    '''
    Download a Youtube Video and write out a unique mp4 and mp3 file
    Input: yt_link: String, URL of youtube video
               fname: String, filename to write out Audio and Video files
    Output: out: audio_path: String path to MP3 audio file 
                 video_path: String path to MP4 video file
    Raises: ValueError if the video offers no progressive mp4 stream
            subprocess.CalledProcessError if ffmpeg exits with an error
    '''
    target = f"{data_dir}/{fname}.mp4"
    # Highest-resolution progressive mp4 stream; first() yields None when nothing matches.
    stream = (
        YouTube(yt_link)
        .streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        raise ValueError(f"No progressive mp4 stream available for {yt_link}")
    video_path = stream.download(filename=target)

    # Swap only the file extension; str.replace would also rewrite any
    # ".mp4" that happens to appear in a directory name.
    audio_path = os.path.splitext(video_path)[0] + ".mp3"

    # Argument list instead of a shell string so paths with spaces or shell
    # metacharacters are passed through untouched. "-y" keeps ffmpeg from
    # waiting on an overwrite prompt; check=True surfaces a failed extraction
    # here rather than at upload time.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-vn", audio_path,
         "-hide_banner", "-loglevel", "error"],
        check=True,
    )
    return audio_path, video_path

In [None]:
def chunk_text(text, chunk_size, overlap):
    """
    Splits a text into chunks with a specified size and overlap.

    Words are obtained by splitting on whitespace. Consecutive chunks start
    `chunk_size - overlap` words apart, and the last chunk may be shorter
    than `chunk_size`.

    Input:
    text (str): The text to be chunked.
    chunk_size (int): The size of each chunk in words. Must be positive.
    overlap (int): The number of words to overlap between consecutive chunks.
                   Must be smaller than chunk_size.

    Output:
    list: A list of text chunks (empty when the text contains no words).

    Raises:
    ValueError: If chunk_size is not positive or overlap >= chunk_size;
                such values would never advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        # This chunk already reached the end of the text; a further chunk
        # would only repeat the overlapping tail.
        if end >= len(words):
            break

    return chunks

In [None]:
def find_timestamps(json_data, sentence):
    """
    Finds the start and end timestamps of a specific sentence within a given JSON data structure.

    The function searches for the first 20 characters of the sentence and for its last 20 characters
    within the 'text' field of each JSON object (case-insensitive substring match). Whenever the
    opening characters are found, the item's 'start' timestamp is recorded; a later match replaces an
    earlier one until the end has been located. Once a start is known, the first item whose text
    contains the closing characters is scanned for the sentence's last word (punctuation and case
    ignored), and that word's 'end' timestamp is recorded.

    Parameters:
    json_data (list of dicts): A list of dictionaries, each containing 'text', 'start' and 'words'
                               fields, where 'words' is a list of dicts with 'word' and 'end' fields.
    sentence (str): The sentence for which the start and end timestamps are to be found.
                    Surrounding whitespace is ignored.

    Returns:
    tuple: A tuple (start_time, end_time) where both are timestamps. 'start_time' is the timestamp
           when the sentence starts, and 'end_time' is the timestamp when the sentence ends.
           Either value is None when the corresponding part was not found; an empty sentence
           yields (None, None).
    """
    sentence = sentence.strip()
    if not sentence:
        # An empty needle would match every item; report "not found" instead.
        return None, None

    # Loop-invariant search keys, prepared once.
    strip_punct = str.maketrans('', '', string.punctuation)
    start_key = sentence[:20].lower()
    end_key = sentence[-20:].lower()
    # Last word of the sentence, lower-cased and without punctuation.
    end_word = end_key.split()[-1].translate(strip_punct)

    start_time = None
    end_time = None
    for item in json_data:
        text = item['text'].lower()
        if start_key in text:
            start_time = item['start']
        # Compare against None explicitly: a start of 0.0 is valid but falsy.
        if start_time is not None and end_key in text:
            for word in item['words']:
                if word['word'].translate(strip_punct).lower() == end_word:
                    # Some entries may lack their own 'end' timestamp; fall back
                    # to the item's 'end' when present (assumption - confirm
                    # against the transcript schema).
                    end_time = word.get('end', item.get('end'))
                    break
        if start_time is not None and end_time is not None:
            break

    return start_time, end_time

In [None]:
# System message sent with every get_quotes request. It tells the model to
# return verbatim clips under a single top-level "clips" key; the example
# structure below is closed with a matching brace so it is well-formed.
system_prompt = """
You are a published bestseller NYT Author and Social Media Influencer.
Given some long text that is an interview or podcast or transcript of a talk, you find clips that qualify for short form standalone content that will be published online.
Clips are defined as a group of sequential sentences that can represent engaging standalone content.
Our goal is social media engagement.
The maximum length of a clip that qualifies is 20 sentences.
You should not alter the text simply find as many clips as you can and return the clipped text, whole and verbatim.
Your response should be a JSON list of dictionaries. Make a single key called "clips" which is a List of dicts
Each dicionary has key 'clip': the whole verbatim clipped text and key 'title': a short soundbite title why the clip is engaging.
Example JSON Structure:
{
    "clips" : [ {"clip": <verbatim clip that qualifies>, "title" : <a short soundbite title>}, 
                {"clip": <verbatim clip that qualifies>, "title" : <a short soundbite title>},
                ...
            ]
}
"""
def get_quotes(transcript):
    """
    Ask the chat model to pick engaging, standalone clips out of a transcript.

    The module-level system prompt is sent as the system message and the
    transcript as the user message. The request asks for a JSON-object
    response, and a single completion is requested.

    Parameters:
    transcript (str): The long text input, such as a transcript of a talk, interview, or podcast.

    Returns:
    str: The message content of the first choice. Per the system prompt this is expected to be
         a JSON object with a "clips" list whose entries carry 'clip' and 'title' keys.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": transcript},
    ]
    request = dict(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=conversation,
        temperature=0.2,
        max_tokens=4095,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        n=1,
    )
    completion = o_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Start Here

In [None]:
# Step 1 : Generate a random name, and fetch a Youtube video's MP4 and MP3 with that name
# fname is reused by the later steps as the base name for the uploaded blob,
# the transcript files and the clipped videos.
fname = random_choice()
audio_path, video_path = get_youtube("https://youtu.be/9uOMectkCCs?si=PNvQKilpwlxxbLPe", fname)

In [None]:
# Step 2 : Upload the MP3 to the storage bucket, then transcribe it from the returned URL
# out is the timestamped transcript; concatenated_text is the plain text passed to get_quotes.
url = upload_to_bucket(f"{fname}.mp3", audio_path)
out, concatenated_text = get_transcript(url, fname)

In [None]:
# Step 3: Get Quotes from the transcript
# oai_response is the raw string returned by the model; it is parsed in Step 4.
oai_response = get_quotes(concatenated_text)

In [None]:
# Step 4: Extract the clips
# Parse the model's JSON string and pull out the list stored under "clips".
clip_dict = json.loads(oai_response)
clip_list = clip_dict["clips"]

In [None]:
# Number of clips returned by the model.
len(clip_list)

In [None]:
# Step 5: Clip it!
# For every clip the model proposed, look up its timestamps in the transcript
# and cut that span out of the downloaded video with ffmpeg (stream copy).
# count numbers the output files and only advances when a clip was written.
count = 0
for c in clip_list:
    print(f"{c['title']} : {c['clip']}")
    s_t , e_t = find_timestamps(out, c['clip'])
    #print(f"found {s_t} and {e_t}\n\n")
    # Compare against None: a clip that starts at 0.0 seconds is valid but falsy.
    if s_t is not None and e_t is not None:
        clip_path = f"{data_dir}/{fname}_clipped_{count}.mp4"
        # Argument list instead of a shell string so paths containing spaces
        # or shell metacharacters are passed to ffmpeg unchanged.
        cmd = ["ffmpeg", "-y", "-ss", str(s_t), "-to", str(e_t), "-i", video_path,
               "-c", "copy", clip_path, "-hide_banner", "-loglevel", "error"]
        result = subprocess.run(cmd, check=False)
        if result.returncode == 0:
            count = count+1
        else:
            # Keep processing the remaining clips, but do not hide the failure.
            print(f"ffmpeg failed (exit code {result.returncode}) for {c['title']}")
    else:
        print(f"Couldnt find {c['title']}")