In [34]:
import os
import json
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
from tqdm.notebook import tqdm

load_dotenv('~/.secrets')

# use open ai to create embeddings
openai.api_key = os.getenv('OPENAI_KEY')

%reload_ext dotenv
%dotenv

Get the transcripts + video urls in one big data object...

In [45]:
def load_transcripts(file_name: str):
    """Read one transcript CSV from ``../data/transcribed/`` into a DataFrame.

    Args:
        file_name: Name of the CSV file inside the transcribed directory.

    Returns:
        The parsed DataFrame. Malformed rows are dropped rather than raising
        (``on_bad_lines='skip'``).
    """
    transcript_path = '../data/transcribed/' + file_name
    return pd.read_csv(transcript_path, on_bad_lines='skip')

def compare_string(title1, title2, num_words=3):
    """Report whether two titles start with the same words.

    Both titles are split on single spaces and their leading ``num_words``
    tokens are compared for exact equality. Titles shorter than ``num_words``
    are compared on however many tokens they have.

    Args:
        title1: First title.
        title2: Second title.
        num_words: How many leading words must agree (default 3).

    Returns:
        True when the leading words are identical, False otherwise. (The
        result is always a real bool; a mismatch no longer yields None.)
    """
    words1 = title1.split(' ')
    words2 = title2.split(' ')
    return words1[:num_words] == words2[:num_words]
    
def get_video_data(file_name: str):
    """Look up the video that a transcript file belongs to.

    The ``.wav.csv`` suffix is stripped from ``file_name`` and the remainder
    is matched against each entry's ``'title'`` in the module-level
    ``video_data`` list using ``compare_string``. The first match wins.

    Args:
        file_name: Transcript file name, e.g. ``"<title>.wav.csv"``.

    Returns:
        A ``(url, upload_date)`` tuple taken from the matching entry.

    Raises:
        LookupError: If no entry in ``video_data`` matches. Previously the
            function fell through and returned None, which surfaced as an
            opaque unpacking TypeError at the call site.
    """
    # Loop-invariant: derive the title from the file name once.
    title = file_name.split('.wav.csv')[0]
    for video in video_data:
        if compare_string(video['title'], title):
            return video['url'], video['upload_date']
    raise LookupError(
        f"No entry in video_data matches transcript file {file_name!r}"
    )

# Every file in the transcribed directory is treated as one transcript.
transcribed_dir = os.listdir('../data/transcribed')

# Read the video metadata with a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open('../data/videos.json', 'r') as videos_file:
    video_data = json.load(videos_file)

# One record per transcript file: the parsed transcript plus the URL and
# upload date of the video it was matched to.
transcript_data = []
for file in transcribed_dir:
    df = load_transcripts(file)
    video_url, video_upload_date = get_video_data(file)

    transcript_data.append({
        'video_url': video_url,
        'file': file,
        'transcript': df,
        'upload_date': video_upload_date
    })

Load embeddings and see which transcripts need to be newly processed...

In [53]:
# Embeddings produced by earlier runs of this notebook.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, so only
# ever point this at a file you created yourself.
original_embeddings = np.load('embeddings.npy', allow_pickle=True)

# Distinct video URLs that already have at least one stored embedding.
completed_transcripts = list({x['video_url'] for x in original_embeddings})

# Keep only the transcripts whose video has not been embedded yet.
new_transcripts = []
for transcript in transcript_data:
    if transcript['video_url'] not in completed_transcripts:
        new_transcripts.append(transcript)
        print("Processing", transcript['video_url'])

Processing https://www.youtube.com/watch?v=lbjWRvzL-o0
Processing https://www.youtube.com/watch?v=4NnXdK-Kncc
Processing https://www.youtube.com/watch?v=lh8Zdyy3zTQ
Processing https://www.youtube.com/watch?v=AxAAJnp5yms
Processing https://www.youtube.com/watch?v=EUu0bnDNC-A
Processing https://www.youtube.com/watch?v=xZD5x39M6zw
Processing https://www.youtube.com/watch?v=Luz82RG5PqA
Processing https://www.youtube.com/watch?v=1dhGKw7ph3s
Processing https://www.youtube.com/watch?v=0kLX6ewqQ98
Processing https://www.youtube.com/watch?v=CUb942pcIdQ
Processing https://www.youtube.com/watch?v=TXsw_92Y2e0
Processing https://www.youtube.com/watch?v=A6_UOejJ8Zk
Processing https://www.youtube.com/watch?v=zYpyS2HaZHM
Processing https://www.youtube.com/watch?v=m4RolgXsoxY
Processing https://www.youtube.com/watch?v=vKZXiQOO52I
Processing https://www.youtube.com/watch?v=KkmZGFEpd5c
Processing https://www.youtube.com/watch?v=ltyntSIVsjA
Processing https://www.youtube.com/watch?v=SqgSEgVph1U
Processing

Okay, now that all the data is in one place, we can embed multiple transcripts in a batched way...

In [54]:
def get_embeddings(line: str):
    """Request an embedding for ``line`` from OpenAI.

    Sends ``line`` as a single-element input to the
    ``text-embedding-ada-002`` model and returns the ``"embedding"`` field of
    the first item in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(input=[line], model="text-embedding-ada-002")
    first_result = response["data"][0]
    return first_result["embedding"]

# structured dtype for one stored record: the embedding vector together with
# the text it was computed from, the chunk start time and the video URL
embedding_type = np.dtype([
    ('embedding', np.float64, (1536,)), 
    ('line', str, 10000),
    ('start_time', np.float64),
    ('video_url', str, 1000),
])

# initialize the embeddings array
embeddings = []

# how many (s) to embed in 1 go
DURATION = 10


def _embed_chunk(text, start_time, video_url):
    """Embed one chunk of text and package it as a record for embedding_type."""
    return (get_embeddings(text), text, start_time, video_url)


# iterate through each transcript
for transcript in new_transcripts:
    print("Embedding video: " ,transcript['file'])
    video_url = transcript['video_url']
    df = transcript['transcript']

    embed_text = ''
    text_duration = 0
    start_time = 0
    # iterate through each line in the transcript
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        line = row['text']
        # convert time to seconds (values are divided by 1000)
        start, end = row['start'] / 1000, row['end'] / 1000

        # Once the accumulated chunk has reached the target duration, embed it
        # and begin a new chunk with the current line.
        if text_duration >= DURATION:
            embeddings.append(_embed_chunk(embed_text, start_time, video_url))
            embed_text = ''
            text_duration = 0
            # The new chunk begins where the current line begins (not where
            # it ends), so the stored timestamp points at the chunk's text.
            start_time = start

        # The current line always joins the open chunk, with a trailing space
        # so consecutive lines never run together, and its duration counts
        # towards that chunk.
        embed_text += line + ' '
        text_duration += end - start

    # Flush whatever is left so the tail of each transcript is not dropped.
    if embed_text:
        embeddings.append(_embed_chunk(embed_text, start_time, video_url))

embeddings = np.array(embeddings, dtype=embedding_type)
# append new embeddings to old embeddings
embeddings = np.append(embeddings, original_embeddings)
# remove records whose embedding vector is all zeros
embeddings = embeddings[~np.all(embeddings['embedding'] == 0, axis=1)]

Embedding video:  Intel Takes The L  - WAN Show August 5, 2022.wav.csv


  0%|          | 0/3326 [00:00<?, ?it/s]

Embedding video:  Is Intel ARC REALLY Cancelled？  - WAN Show July 29, 2022.wav.csv


  0%|          | 0/1115 [00:00<?, ?it/s]

Embedding video:  Our Worst Week in Years - WAN Show March 24, 2023.wav.csv


  0%|          | 0/5531 [00:00<?, ?it/s]

Embedding video:  Our Biggest Sponsor Pulled Out - WAN Show February 10, 2023.wav.csv


  0%|          | 0/2740 [00:00<?, ?it/s]

Embedding video:  I've never been so angry...  - WAN Show September 2, 2022.wav.csv


  0%|          | 0/1297 [00:00<?, ?it/s]

Embedding video:  Userbench CPU score DRAMA - WAN Show Aug 9, 2019.wav.csv


  0%|          | 0/1599 [00:00<?, ?it/s]

Embedding video:  Why Do I Keep Getting Called Out - WAN Show December 2, 2022.wav.csv


  0%|          | 0/2846 [00:00<?, ?it/s]

Embedding video:  I'm A Short King - WAN Show January 20, 2023.wav.csv


  0%|          | 0/4153 [00:00<?, ?it/s]

Embedding video:  USB Branding Changed Again... - WAN Show September 30, 2022.wav.csv


  0%|          | 0/3246 [00:00<?, ?it/s]

Embedding video:  Lenovo Attacked My Investment  - WAN Show August 19, 2022.wav.csv


  0%|          | 0/1775 [00:00<?, ?it/s]

Embedding video:  We Talked To A VP At Microsoft - WAN Show December 23, 2022.wav.csv


  0%|          | 0/3243 [00:00<?, ?it/s]

Embedding video:  The Bank of Silicon Valley Just Collapsed - WAN Show March 10, 2023.wav.csv


  0%|          | 0/4530 [00:00<?, ?it/s]

Embedding video:  Where Will This End？ - WAN Show November 25, 2022.wav.csv


  0%|          | 0/2826 [00:00<?, ?it/s]

Embedding video:  I Am Hard... [REDACTED] - WAN Show March 17, 2023.wav.csv


  0%|          | 0/5770 [00:00<?, ?it/s]

Embedding video:  We've Made Some Big Mistakes - WAN Show November 18, 2022.wav.csv


  0%|          | 0/2072 [00:00<?, ?it/s]

Embedding video:  My Reaction Channel Is Getting Called Out - WAN Show January 27, 2023.wav.csv


  0%|          | 0/2569 [00:00<?, ?it/s]

Embedding video:  4K YouTube Is Getting PAYWALLED - WAN Show October 7, 2022.wav.csv


  0%|          | 0/3657 [00:00<?, ?it/s]

Embedding video:  YouTube Backstabbed Me - WAN Show October 21, 2022.wav.csv


  0%|          | 0/1374 [00:00<?, ?it/s]

Embedding video:  Coffeezilla EXPOSES My Fellow Creators - WAN Show December 30, 2022.wav.csv


  0%|          | 0/2494 [00:00<?, ?it/s]

Embeddings generated, now time to insert!

In [12]:
# embeddings = np.load('embeddings.npy', allow_pickle=True)

from sqlalchemy import Integer, String, create_engine, text, JSON, insert, Float
from sqlalchemy.orm import declarative_base, mapped_column, Session
from pgvector.sqlalchemy import Vector

# Connection settings for the Supabase-hosted Postgres database. Only the
# password comes from the environment (POSTGRES_PWD); the rest of the URI is
# fixed.
postgres_pwd = os.environ.get('POSTGRES_PWD')
db_uri = f'postgresql://postgres:{postgres_pwd}@db.wiagwfwjtbojsbzmeduv.supabase.co:5432/postgres'
engine = create_engine(db_uri)

# Declarative base that the ORM model in this cell registers its table on.
Base = declarative_base()

class Document(Base):
    """
    ORM model for one row of the ``docs`` table: an embedded chunk of
    transcript text together with the video it came from.
    """

    __tablename__ = 'docs'

    # Auto-incrementing integer primary key.
    id = mapped_column(Integer, primary_key=True, autoincrement=True)
    # pgvector column holding a 1536-dimensional embedding vector.
    embedding = mapped_column(Vector(1536))
    # The text that was embedded.
    line = mapped_column(String)
    # Free-form JSON metadata stored alongside the row.
    meta = mapped_column(JSON)
    # URL of the video the text belongs to.
    video_url = mapped_column(String)
    # Populated from the chunk's start_time by the insert code in this cell.
    timestamp = mapped_column(Float)
    # Filled in by the database via NOW() when no value is supplied.
    # NOTE(review): declared as String rather than a timestamp type - confirm
    # this is intended.
    created_at = mapped_column(String, server_default=text('NOW()'))
    

# drop_all() issues DROP TABLE for every table registered on Base.metadata
# (here only 'docs'), which DELETES ALL EXISTING ROWS; create_all() then
# recreates the table empty. The table is therefore rebuilt from scratch on
# every run, and everything in `embeddings` is re-inserted below.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

# Convert numpy scalars to native Python types so neither the DB driver nor
# the JSON encoder depends on how NumPy represents its scalar types.
documents = [
    dict(
        embedding=embedding['embedding'],
        line=str(embedding['line']),
        meta={
            'line': str(embedding['line']),
            'start_time': float(embedding['start_time'])
        },
        video_url=str(embedding['video_url']),
        timestamp=float(embedding['start_time'])
    ) for embedding in embeddings
]

# The context manager closes the session (releasing its connection) even if
# the insert fails. Skip the statement entirely when there is nothing to add.
with Session(engine) as session:
    if documents:
        session.execute(insert(Document), documents)
        session.commit()

In [56]:
# saving embeddings locally
# Writes the current `embeddings` array to embeddings.npy in the working
# directory, overwriting any existing file of that name.
np.save('embeddings.npy', embeddings)

Also add the YouTube video metadata (title, URL, upload date) to the DB...

In [21]:
from sqlalchemy import Integer, String, create_engine, text, JSON, insert, Float, Date
from sqlalchemy.orm import declarative_base, mapped_column, Session
from pgvector.sqlalchemy import Vector

# Connection settings for the Supabase-hosted Postgres database. Only the
# password comes from the environment (POSTGRES_PWD); the rest of the URI is
# fixed.
postgres_pwd = os.environ.get('POSTGRES_PWD')
db_uri = f'postgresql://postgres:{postgres_pwd}@db.wiagwfwjtbojsbzmeduv.supabase.co:5432/postgres'
engine = create_engine(db_uri)

# Fresh declarative base, so only the model defined in this cell is
# registered on its metadata.
Base = declarative_base()

class Videos(Base):
    """
    ORM model for one row of the ``videos`` table: the title, URL and upload
    date of a video.
    """

    __tablename__ = 'videos'

    # Auto-incrementing integer primary key.
    id = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Video title.
    title = mapped_column(String)
    # Video URL.
    url = mapped_column(String)
    # create a date column to store the date the video was created
    video_uploaded_at = mapped_column(Date)
    # Filled in by the database via NOW() when no value is supplied.
    # NOTE(review): declared as String rather than a timestamp type - confirm
    # this is intended.
    created_at = mapped_column(String, server_default=text('NOW()'))
    

# drop_all() issues DROP TABLE for every table registered on Base.metadata
# (here only 'videos'), which DELETES ALL EXISTING ROWS; create_all() then
# recreates the table empty. The table is rebuilt from scratch on every run.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

videos = [
    dict(
        # Strip only the '.wav.csv' suffix. Splitting on the first '.' would
        # truncate any title that itself contains a dot (e.g. "... angry...").
        title=video['file'].split('.wav.csv')[0],
        url=video['video_url'],
        # NOTE(review): passed through exactly as stored in videos.json;
        # confirm its format is accepted by the Date column.
        video_uploaded_at=video["upload_date"]
    ) for video in transcript_data
]

# The context manager closes the session (releasing its connection) even if
# the insert fails. Skip the statement entirely when there is nothing to add.
with Session(engine) as session:
    if videos:
        session.execute(insert(Videos), videos)
        session.commit()