In [1]:
import openai
import os
import numpy as np

from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Markdown, display

load_dotenv('~/.secrets')

# use open ai to create embeddings
openai.api_key = os.getenv('OPENAI_KEY')


%reload_ext dotenv
%dotenv

Let's load up some data!!

In [9]:
def load_transcripts(file_name: str):
    # load text file transcript.txt 
    with open(file_name, 'r') as file:
        transcript = file.read()
    return transcript

def clean_transcript(transcript: str):
    # remove timestamps from lines
    lines = transcript.strip('\n').split('\n')
    formatted_lines = [line.split(']')[1].strip() for line in lines]
    return formatted_lines

transcript = load_transcripts('transcript.txt')
formatted_lines = clean_transcript(transcript)
formatted_lines[:5]

['Welcome to the WAN show everyone.',
 'The truth is, I screwed up this week.',
 "It's very obvious that, I mean, I just couldn't conceal it anymore.",
 'We were paid by NVIDIA for our RTX 4060 TI review.',
 'And the scariest part of what I just said is that a not insignificant number of people']

Process

1. For a given WAN show, create the whisper transcript for it.
2. Create embeddings for every line.
3. Store the embeddings in Pinecone
    - Need to store embeddings
    - + also information about the podcast
    - For ex timestamp, date / show of the podcast etc.
4. (Optional) - Also embed summaries of the WAN show 
    - For example, Ask an LLM "What was the topics discussed on today's show"? and embed it.
    

### Generating and Storing Embeddings

In [10]:
def get_embeddings(line: str):
    return openai.Embedding.create(input=[line], model="text-embedding-ada-002")["data"][0]["embedding"]


# create the embedding for store the emebdding and line
embedding_type = np.dtype([
    ('embedding', np.float64, (1536,)), 
    ('line', str, 10000),
    ('start_time', np.float64),
])

# initialize the embeddings array
embeddings = np.empty(len(formatted_lines), dtype=embedding_type)

# load embeddings from file if it exists
if os.path.exists('embeddings.npy'):
    embeddings = np.load('embeddings.npy', allow_pickle=True)

Generate the embeddings for a few lines...

In [10]:
question_embedding = get_embeddings("What are they discussing in the show?")

Hmmm. This makes me feel like -
- we can/should embed more than line by line? Because otherwise the vector search is on a very small context.
- OR, once the similar line is received, get also +- 5 secs of audio around the line for even more context.

In [11]:
from datetime import datetime

def extract_start_end_time(line: str):
    # Extract the start and end time string from the square brackets
    time_string = line[line.find('[') + 1:line.find(']')]

    # Split the time string into start and end time
    start_time, end_time = time_string.split(' --> ')

    # Parse the start and end time into datetime objects
    start_datetime = datetime.strptime(start_time, "%H:%M:%S.%f")
    end_datetime = datetime.strptime(end_time, "%H:%M:%S.%f")

    # Extract the time components (hours, minutes, seconds) from datetime objects
    start_time = start_datetime.time()
    end_time = end_datetime.time()

    return start_time, end_time

def get_time_diff(start, end):
    return (datetime.combine(datetime.min, end) - datetime.combine(datetime.min, start)).total_seconds()

# get time in seconds from end
def get_time_in_seconds(end):
    return (datetime.combine(datetime.min, end) - datetime.min).total_seconds()

# parse out timestamps
lines = transcript.strip('\n').split('\n')

# join timestamps for every 20s
start, end = extract_start_end_time(lines[1])
# get difference in seconds

get_time_diff(start, end)

4.04

Get embeddings of a certain duration of content...

In [21]:
# initialize the embeddings array
embeddings = np.empty(len(formatted_lines), dtype=embedding_type)

# variable
DURATION = 20

embed_text = ''
text_duration = 0
embed_idx = 0
start_time = 0
# add tqdm to show progress bar
from tqdm.notebook import tqdm

for i in tqdm(range(len(lines))):
    raw_line = lines[i]
    # get the line start and end time
    start, end = extract_start_end_time(raw_line)
    # get the difference in seconds
    time_diff = get_time_diff(start, end)
    # get the raw line
    line = raw_line.split(']')[1].strip()

    # if the time difference is less than the embed duration, add the line to the embed text
    if text_duration < DURATION:
        embed_text += line + ' '
        text_duration += time_diff
        continue

    embeddings[embed_idx]['embedding'] = get_embeddings(embed_text)
    embeddings[embed_idx]['line'] = embed_text
    embeddings[embed_idx]['start_time'] = start_time
    # restart from this line
    embed_text = line
    start_time = get_time_in_seconds(end)
    text_duration = 0
    embed_idx += 1

# remove embeddings with sum 0
embeddings = embeddings[~np.all(embeddings['embedding'] == 0, axis=1)]
# save embeddings
# np.save('embeddings.npy', embeddings)

  0%|          | 0/2497 [00:00<?, ?it/s]

Save embeddings whenever necessary...

In [23]:
# save embeddings
# np.save('embeddings.npy', embeddings)

In [12]:
def answer_question_from_context(question: str, most_similar_indices: list):
    # append all lines from most similar indices
    context = '\n'.join([embeddings[index]['line'] for index in most_similar_indices])
    # create the prompt
    prompt = f"""
        Question: {question}
        Context:
        {context}
    """

    # create the completion
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a bot that answers questions about the WAN show from Linus Tech Tips."},
            {"role": "user", "content": prompt},
            {"role": "user", "content": "Give Answer in bullet points. Explain in detail."},
        ]
    )
    return completion.choices[0].message.content

def get_context_to_question(question: str):
    question_embedding = get_embeddings(question)

    # calculate the cosine similarity between the question and each line
    similarities = np.array([
        cosine_similarity(
            np.array(question_embedding).reshape(1, -1), 
            np.array(embedding['embedding']).reshape(1, -1)
        ) 
        for embedding in embeddings
    ])
    # get the 5 most similar lines
    most_similar_indices = np.argsort(similarities, axis=0)[-10:].flatten()
    return most_similar_indices


def answer_question(question: str):
    # get the most similar indices
    most_similar_indices = get_context_to_question(question)
    # answer the question
    answer = answer_question_from_context(question, most_similar_indices)
    return answer, most_similar_indices

In [8]:
answer, _ = answer_question("What are the topics discussed in the WAN show?")
display(Markdown(answer))

Topic: Topics Discussed in the WAN Show

Bullet points:
- Eight Sleep and viewer concerns about mandatory subscriptions
- Signalwire sponsorship
- Traveling with LAN gear tips
- Channel strategy and content evolution
- National Eating Disorders Association chatbot controversy
- Interesting places merch has been spotted
- NVIDIA paid review disclosure
- Mega channels and content categorization

Explanation:

The WAN Show is a weekly show hosted by Linus and Luke from Linus Tech Tips that covers a broad range of technology-related topics. In this particular episode, some of the topics discussed were:

- Eight Sleep and viewer concerns about mandatory subscriptions: Eight Sleep, a sponsor of Linus Tech Tips, recently faced criticism from viewers for requiring a mandatory subscription for their products. Linus and Luke discussed the issue and acknowledged viewers' concerns, as well as their own disappointment with the messaging of the product.
- Signalwire sponsorship: The show also featured a sponsorship segment for Signalwire, a cloud communications company.
- Traveling with LAN gear tips: One viewer asked for tips on traveling with LAN gear, and Linus and Luke jokingly suggested removing the GPU and talked about some of their own experiences with transporting equipment.
- Channel strategy and content evolution: The hosts discussed their content strategy and how it has evolved over the years, with a focus on category-specific channels and mega channels like MKBHD and LTT.
- National Eating Disorders Association chatbot controversy: The show also briefly touched on a controversy involving the National Eating Disorders Association replacing a helpline with a chatbot.
- Interesting places merch has been spotted: Another viewer asked about the most interesting place Linus Tech Tips merch has been spotted, and the hosts shared some fun examples.
- NVIDIA paid review disclosure: Linus briefly disclosed that they were paid by NVIDIA for an RTX 4060 TI review, and acknowledged that some viewers might be concerned about the disclosure.
- Mega channels and content categorization: Finally, the hosts discussed their thoughts on mega channels and category-specific channels, and how content categorization may play a role in the future of online content.

In [9]:
answer, _ = answer_question("What is the RTX 4060ti discourse about?")
display(Markdown(answer))

Topic: RTX 4060ti discourse

• People are disappointed with the RTX 4060ti review and think that it should have been more critical of NVIDIA.

• Some comments suggest that Linus may be supporting NVIDIA because he didn't give a harsh enough critique.

• Linus acknowledges that the naming of the RTX 4060ti is a problem, as it is marketed as a 60Ti card but has 50 series performance and is priced higher than it should be.

• Linus notes that AMD's product stack is also confusing, with a mix of 60 and 50 series cards.

• Linus mentions that they made an error in the review where they incorrectly stated the RTX 4060ti had a 16x interface instead of an 8x interface.

• Linus talks about AMD's recent price drops on the 7600XT, which is now priced at $269.

• Linus jokes that they were paid by NVIDIA for the review, which is not true.

• The truth is that Linus is proud of their RTX 4060ti and RX7600 reviews, but acknowledges that they made some errors.

• Reports suggest that the RTX 4060ti has not generated much consumer interest and some retailers are already discounting it.

TODO
 - Given the answer, get the context it referred to, and then the timestamps, so that you can link it back to the youtube video!
 - Embed summaries of popular questions (What is the wan show about?)
 - Specify metadata structure
    - line
    - date / timestamp
    - youtube video link
- Play with whisper settings
    - Max character limit per generation?
    - How many words to translate at a time?
- Embed conversations with a little bit of overlap. Maybe use langchain.
- Use "Think about it step by step" prompting / reAct.

### Getting Context

IMO, one of the major benefits of video/audio search like this is being able to not only summarize, but also refer to the the snippets of the video / audio for further references.

In [71]:
answer, context = answer_question("What is the RTX 4060ti controversy about?")

Get the relevant lines which contributed to the summary...

In [73]:
video_url = "https://www.youtube.com/watch?v=0vP5Knq1xhs"

def get_video_timestamp(main_url: str, start_time: float):
    return f"{main_url}&t={int(start_time)}"

for index in context[-10:]:
    print(embeddings[index]['start_time'])
    print(embeddings[index]['line'])
    print(get_video_timestamp(video_url, embeddings[index]['start_time']))

3234.4
Let's go. Like, Oh, come on. Oh, well. Anyway, AMD, of course, dropped prices on the 7600 XT down to 269, only 36 hours before launch. Okay, I'd actually like to talk about that. 
https://www.youtube.com/watch?v=0vP5Knq1xhs&t=3234
221.96
But we've got a couple of comments in a row here. Like, why can't you just say it's a bad deal? You're not crapping on Nvidia makes it look like you're caping for Nvidia. Not saying you are, but that's what it looks like. This review also feels short and unrigorous. This is a rare thumbs down for me. We had probably more game benchmarking than just them we've ever had before. 
https://www.youtube.com/watch?v=0vP5Knq1xhs&t=221
410.28
It doesn't affect the numbers that we showed you guys, but that is something that we could have gotten a little bit better. I think there were a couple small things in the 7600. But every time we release a new GPU review, we're getting a little bit cleaner about it. It's a lot of moving parts, guys. But this is one o

Hmm, we need to have the timestamps as part of the embedding too, otherwise it will be hard to know where it came from.

In [65]:
get_video_timestamp(video_url, 0)

'https://www.youtube.com/watch?v=0vP5Knq1xhs?t=0'

Inserting embeddings into postgres with pgvector...

In [24]:
from sqlalchemy import Integer, String, create_engine, text, JSON, insert, Float
from sqlalchemy.orm import declarative_base, mapped_column, Session
from pgvector.sqlalchemy import Vector

postgres_pwd = os.environ.get('POSTGRES_PWD')
# supabase engine
db_uri = 'postgresql://postgres:{}@db.wiagwfwjtbojsbzmeduv.supabase.co:5432/postgres'.format(postgres_pwd)
engine = create_engine(db_uri)

Base = declarative_base()

class Document(Base):
    """
    A class used to represent a Document
    """

    __tablename__ = 'docs'

    id = mapped_column(Integer, primary_key=True, autoincrement=True)
    embedding = mapped_column(Vector(1536))
    line = mapped_column(String)
    meta = mapped_column(JSON)
    video_url = mapped_column(String)
    timestamp = mapped_column(Float)
    created_at = mapped_column(String, server_default=text('NOW()'))
    

# TODO: what does this line do?
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

video_url='https://www.youtube.com/watch?v=0vP5Knq1xhs'
documents = [
    dict(
        embedding=embedding['embedding'], 
        line=embedding['line'],
        meta={
            'line': embedding['line'], 
            'start_time': embedding['start_time']
        },
        video_url=video_url,
        timestamp=embedding['start_time']
    ) for embedding in embeddings
]

session = Session(engine)
session.execute(insert(Document), documents)
session.commit()