<a href="https://colab.research.google.com/github/rohit9mehta/video-search/blob/main/video_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Imports
!pip install transformers pytube sentence-transformers

# optional install pytorch so you can use a gpu for faster transcription
# command below is for Linux. See instructions for mac and windows: https://pytorch.org/get-started/locally/
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

!pip install git+https://github.com/openai/whisper.git -q
!apt install ffmpeg # https://stackoverflow.com/questions/51856340/how-to-install-package-ffmpeg-in-google-colab
!pip install pytubefix
!pip install tqdm

!pip install -U sentence-transformers pinecone-client

from typing import Dict

from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import whisper
import torch
from pytubefix import YouTube
import time
from getpass import getpass

PINECONE_API_KEY = getpass('Enter PINECONE_API_KEY')


from sentence_transformers import SentenceTransformer

import pinecone  # !pip install pinecone-client
# import os
from pinecone import Pinecone, ServerlessSpec

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ff

In [3]:
class EndpointHandler():
    def __init__(self, path=""):
        # load the model
        WHISPER_MODEL_NAME = "tiny.en"
        SENTENCE_TRANSFORMER_MODEL_NAME = "multi-qa-mpnet-base-dot-v1"

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')

        t0 = time.time()
        self.whisper_model = whisper.load_model(WHISPER_MODEL_NAME).to(device)
        t1 = time.time()

        total = t1 - t0
        print(f'Finished loading whisper_model in {total} seconds')

        t0 = time.time()
        self.sentence_transformer_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_NAME)
        t1 = time.time()

        total = t1 - t0
        print(f'Finished loading sentence_transformer_model in {total} seconds')

    def __call__(self, data: Dict[str, str]) -> Dict:
        """
        Args:
            data (:obj:):
                includes the URL to video for transcription
        Return:
            A :obj:`dict`:. transcribed dict
        """
        # process input
        print('data', data)

        # if "inputs" not in data:
        #     raise Exception(f"data is missing 'inputs' key which  EndpointHandler expects. Received: {data}"
        #                     f" See: https://huggingface.co/docs/inference-endpoints/guides/custom_handler#2-create-endpointhandler-cp")
        video_urls = data.pop("video_urls", None)
        query = data.pop("query", None)
        trying_live = data.pop("trying_live", None)
        encoded_segments = {}
        if video_urls:
          videos_with_transcript = [self.transcribe_video(video_url) for video_url in video_urls]
          encode_transcript = data.pop("encode_transcript", True)
          if encode_transcript:
              encoded_segments = self.combine_transcripts(videos_with_transcript)
              if trying_live:
                return {
                    "encoded_segments": encoded_segments
                }
              encoded_segments = {
                  "encoded_segments": self.encode_sentences(encoded_segments)
              }
          return {
              **videos_with_transcript,
              **encoded_segments
          }
        elif query:
            query = [{"text": query, "id": ""}]
            encoded_segments = self.encode_sentences(query)

            return {
                "encoded_segments": encoded_segments
            }

    def transcribe_video(self, video_url):
        decode_options = {
            # Set language to None to support multilingual,
            # but it will take longer to process while it detects the language.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the decoding language step
            "language": "en",
            "verbose": True
        }
        yt = YouTube(video_url)
        video_info = {
            'id': yt.video_id,
            'thumbnail': yt.thumbnail_url,
            'title': yt.title,
            'views': yt.views,
            'length': yt.length,
            # Althhough, this might seem redundant since we already have id
            # but it allows the link to the video be accessed in 1-click in the API response
            'url': f"https://www.youtube.com/watch?v={yt.video_id}"
        }
        stream = yt.streams.filter(only_audio=True)[0]
        path_to_audio = f"{yt.video_id}.mp3"
        stream.download(filename=path_to_audio)
        t0 = time.time()
        transcript = self.whisper_model.transcribe(path_to_audio, **decode_options)
        t1 = time.time()
        for segment in transcript['segments']:
            # Remove the tokens array, it makes the response too verbose
            segment.pop('tokens', None)

        total = t1 - t0
        print(f'Finished transcription in {total} seconds')

        # postprocess the prediction
        return {"transcript": transcript, 'video': video_info}

    def encode_sentences(self, transcripts, batch_size=64):
        """
        Encoding all of our segments at once or storing them locally would require too much compute or memory.
        So we do it in batches of 64
        :param transcripts:
        :param batch_size:
        :return:
        """
        # loop through in batches of 64
        all_batches = []
        for i in tqdm(range(0, len(transcripts), batch_size)):
            # find end position of batch (for when we hit end of data)
            i_end = min(len(transcripts), i + batch_size)
            # extract the metadata like text, start/end positions, etc
            batch_meta = [{
                **row
            } for row in transcripts[i:i_end]]
            # extract only text to be encoded by embedding model
            batch_text = [
                row['text'] for row in batch_meta
            ]
            # create the embedding vectors
            batch_vectors = self.sentence_transformer_model.encode(batch_text).tolist()

            batch_details = [
                {
                    **batch_meta[x],
                    'vectors':batch_vectors[x]
                } for x in range(0, len(batch_meta))
            ]
            all_batches.extend(batch_details)

        return all_batches

    @staticmethod
    def combine_transcripts(videos, window=6, stride=3):
        """

        :param video:
        :param window: number of sentences to combine
        :param stride: number of sentences to 'stride' over, used to create overlap
        :return:
        """
        new_transcript_segments = []

        for video in videos:
          video_info = video['video']
          transcript_segments = video['transcript']['segments']
          for i in tqdm(range(0, len(transcript_segments), stride)):
              i_end = min(len(transcript_segments), i + window)
              text = ' '.join(transcript['text']
                              for transcript in
                              transcript_segments[i:i_end])
              # TODO: Should int (float to seconds) conversion happen at the API level?
              start = int(transcript_segments[i]['start'])
              end = int(transcript_segments[i]['end'])
              new_transcript_segments.append({
                  **video_info,
                  **{
                      'start': start,
                      'end': end,
                      'title': video_info['title'],
                      'text': text,
                      'id': f"{video_info['id']}-t{start}",
                      'url': f"https://youtu.be/{video_info['id']}?t={start}",
                      'video_id': video_info['id'],
                  }
              })
        return new_transcript_segments

payload = {"video_urls": ["https://www.youtube.com/watch?v=w4CMaKF_IXI", "https://www.youtube.com/watch?v=PQtMTPhmQwM"], "trying_live": True} # I Tried Every Fast Food Chicken Tender In America


# # test the handler
my_handler = EndpointHandler(path="")
payload_pred=my_handler(payload)
payload_pred

model_id = "multi-qa-mpnet-base-dot-v1"

sentence_transformer_model = SentenceTransformer(model_id)
sentence_transformer_model

dimensions = sentence_transformer_model.get_sentence_embedding_dimension()

index_id = "youtube-search"

pc = Pinecone(
        api_key=PINECONE_API_KEY
    )

if index_id not in pc.list_indexes():
    pc.create_index(
        name=index_id,
        dimension=dimensions,
        metric="dotproduct",
        spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
      )
)

pinecone_index = pc.Index(index_id)
pinecone_index.describe_index_stats()

# we encode and insert in batches of 64
batch_size = 64

def upload_transcripts_to_vector_db(transcripts_for_upload):
  # loop through in batches of 64
  for i in tqdm(range(0, len(transcripts_for_upload), batch_size)):
      # find end position of batch (for when we hit end of data)
      i_end = min(len(transcripts_for_upload)-1, i+batch_size)
      # extract the metadata like text, start/end positions, etc
      batch_meta = [{
          **transcripts_for_upload[x]
      } for x in range(i, i_end)]
      # extract only text to be encoded by embedding model
      batch_text = [
          row['text'] for row in transcripts_for_upload[i:i_end]
      ]
      # create the embedding vectors
      batch_embeds = sentence_transformer_model.encode(batch_text).tolist()
      # extract IDs to be attached to each embedding and metadata
      batch_ids = [
          row['id'] for row in transcripts_for_upload[i:i_end]
      ]
      # 'upsert' (insert) IDs, embeddings, and metadata to index
      to_upsert = list(zip(
          batch_ids, batch_embeds, batch_meta
      ))
      pinecone_index.upsert(to_upsert)
      print(f'Uploaded Batches: {i} to {i_end}')
# print([len(x['vectors']) for x in payload_pred.get('encoded_segments')])
upload_transcripts_to_vector_db(payload_pred.get('encoded_segments'))

def query_model(query, video_ids=[]):
  encoded_query = sentence_transformer_model.encode(query).tolist()
  metadata_filter = { "video_id": {"$in": video_ids}} if video_ids else None
  # print(encoded_query)
  # print(metadata_filter)
  return pinecone_index.query(vector=encoded_query, top_k=5,
                              include_metadata=True,
                              filter=metadata_filter)

query_phrase = "maple syrup"
results = query_model(query_phrase, ["w4CMaKF_IXI", "PQtMTPhmQwM"])
query_phrase_2 = "crispy exterior"
results_2 = query_model(query_phrase_2, ["w4CMaKF_IXI", "PQtMTPhmQwM"])
# results = query_model(query_phrase)
results['matches'][0]

whisper will use: cpu


  checkpoint = torch.load(fp, map_location=device)


Finished loading whisper_model in 1.875312328338623 seconds
Finished loading sentence_transformer_model in 0.5872910022735596 seconds
data {'video_urls': ['https://www.youtube.com/watch?v=w4CMaKF_IXI', 'https://www.youtube.com/watch?v=PQtMTPhmQwM'], 'trying_live': True}




[00:00.000 --> 00:02.720]  Chicken tenders meteoric rise needs to be studied.
[00:02.720 --> 00:08.160]  They've become so popular that their entire restaurant concepts based around the chicken tender.
[00:08.160 --> 00:10.960]  In this sea of options, which chicken tender is king?
[00:10.960 --> 00:15.280]  We're tasting and raking every major fast food chicken tender in America, starting with
[00:15.280 --> 00:18.640]  Zach's piece. Can I just get a 10 piece chicken finger? Thank you.
[00:18.640 --> 00:20.880]  Just for the record, we're not doing any sauces.
[00:20.880 --> 00:24.240]  I know people are like, oh, you should compare sauces, but it wouldn't be a fair competition
[00:24.240 --> 00:27.440]  between who makes the best tender. Should we do it every fast food sauce?
[00:27.440 --> 00:30.800]  I don't want to. Remember, in order for Russia to qualify, number one,
[00:30.800 --> 00:35.360]  they must have a minimum of 300 locations across America, and number two, they have to



[00:00.000 --> 00:04.080]  You know, it's funny, their whole brand is pancakes and yet I just don't think they get it right.
[00:04.080 --> 00:07.280]  It's good, but how often are you sober when you go?
[00:11.920 --> 00:16.720]  I get it, they're pancakes. You can't really screw them up that bad, right? Flashback to iHOB, what's that?
[00:16.720 --> 00:18.320]  International House of Burgers, why?
[00:18.320 --> 00:18.880]  Mm.
[00:18.880 --> 00:23.040]  iHOB has this unusually vast menu that's completely unnecessary because the whole concept is pancakes
[00:23.040 --> 00:26.400]  and people only go there for pancakes. I don't know how they sustain their business model, but whatever.
[00:26.400 --> 00:30.720]  We're here to destabilize the pancake world. So let's make this, shall we?
[00:30.720 --> 00:32.320]  The new kitchen's almost done, let's go look.
[00:35.360 --> 00:38.160]  What do you think you're doing? I can't show you that, but I can show you this.
[00:38.160 --> 00:41.60

100%|██████████| 83/83 [00:00<00:00, 60240.05it/s]
100%|██████████| 108/108 [00:00<00:00, 113729.56it/s]
 33%|███▎      | 1/3 [00:38<01:17, 38.96s/it]

Uploaded Batches: 0 to 64


 67%|██████▋   | 2/3 [01:06<00:32, 32.01s/it]

Uploaded Batches: 64 to 128


100%|██████████| 3/3 [01:22<00:00, 27.61s/it]

Uploaded Batches: 128 to 190





{'id': 'PQtMTPhmQwM-t288',
 'metadata': {'end': 289.0,
              'id': 'PQtMTPhmQwM-t288',
              'length': 699.0,
              'start': 288.0,
              'text': " Of course maple syrup.  Good lord.  And let's face our "
                      "first opponent.  See when it's real butter it wants to "
                      'fall.  This is going nowhere.  These are very tall in '
                      'comparison.',
              'thumbnail': 'https://i.ytimg.com/vi/PQtMTPhmQwM/sddefault.jpg',
              'title': 'Making IHOP Pancakes at Home | But Better',
              'url': 'https://youtu.be/PQtMTPhmQwM?t=288',
              'video_id': 'PQtMTPhmQwM',
              'views': 1817480.0},
 'score': 27.112524,
 'values': []}

In [4]:
results_2['matches'][0]

{'id': 'w4CMaKF_IXI-t44',
 'metadata': {'end': 48.0,
              'id': 'w4CMaKF_IXI-t44',
              'length': 1159.0,
              'start': 44.0,
              'text': " Oh, that's on the cusp. I'm very confused by this. I'm "
                      "like in between.  It's fresh cooked, so it's nice and "
                      "hot, which I do like. It's juicy. It's not overcooked,  "
                      "but the chicken's just stringy. If you feel like they "
                      "didn't let it thaw completely or something  when they "
                      "fried it. Should be tender. It's really not. Salt level "
                      'is about 10% over what it should be.  Not enough '
                      "breading. It's crisp, but not as crisp as I like it to "
                      'be. Anyway, this is a six and a half.  Six point seven. '
                      "Okay, not bad Zach's piece. Above average, moving on. "
                      "Zach's piece with our first",
