In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import os
import networkx as nx
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
import tensorflow as tf
from IPython.display import YouTubeVideo
plt.style.use('ggplot')

2024-03-21 18:07:56.399135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/video/"
# Inspect data directory
print(os.listdir(DATA_PATH))

# Explore video-level data
video_lvl_data_path = DATA_PATH + "train0274.tfrecord"
raw_dataset = tf.data.TFRecordDataset(video_lvl_data_path)


vid_ids = []
labels = []
rgb = []
audio = []

for example in tf.compat.v1.python_io.tf_record_iterator(video_lvl_data_path):
    seq_example = tf.train.Example.FromString(example)
    vid_ids.append(seq_example.features.feature['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    labels.append(seq_example.features.feature['labels'].int64_list.value)
    rgb.append(seq_example.features.feature['mean_rgb'].float_list.value)
    audio.append(seq_example.features.feature['mean_audio'].float_list.value)

print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
print('Picking a youtube video id:', vid_ids[1])

['2_video_train_download_plan.json', 'train0434.tfrecord', 'train0274.tfrecord', 'train0208.tfrecord', 'train0580.tfrecord', 'train2161.tfrecord']
Number of videos in this tfrecord:  1045
Number of labels in this tfrecord:  1045
Number of mean RGB in this tfrecord:  1045
Number of mean audio in this tfrecord:  1045
Picking a youtube video id: zseA


In [6]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/frame/"
# Inspect frame-level data directory
print(os.listdir(DATA_PATH))

import tensorflow as tf

# Explore frame-level data
frame_lvl_data_path = DATA_PATH + "train0274.tfrecord"
raw_dataset = tf.data.TFRecordDataset(frame_lvl_data_path)

vid_ids = []
labels = []
rgb = []
audio = []

for example in tf.compat.v1.python_io.tf_record_iterator(frame_lvl_data_path):
    seq_example = tf.train.SequenceExample.FromString(example)
    vid_ids.append(seq_example.context.feature['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    labels.append(seq_example.context.feature['labels'].int64_list.value)

    # Decode RGB features
    rgb_encoded = seq_example.feature_lists.feature_list['rgb'].feature[0].bytes_list.value[0]
    rgb_decoded = tf.io.decode_raw(rgb_encoded, tf.float32)
    rgb.append(rgb_decoded.numpy().tolist())

    # Decode audio features
    audio_encoded = seq_example.feature_lists.feature_list['audio'].feature[0].bytes_list.value[0]
    audio_decoded = tf.io.decode_raw(audio_encoded, tf.float32)
    audio.append(audio_decoded.numpy().tolist())

print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
print('Picking a youtube video id:', vid_ids[1])

['2_frame_train_download_plan.json', 'train0274.tfrecord', 'train0580.tfrecord', 'train2161.tfrecord']
Number of videos in this tfrecord:  1045
Number of labels in this tfrecord:  1045
Number of mean RGB in this tfrecord:  1045
Number of mean audio in this tfrecord:  1045
Picking a youtube video id: WjeA


In [7]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/segment/"

# Inspect segment-level data directory
print(os.listdir(DATA_PATH))

# Explore segment-level data
segment_lvl_data_path = DATA_PATH + "validate3489.tfrecord"
raw_dataset = tf.data.TFRecordDataset(segment_lvl_data_path)

vid_ids = []
labels = []
rgb = []
audio = []
segment_start_times = []
segment_end_times = []
segment_scores = []
segment_labels = []


for example in tf.compat.v1.python_io.tf_record_iterator(segment_lvl_data_path):
    # Context features
    example_example = tf.train.Example.FromString(example)
    
    vid_ids.append(example_example.features.feature['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    labels.append(example_example.features.feature['labels'].int64_list.value)

    # Segment labels & scores
    segment_start_times.append(example_example.features.feature['segment_start_times'].int64_list.value)
    segment_end_times.append(example_example.features.feature['segment_end_times'].int64_list.value)
    segment_labels.append(example_example.features.feature['segment_labels'].int64_list.value)

    # Frame features
    seq_example = tf.train.SequenceExample.FromString(example)

    # Decode RGB features
    rgb_encoded = seq_example.feature_lists.feature_list['rgb'].feature[0].bytes_list.value[0]
    rgb_decoded = tf.io.decode_raw(rgb_encoded, tf.uint8)
    rgb.append(rgb_decoded.numpy().tolist())

    # Decode audio features
    audio_encoded = seq_example.feature_lists.feature_list['audio'].feature[0].bytes_list.value[0]
    audio_decoded = tf.io.decode_raw(audio_encoded, tf.uint8)
    audio.append(audio_decoded.numpy().tolist())


print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
print('Picking a youtube video id:', vid_ids[1])


['validate3489.tfrecord', '3_frame_validate_download_plan.json']
Number of videos in this tfrecord:  8
Number of labels in this tfrecord:  8
Number of mean RGB in this tfrecord:  8
Number of mean audio in this tfrecord:  8
Picking a youtube video id: Xd4r


In [8]:
#rgb: 8 x 1024
#audio: 8 x 128
len(audio[0])

128

In [9]:
# Translate video ID to Youtube URL
import requests

def translate_video_id(encoded_id):
    # Construct the URL to retrieve the mapping
    url = f"http://data.yt8m.org/2/j/i/{encoded_id[:2]}/{encoded_id}.js"
    response = requests.get(url)

    # Check if the response is successful and has the expected format
    if response.status_code == 200 and 'i("' in response.text:
        # Extract the standard YouTube video ID from the response
        video_id = response.text.split('"')[3]
        return video_id
    else:
        # Log an error or handle it as appropriate
        print(f"Error retrieving video ID for {encoded_id}: {response.text}")
        return '200'

youtube_urls = []
for encoded_id in vid_ids:
    video_id = translate_video_id(encoded_id)
    if video_id:
        youtube_url = f"https://www.youtube.com/watch?v={video_id}"
        youtube_urls.append(youtube_url)

# Print or use the YouTube URLs
for url in youtube_urls:
    print(url)


Error retrieving video ID for Xw4r: <?xml version='1.0' encoding='UTF-8'?><Error><Code>AccessDenied</Code><Message>Access denied.</Message></Error>
Error retrieving video ID for En4r: <?xml version='1.0' encoding='UTF-8'?><Error><Code>AccessDenied</Code><Message>Access denied.</Message></Error>
https://www.youtube.com/watch?v=KYVAv4gq00k
https://www.youtube.com/watch?v=Ph-dQZnHChc
https://www.youtube.com/watch?v=jGve-X-fJug
https://www.youtube.com/watch?v=GnbzWaaIGRY
https://www.youtube.com/watch?v=200
https://www.youtube.com/watch?v=200
https://www.youtube.com/watch?v=DTQpR-AqKnM
https://www.youtube.com/watch?v=tTfptynuWdA


In [11]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/"
# Read in vocabulary for human-rated mapping of numeric label to text label for the video
vocab = pd.read_csv(DATA_PATH + 'vocabulary.csv')

# Create a pandas dataframe to store the segment-rated frame-level contextual data and embeddings
df = pd.DataFrame()

for i, encoded_id in enumerate(vid_ids):
    video_id = translate_video_id(encoded_id)
    if video_id:
        youtube_url = f"https://www.youtube.com/watch?v={video_id}"
    
    text_labels = [vocab[vocab['Index'] == label]['Name'].values[0] for label in labels[i]]
    wiki_descriptions = [vocab[vocab['Index'] == label]['WikiDescription'].values[0] for label in labels[i]]
    text_segment_labels = [vocab[vocab['Index'] == label]['Name'].values[0] for label in segment_labels[i]]
    df = pd.concat([df, pd.DataFrame([{'labels': labels[i],
                    'text_labels': text_labels,
                    'url': youtube_url,
                    'WikiDescription': wiki_descriptions,
                    'text_segment_labels': text_segment_labels,
                    'segment_labels': segment_labels[i],
                    'segment_start_times': segment_start_times[i],
                    'segment_end_times': segment_end_times[i],
                    'mean_rgb': rgb[i],
                    'mean_audio': audio[i]}])],
                    ignore_index=True)

# Display the DataFrame
print(df)

Error retrieving video ID for Xw4r: <?xml version='1.0' encoding='UTF-8'?><Error><Code>AccessDenied</Code><Message>Access denied.</Message></Error>
Error retrieving video ID for En4r: <?xml version='1.0' encoding='UTF-8'?><Error><Code>AccessDenied</Code><Message>Access denied.</Message></Error>
                   labels                                   text_labels  \
0       [2, 76, 281, 446]             [Vehicle, Boat, Motorboat, Yacht]   
1                   [191]                                       [Hotel]   
2  [11, 1366, 1684, 1872]    [Food, Grocery store, Supermarket, Retail]   
3        [2, 7, 116, 235]    [Vehicle, Car, Sport utility vehicle, nan]   
4             [461, 1325]                    [Printer (computing), Ink]   
5                   [607]                                  [Board game]   
6       [0, 1, 234, 1303]  [Game, Video game, Super Smash Bros., Yoshi]   
7                [17, 19]                          [Motorsport, Racing]   

                            

In [12]:
df.head(5)

Unnamed: 0,labels,text_labels,url,WikiDescription,text_segment_labels,segment_labels,segment_start_times,segment_end_times,mean_rgb,mean_audio
0,"[2, 76, 281, 446]","[Vehicle, Boat, Motorboat, Yacht]",https://www.youtube.com/watch?v=KYVAv4gq00k,[A vehicle is a mobile machine that transports...,"[Luxury yacht, Luxury yacht, Luxury yacht, Lux...","[1314, 1314, 1314, 1314, 1314]","[50, 165, 230, 115, 210]","[55, 170, 235, 120, 215]","[0, 75, 148, 37, 0, 0, 74, 28, 194, 118, 230, ...","[175, 43, 116, 109, 154, 93, 75, 198, 172, 171..."
1,[191],[Hotel],https://www.youtube.com/watch?v=Ph-dQZnHChc,[A hotel is an establishment that provides pai...,"[Hotel, Hotel, Hotel]","[191, 191, 191]","[200, 255, 230]","[205, 260, 235]","[89, 195, 86, 213, 96, 100, 151, 183, 103, 151...","[163, 55, 116, 98, 91, 183, 151, 157, 177, 117..."
2,"[11, 1366, 1684, 1872]","[Food, Grocery store, Supermarket, Retail]",https://www.youtube.com/watch?v=jGve-X-fJug,[Food is any substance consumed to provide nut...,"[Grocery store, Grocery store, Grocery store, ...","[1366, 1366, 1366, 1366, 1366]","[85, 220, 230, 125, 45]","[90, 225, 235, 130, 50]","[0, 72, 173, 30, 0, 0, 40, 33, 199, 132, 199, ...","[175, 47, 132, 85, 169, 168, 156, 167, 157, 13..."
3,"[2, 7, 116, 235]","[Vehicle, Car, Sport utility vehicle, nan]",https://www.youtube.com/watch?v=GnbzWaaIGRY,[A vehicle is a mobile machine that transports...,"[Sport utility vehicle, Sport utility vehicle,...","[116, 116, 116, 235, 116, 235, 235, 116, 235, ...","[265, 220, 65, 180, 105, 230, 105, 85, 135, 165]","[270, 225, 70, 185, 110, 235, 110, 90, 140, 170]","[95, 203, 71, 171, 124, 165, 110, 172, 123, 17...","[157, 37, 131, 92, 127, 93, 194, 217, 140, 103..."
4,"[461, 1325]","[Printer (computing), Ink]",https://www.youtube.com/watch?v=200,"[In computing, a printer is a peripheral which...","[Ink, Ink, Ink, Ink, Ink]","[1325, 1325, 1325, 1325, 1325]","[50, 45, 85, 35, 110]","[55, 50, 90, 40, 115]","[86, 33, 92, 131, 206, 63, 92, 178, 43, 139, 5...","[133, 86, 84, 137, 155, 55, 95, 83, 153, 123, ..."


#### Build a generic RAG system

In [13]:
#!sudo apt-get install build-essential libssl-dev python3-dev
#!python -m pip install --upgrade pip setuptools wheel
#!python -m pip install grpcio

#!python -m pip install chromadb

#!python --version

In [14]:
df.head(10)

Unnamed: 0,labels,text_labels,url,WikiDescription,text_segment_labels,segment_labels,segment_start_times,segment_end_times,mean_rgb,mean_audio
0,"[2, 76, 281, 446]","[Vehicle, Boat, Motorboat, Yacht]",https://www.youtube.com/watch?v=KYVAv4gq00k,[A vehicle is a mobile machine that transports...,"[Luxury yacht, Luxury yacht, Luxury yacht, Lux...","[1314, 1314, 1314, 1314, 1314]","[50, 165, 230, 115, 210]","[55, 170, 235, 120, 215]","[0, 75, 148, 37, 0, 0, 74, 28, 194, 118, 230, ...","[175, 43, 116, 109, 154, 93, 75, 198, 172, 171..."
1,[191],[Hotel],https://www.youtube.com/watch?v=Ph-dQZnHChc,[A hotel is an establishment that provides pai...,"[Hotel, Hotel, Hotel]","[191, 191, 191]","[200, 255, 230]","[205, 260, 235]","[89, 195, 86, 213, 96, 100, 151, 183, 103, 151...","[163, 55, 116, 98, 91, 183, 151, 157, 177, 117..."
2,"[11, 1366, 1684, 1872]","[Food, Grocery store, Supermarket, Retail]",https://www.youtube.com/watch?v=jGve-X-fJug,[Food is any substance consumed to provide nut...,"[Grocery store, Grocery store, Grocery store, ...","[1366, 1366, 1366, 1366, 1366]","[85, 220, 230, 125, 45]","[90, 225, 235, 130, 50]","[0, 72, 173, 30, 0, 0, 40, 33, 199, 132, 199, ...","[175, 47, 132, 85, 169, 168, 156, 167, 157, 13..."
3,"[2, 7, 116, 235]","[Vehicle, Car, Sport utility vehicle, nan]",https://www.youtube.com/watch?v=GnbzWaaIGRY,[A vehicle is a mobile machine that transports...,"[Sport utility vehicle, Sport utility vehicle,...","[116, 116, 116, 235, 116, 235, 235, 116, 235, ...","[265, 220, 65, 180, 105, 230, 105, 85, 135, 165]","[270, 225, 70, 185, 110, 235, 110, 90, 140, 170]","[95, 203, 71, 171, 124, 165, 110, 172, 123, 17...","[157, 37, 131, 92, 127, 93, 194, 217, 140, 103..."
4,"[461, 1325]","[Printer (computing), Ink]",https://www.youtube.com/watch?v=200,"[In computing, a printer is a peripheral which...","[Ink, Ink, Ink, Ink, Ink]","[1325, 1325, 1325, 1325, 1325]","[50, 45, 85, 35, 110]","[55, 50, 90, 40, 115]","[86, 33, 92, 131, 206, 63, 92, 178, 43, 139, 5...","[133, 86, 84, 137, 155, 55, 95, 83, 153, 123, ..."
5,[607],[Board game],https://www.youtube.com/watch?v=200,[A board game is a tabletop game that involves...,"[Board game, Board game, Board game, Board gam...","[607, 607, 607, 607, 607]","[95, 215, 140, 60, 145]","[100, 220, 145, 65, 150]","[116, 161, 174, 145, 99, 88, 64, 117, 147, 104...","[105, 70, 109, 126, 116, 29, 122, 114, 181, 18..."
6,"[0, 1, 234, 1303]","[Game, Video game, Super Smash Bros., Yoshi]",https://www.youtube.com/watch?v=DTQpR-AqKnM,"[A game is structured form of play, usually un...","[Yoshi, Yoshi, Yoshi, Yoshi, Yoshi]","[1303, 1303, 1303, 1303, 1303]","[185, 45, 190, 105, 195]","[190, 50, 195, 110, 200]","[181, 16, 82, 162, 190, 255, 88, 28, 202, 213,...","[156, 218, 194, 148, 121, 42, 8, 73, 68, 35, 9..."
7,"[17, 19]","[Motorsport, Racing]",https://www.youtube.com/watch?v=tTfptynuWdA,[Motorsport or motorsports is a global term us...,"[Running, Running, Running, Running, Running]","[210, 210, 210, 210, 210]","[40, 65, 80, 30, 55]","[45, 70, 85, 35, 60]","[120, 193, 70, 224, 104, 170, 113, 125, 118, 1...","[65, 70, 167, 165, 166, 107, 173, 41, 113, 238..."


In [15]:
import chromadb

# Initialize the ChromaDB client
chroma_client = chromadb.Client()

#Create a collections to store your embeddings, documents, and any additional metadata. 
collection = chroma_client.get_or_create_collection(name='rgb_collection')

# Store embeddings in ChromaDB
for i, row in df.iterrows():
    # Create a unique ID for each embedding
    id = f'{i}'

    # Get info of each row
    metadatas_dict = {
    #'labels': row['labels'],
    #'text_labels': row['text_labels'],
    #'segment_labels': row['segment_labels'],
    #'text_segment_labels': row['text_segment_labels'],
    #'segment_start_times': row['segment_start_times'],
    #'segment_end_times': row['segment_end_times'],
    'url': row['url'],
    'wiki': row['WikiDescription'][0]}

    # Store the embedding in ChromaDB
    collection.add(
        embeddings=row['mean_rgb'],
        metadatas=metadatas_dict,
        ids=id
    )

print('Embeddings stored in ChromaDB successfully.')


Embeddings stored in ChromaDB successfully.


In [17]:
# get items from a collection
collection.peek()



{'ids': ['0', '1', '2', '3', '4', '5', '6', '7'],
 'embeddings': [[0.0,
   75.0,
   148.0,
   37.0,
   0.0,
   0.0,
   74.0,
   28.0,
   194.0,
   118.0,
   230.0,
   85.0,
   58.0,
   164.0,
   155.0,
   195.0,
   70.0,
   99.0,
   63.0,
   247.0,
   106.0,
   225.0,
   163.0,
   130.0,
   110.0,
   7.0,
   239.0,
   131.0,
   51.0,
   140.0,
   128.0,
   194.0,
   78.0,
   72.0,
   115.0,
   115.0,
   112.0,
   214.0,
   123.0,
   81.0,
   109.0,
   146.0,
   102.0,
   187.0,
   155.0,
   125.0,
   159.0,
   130.0,
   164.0,
   84.0,
   165.0,
   112.0,
   106.0,
   96.0,
   93.0,
   108.0,
   121.0,
   120.0,
   88.0,
   117.0,
   99.0,
   112.0,
   158.0,
   114.0,
   118.0,
   149.0,
   178.0,
   157.0,
   119.0,
   94.0,
   149.0,
   140.0,
   137.0,
   112.0,
   98.0,
   81.0,
   159.0,
   141.0,
   120.0,
   112.0,
   131.0,
   120.0,
   175.0,
   70.0,
   112.0,
   112.0,
   145.0,
   159.0,
   79.0,
   144.0,
   205.0,
   93.0,
   168.0,
   137.0,
   150.0,
   134.0,
   142.0

In [18]:
# do nearest neighbor search to find similar embeddings or documents, supports filtering
collection.query(
    query_embeddings=rgb[3],
    n_results=2
    #where={"style": "style2"}
)

{'ids': [['3', '2']],
 'distances': [[0.0, 3891342.0]],
 'metadatas': [[{'url': 'https://www.youtube.com/watch?v=GnbzWaaIGRY',
    'wiki': 'A vehicle is a mobile machine that transports people or cargo. Typical vehicles include wagons, bicycles, motor vehicles, railed vehicles, watercraft, aircraft and spacecraft. Land vehicles are classified broadly by what is used to apply steering and drive forces against the ground: wheeled, tracked, railed or skied. ISO 3833-1977 is the standard, also internationally used in legislation, for road vehicles types, terms and definitions.'},
   {'url': 'https://www.youtube.com/watch?v=jGve-X-fJug',
    'wiki': "Food is any substance consumed to provide nutritional support for the body. It is usually of plant or animal origin, and contains essential nutrients, such as carbohydrates, fats, proteins, vitamins, or minerals. The substance is ingested by an organism and assimilated by the organism's cells to provide energy, maintain life, or stimulate growt

In [19]:
df.iloc[6]

labels                                                 [0, 1, 234, 1303]
text_labels                 [Game, Video game, Super Smash Bros., Yoshi]
url                          https://www.youtube.com/watch?v=DTQpR-AqKnM
WikiDescription        [A game is structured form of play, usually un...
text_segment_labels                  [Yoshi, Yoshi, Yoshi, Yoshi, Yoshi]
segment_labels                            [1303, 1303, 1303, 1303, 1303]
segment_start_times                             [185, 45, 190, 105, 195]
segment_end_times                               [190, 50, 195, 110, 200]
mean_rgb               [181, 16, 82, 162, 190, 255, 88, 28, 202, 213,...
mean_audio             [156, 218, 194, 148, 121, 42, 8, 73, 68, 35, 9...
Name: 6, dtype: object

In [20]:
df.iloc[2]

labels                                            [11, 1366, 1684, 1872]
text_labels                   [Food, Grocery store, Supermarket, Retail]
url                          https://www.youtube.com/watch?v=jGve-X-fJug
WikiDescription        [Food is any substance consumed to provide nut...
text_segment_labels    [Grocery store, Grocery store, Grocery store, ...
segment_labels                            [1366, 1366, 1366, 1366, 1366]
segment_start_times                              [85, 220, 230, 125, 45]
segment_end_times                                [90, 225, 235, 130, 50]
mean_rgb               [0, 72, 173, 30, 0, 0, 40, 33, 199, 132, 199, ...
mean_audio             [175, 47, 132, 85, 169, 168, 156, 167, 157, 13...
Name: 2, dtype: object

In [21]:
#!pip install youtube_transcript_api

In [22]:
youtube_urls

# https://www.youtube.com/watch?v=h06wB5JlyNg


['https://www.youtube.com/watch?v=KYVAv4gq00k',
 'https://www.youtube.com/watch?v=Ph-dQZnHChc',
 'https://www.youtube.com/watch?v=jGve-X-fJug',
 'https://www.youtube.com/watch?v=GnbzWaaIGRY',
 'https://www.youtube.com/watch?v=200',
 'https://www.youtube.com/watch?v=200',
 'https://www.youtube.com/watch?v=DTQpR-AqKnM',
 'https://www.youtube.com/watch?v=tTfptynuWdA']

In [1]:
#!pip install google-api-python-client
#!pip install pytube



#### Pull Videos From Lowes Youtube Channel

In [None]:
from googleapiclient.discovery import build
from pytube import YouTube
import os
from youtube_transcript_api import YouTubeTranscriptApi

# Set up the YouTube Data API
api_key = 'AIzaSyBmGeetqQFiMoDgS4d1Xe7JCTi23yLU2d8'

def get_video_title(video_id, api_key):
    youtube = build('youtube', 'v3', developerKey=api_key)
    request = youtube.videos().list(part='snippet', id=video_id)
    response = request.execute()
    if response['items']:
        return response['items'][0]['snippet']['title']
    else:
        return ''
    
def get_video_transcript(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_segments = [
            {
                'start_time': segment['start'],
                'duration': segment['duration'],
                'transcript_text': segment['text']
            }
            for segment in transcript_list
        ]
        return transcript_segments
    except Exception as e:
        print(f"Error extracting transcript for video {video_id}: {e}")
        return []

In [None]:
api_key = 'AIzaSyBmGeetqQFiMoDgS4d1Xe7JCTi23yLU2d8'
youtube = build('youtube', 'v3', developerKey=api_key)

In [8]:
# Get youtube videos from a given channel
# Get the channel ID (replace 'CHANNEL_USERNAME' with the actual username)
channel_response = youtube.channels().list(forUsername='lowes', part='contentDetails').execute()
channel_id = channel_response['items'][0]['id']

# Get the playlist ID for the channel's uploads
uploads_playlist_id = channel_response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

# Get the list of video IDs from the uploads playlist
video_ids = []
next_page_token = None
while True:
    playlist_response = youtube.playlistItems().list(playlistId=uploads_playlist_id, part='contentDetails', maxResults=50, pageToken=next_page_token).execute()
    video_ids.extend([item['contentDetails']['videoId'] for item in playlist_response['items']])
    next_page_token = playlist_response.get('nextPageToken')
    if not next_page_token:
        break

# Download the videos using pytube
# download_path = '/Users/runzhu/Desktop/rag/data'
download_path = '/Users/danielcohen/Desktop/RAG/data'
# youtube_url = f"https://www.youtube.com/watch?v={video_id}"
video_info = {}

# Loop through each video ID and extract the transcript
n = 0
max_n = 100
for video_id in video_ids[0:max_n-1]:
    print(video_id)
    title = get_video_title(video_id, api_key)
    transcript = get_video_transcript(video_id)
    i = {'video_title': title,
         'video_transcript': transcript}

    video_info[video_id] = i 

hc8ph34dDi4
62gTbT_a-bA
9erpMbCs1mA
D9AYFnP8-Kc
nHMFNj3i5j0
bsOH6-bWMuU
cdY-4gH5H-Q
spcWcQjO-MM
r92Hyv7T4HM
0ON22Q_Eybo
qbGkLPImZZY
9EFo6EKtYUo
pQ3rDz4BF_Q
uqzeCY-rr4o
n73MICun1Dc
M2PI8PHZw94
sU4Aoo9GShY
nn7JjGM_hsE
nsvVbbYIEZw
g4f3XzxskEE
Bhb9Z3-z29g
tF4NF5QdLn0
nlYy9STQiZ8
dUQwgwKFYtk
CyQZe9W5oOc
s1j7PZgZKkY
GiF6Xhfp4v8
t8AjY8yZ90k
2u3HFJ1xzvk
vVgjJuw8CF8
HoUsyGjMD38
K69HPQHg1ZE
CPey-f0HP4o
DD113dCPqmg
FDg183RHzzM
Error extracting transcript for video FDg183RHzzM: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=FDg183RHzzM! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open 

In [9]:
video_info

{'hc8ph34dDi4': {'video_title': 'How to Plant Container Pots | How To Anything',
  'video_transcript': [{'start_time': 0.0,
    'duration': 3.49,
    'transcript_text': '[Music]'},
   {'start_time': 7.799,
    'duration': 4.041,
    'transcript_text': "hi everyone I'm Jackie and you're"},
   {'start_time': 9.599,
    'duration': 3.92,
    'transcript_text': "watching Lowe's how to anything it's"},
   {'start_time': 11.84,
    'duration': 3.999,
    'transcript_text': 'springtime so that could only mean one'},
   {'start_time': 13.519,
    'duration': 4.801,
    'transcript_text': 'thing it is the perfect time to plant'},
   {'start_time': 15.839,
    'duration': 4.241,
    'transcript_text': 'new plants more specifically plant them'},
   {'start_time': 18.32,
    'duration': 3.16,
    'transcript_text': "in a container pot so I'm going to walk"},
   {'start_time': 20.08,
    'duration': 2.8,
    'transcript_text': 'you through that process today and not'},
   {'start_time': 21.48,
    

In [10]:
VIDEO_ID = 'cdY-4gH5H-Q'
# Print the resulting dictionary
print(video_info[VIDEO_ID])

{'video_title': 'How To Cut Tile #diy #kitchenremodel #howtotile', 'video_transcript': [{'start_time': 0.88, 'duration': 3.64, 'transcript_text': 'build your DIY confidence and learn how'}, {'start_time': 2.679, 'duration': 4.321, 'transcript_text': 'to cut'}, {'start_time': 4.52, 'duration': 3.8, 'transcript_text': "tiles if you're tiling a floor or wall"}, {'start_time': 7.0, 'duration': 4.36, 'transcript_text': "you'll probably need to cut some tiles"}, {'start_time': 8.32, 'duration': 5.279, 'transcript_text': 'to fit the space and a wet tile saw is'}, {'start_time': 11.36, 'duration': 4.12, 'transcript_text': 'perfect for the job it uses the water to'}, {'start_time': 13.599, 'duration': 4.081, 'transcript_text': "cool down the tile as it's being cut so"}, {'start_time': 15.48, 'duration': 4.84, 'transcript_text': "it doesn't break don't worry if you've"}, {'start_time': 17.68, 'duration': 4.2, 'transcript_text': "never used one before it's safe and easy"}, {'start_time': 20.32, '

In [13]:
# Download the youtube video to local
#download_path = '/Users/runzhou/Desktop/rag/data'
download_path = '/Users/danielcohen/Desktop/RAG/data'

#made this a for loop to make choosing specific videos easier
VIDEO_IDS = ['9EFo6EKtYUo',  '9tvPeP6O-6M', 'BPhdUVH7NzE','CPey-f0HP4o','CfDwDFkmVQQ'] #Video id #11- 15
for VIDEO_ID in VIDEO_IDS:
    video_url = f'https://www.youtube.com/watch?v=' + VIDEO_ID
    yt = YouTube(video_url)
    stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    if stream:
        # Download the video and get the file path
        file_path = stream.download(download_path)
        print(f'Downloaded: {yt.title}')
        print(f'File path: {file_path}')
    else:
        print(f'Could not download: {yt.title}')

Downloaded: Junk Drawer No More
File path: /Users/danielcohen/Desktop/RAG/data/Junk Drawer No More.mp4
Downloaded: Power Outage? Never Heard Of It.
File path: /Users/danielcohen/Desktop/RAG/data/Power Outage Never Heard Of It.mp4
Downloaded: How to Replace Faucets and Fix Drainage Issues | DIY-U by Lowe's
File path: /Users/danielcohen/Desktop/RAG/data/How to Replace Faucets and Fix Drainage Issues  DIY-U by Lowes.mp4
Downloaded: Deck Your Hallway With This Holiday Hack
File path: /Users/danielcohen/Desktop/RAG/data/Deck Your Hallway With This Holiday Hack.mp4
Downloaded: How to Sharpen, Balance and Change Lawn Mower Blades | How To Anything
File path: /Users/danielcohen/Desktop/RAG/data/How to Sharpen Balance and Change Lawn Mower Blades  How To Anything.mp4


#### After downloading data, now analyze each frame:

In [14]:
#!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Downloading opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl (35.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.4/35.4 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.9.0.80


In [17]:
# Get frames of a given video
import cv2
# Read the video from specified path
file_path = '/Users/danielcohen/Desktop/rag/data/".mp4'
cam = cv2.VideoCapture(file_path)

# frame
currentframe = 0
while True:
    # reading from frame
    ret, frame = cam.read()
    if ret:
        # if video is still left continue creating images
        #name = '/Users/runzhou/Desktop/rag/data/video_frame/frame_' + str(currentframe) + '.jpg'
        name = '/Users/danielcohen/Desktop/RAG/data/video_frame/frame_' + str(currentframe) + '.jpg'
        print ('Creating...' + name)
        # writing the extracted images
        cv2.imwrite(name, frame)
        # increasing counter so that it will
        # show how many frames are created
        currentframe += 1
    else:
        break
# Release all space and windows once done
cam.release()
cv2.destroyAllWindows()

Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_0.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_1.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_2.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_3.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_4.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_5.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_6.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_7.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_8.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_9.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_10.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_11.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_12.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame/frame_13.jpg
Cr

In [18]:
# get all start time in the video_info
VIDEO_ID = VIDEO_IDS[0]
start_times = [item['start_time'] for item in video_info[VIDEO_ID]['video_transcript']]
print(start_times)

[0.199, 1.76, 4.16, 5.96, 7.56, 9.28, 10.8, 12.2, 14.759, 16.16, 18.92]


In [19]:
import os
# Get the captured frame corresponding to the segment start times
# Read the video from specified path
cam = cv2.VideoCapture(file_path)

#Set the directory path for video frame start times
#directory = '/Users/runzhou/Desktop/rag/data/video_frame_starttime'
directory = '/Users/danielcohen/Desktop/RAG/data/video_frame_starttime'

# Check if the directory exists
if not os.path.exists(directory):
    # Create the directory
    os.makedirs(directory)
    print(f'Directory created: {directory}')
else:
    print(f'Directory already exists: {directory}')

# FPS
fps = cam.get(cv2.CAP_PROP_FPS)
captured_frames = []
for target_time in start_times:
    # set the frame point to capture from
    cam.set(cv2.CAP_PROP_POS_MSEC, target_time * 1000)

    # read the frame
    ret, frame = cam.read()

    # save the captured frame
    if ret:
        #name = '/Users/runzhou/Desktop/rag/data/video_frame_starttime/frame_at_' + str(target_time) + '.jpg'
        name = '/Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_' + str(target_time) + '.jpg'
        captured_frames.append(name)
        print ('Creating...' + name)
        # writing the extracted images
        cv2.imwrite(name, frame)
    else:
        print("Error: Failed to read frame at {} seconds.".format(target_time))

# Release all space and windows once done
cam.release()
cv2.destroyAllWindows()

Directory already exists: /Users/danielcohen/Desktop/RAG/data/video_frame_starttime
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_0.199.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_1.76.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_4.16.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_5.96.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_7.56.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_9.28.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_10.8.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_12.2.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_14.759.jpg
Creating.../Users/danielcohen/Desktop/RAG/data/video_frame_starttime/frame_at_16.16.jpg
Creating.../Users/danielcohen/Desktop/RAG/

## Eden AI

In [20]:
# Detect objects in the captured frames
import json
import requests

object_detected_amazon = []
object_detected_google = []

for frame in captured_frames:
    # if frame.split('/')[-1] == "frame_at_9.28.jpg":
    headers = {"Authorization": "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiYjVkMTM0MmMtYTAzNC00ZmNhLTg4NmEtOGNhYzk2YmYxYjA2IiwidHlwZSI6ImFwaV90b2tlbiJ9.AI0apCBMmPPLg9Vo4BftZ7PCr3YCgyG1KIk2EDxTNV0"}

    url = "https://api.edenai.run/v2/image/object_detection"
    data = {
        "providers": "google,amazon",
        "fallback_providers": ""
    }
    files = {'file': open(frame, 'rb')}

    response = requests.post(url, data=data, files=files, headers=headers)
    result = json.loads(response.text)
    print("Items detected in " + frame.split('/')[-1] + ":")
    if 'amazon' in result and 'items' in result['amazon']:
        object_list = [item['label'] for item in result['amazon']['items']]
        object_detected_amazon.append(object_list)
        print("Amazon response:")
        print(object_list)
    else:
        object_detected_amazon.append([])
    if 'google' in result and 'items' in result['google']:
        object_list = [item['label'] for item in result['google']['items']]
        object_detected_google.append(object_list)
        print("Google response:")
        print(object_list)
    else:
        object_detected_google.append([])

Items detected in frame_at_0.199.jpg:
Amazon response:
['Drawer', 'Furniture', 'Cabinet', 'Business Card', 'Paper', 'Text', 'Dynamite', 'Weapon']
Google response:
[]
Items detected in frame_at_1.76.jpg:
Amazon response:
['Drawer', 'Furniture', 'Cabinet', 'Business Card', 'Paper', 'Text', 'Can', 'Tin', 'Box']
Google response:
[]
Items detected in frame_at_4.16.jpg:
Amazon response:
['Drawer', 'Furniture', 'Cabinet', 'Box', 'First Aid']
Google response:
[]
Items detected in frame_at_5.96.jpg:
Amazon response:
['Indoors', 'Interior Design', 'Boy', 'Child', 'Male', 'Person', 'Photo Frame', 'Window', 'Windowsill', 'Business Card', 'Paper', 'Text']
Google response:
[]
Items detected in frame_at_7.56.jpg:
Amazon response:
['Advertisement', 'Poster', 'Business Card', 'Paper', 'Text']
Google response:
[]
Items detected in frame_at_9.28.jpg:
Items detected in frame_at_10.8.jpg:
Items detected in frame_at_12.2.jpg:
Items detected in frame_at_14.759.jpg:
Items detected in frame_at_16.16.jpg:
Items

In [22]:
# Print object detected by Amazon API
object_detected_amazon

[['Drawer',
  'Furniture',
  'Cabinet',
  'Business Card',
  'Paper',
  'Text',
  'Dynamite',
  'Weapon'],
 ['Drawer',
  'Furniture',
  'Cabinet',
  'Business Card',
  'Paper',
  'Text',
  'Can',
  'Tin',
  'Box'],
 ['Drawer', 'Furniture', 'Cabinet', 'Box', 'First Aid'],
 ['Indoors',
  'Interior Design',
  'Boy',
  'Child',
  'Male',
  'Person',
  'Photo Frame',
  'Window',
  'Windowsill',
  'Business Card',
  'Paper',
  'Text'],
 ['Advertisement', 'Poster', 'Business Card', 'Paper', 'Text'],
 [],
 [],
 [],
 [],
 [],
 []]

In [53]:
# Print object detected by Google API
object_detected_google

[['Glasses', 'Clothing', 'Person'],
 [],
 [],
 ['Person', 'Clothing', 'Shoe', 'Footwear', 'Person'],
 ['Shoe', 'Boot', 'Footwear'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [131]:
frame = "/Users/runzhou/Desktop/rag/data/video_frame_starttime/frame_at_7.0.jpg"

# if frame.split('/')[-1] == "frame_at_9.28.jpg":
headers = {"Authorization": "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiYjVkMTM0MmMtYTAzNC00ZmNhLTg4NmEtOGNhYzk2YmYxYjA2IiwidHlwZSI6ImFwaV90b2tlbiJ9.AI0apCBMmPPLg9Vo4BftZ7PCr3YCgyG1KIk2EDxTNV0"}

url = "https://api.edenai.run/v2/image/object_detection"
data = {
    "providers": "google,amazon",
    "fallback_providers": ""
}
files = {'file': open(frame, 'rb')}

response = requests.post(url, data=data, files=files, headers=headers)
result = json.loads(response.text)
print("Items detected in " + frame.split('/')[-1] + ":")
if 'amazon' in result and 'items' in result['amazon']:
    object_list = [item['label'] for item in result['amazon']['items']]
    print("Amazon response:")
    print(object_list)
if 'google' in result and 'items' in result['google']:
    object_list = [item['label'] for item in result['google']['items']]
    print("Google response:")
    print(object_list)

Items detected in frame_at_7.0.jpg:
Amazon response:
['Floor', 'Flooring', 'Indoors', 'Interior Design', 'Person', 'Face', 'Head', 'Home Renovation Work']
Google response:
['Person', 'Clothing', 'Shoe', 'Footwear', 'Person']


#### ************* Summary *************
##### 1. Process videos to create a pandas dataframe with the following info:
#####           video ID, segment start time, segment-level transcript, list of objects detected
##### 2. Create embeddings for segment-level transcripts and the list of objects detected
##### 3. Ingest the embeddings into ChromaDB
#### ************************************

In [107]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [64]:
print("We are about to process {x} videos".format(x=len(video_info)))

We are about to process 99 videos


In [285]:
import cv2
import pandas as pd
import os
import json
import requests
import spacy
import re
import shutil
from pytube import YouTube

# Load the language model
nlp = spacy.load("en_core_web_sm")


def clear_directory(directory):
    # Check if the directory exists
    if os.path.exists(directory):
        # Remove all files in the directory
        shutil.rmtree(directory)
        print(f"Cleared directory: {directory}")
    # Recreate the directory
    os.makedirs(directory)

def download_video(video_id, download_path):
    video_url = f'https://www.youtube.com/watch?v=' + video_id
    yt = YouTube(video_url)
    stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    if stream:
        file_path = stream.download(download_path)
        print(f'Downloaded: {yt.title}')
        print(f'File path: {file_path}')
        return file_path
    else:
        print(f'Could not download: {yt.title}')
        return None

def get_captured_frames(file_path, start_times, directory):
    cam = cv2.VideoCapture(file_path)
    captured_frames = []
    for target_time in start_times:
        cam.set(cv2.CAP_PROP_POS_MSEC, target_time * 1000)
        ret, frame = cam.read()
        if ret:
            name = os.path.join(directory, f'frame_at_{target_time}.jpg')
            captured_frames.append(name)
            cv2.imwrite(name, frame)
        else:
            print(f"Error: Failed to read frame at {target_time} seconds.")
    cam.release()
    cv2.destroyAllWindows()
    return captured_frames

def detect_objects(captured_frames, provider, token):
    object_detected = []
    headers = {"Authorization": "Bearer {x}".format(x=token)}
    url = "https://api.edenai.run/v2/image/object_detection"
    data = {
        "providers": provider,
        "fallback_providers": ""
    }
    for frame in captured_frames:
        files = {'file': open(frame, 'rb')}
        response = requests.post(url, data=data, files=files, headers=headers)
        result = json.loads(response.text)
        if provider in result and 'items' in result[provider]:
            object_list = [item['label'] for item in result[provider]['items']]
            object_detected.append(object_list)
        else:
            object_detected.append([])
    return object_detected

# Function to check if a sentence contains a keyword
def contains_keyword(sent, keywords):
    for token in sent:
        if token.lemma_.lower() in keywords:
            return True
    return False

# Function to check if a sentence is imperative
def is_imperative(sent):
    first_token = sent[0]
    if first_token.pos_ == "VERB" and first_token.lemma_ == first_token.text and first_token.text.lower() not in ["is", "are", "was", "were"]:
        return True
    return False

# Function to check the segmented transcript and return relevant sentences with start times
def check_segmented_transcript(segmented_transcript, nlp_model, top_n):
    # List of keywords that indicate steps or imperatives
    step_keywords = ["step", "first", "next", "then", "now", "lastly", "finally", "secondly", "thirdly"]

    relevant_segments = []
    for segment in segmented_transcript:
        doc = nlp_model(segment['transcript_text'])
        for sent in doc.sents:
            if contains_keyword(sent, step_keywords) or is_imperative(sent):
                relevant_segments.append({'start_time': segment['start_time'], 'transcript_text': sent.text})
    # Sort the relevant segments based on your criteria for relevance
    # For simplicity, this example just takes the first N relevant segments
    # You can modify this to sort based on a more sophisticated relevance metric
    relevant_segments = relevant_segments[:top_n]
    return relevant_segments

# Function to combine transcripts and keep track of start times
def combine_transcripts_with_timing(transcript_data):
    combined_transcript = ""
    segment_start_times = []
    current_start_time = 0.000

    for segment in transcript_data:
        # Append the transcript text to the combined transcript
        combined_transcript += segment['transcript_text'] + " "

        # Record the start time for this segment
        segment_start_times.append(current_start_time)

        # Update the current start time for the next segment
        current_start_time += segment['start_time']

    # Remove the trailing space from the combined transcript
    combined_transcript = combined_transcript.strip()

    return combined_transcript, segment_start_times

def segment_transcript_with_timing(combined_transcript, segment_start_times, nlp_model):
    segmented_transcript = []
    doc = nlp_model(combined_transcript)

    # Initialize variables to keep track of the current segment and word count
    current_segment_index = 0
    current_word_count = 0

    for sent in doc.sents:
        sentence_word_count = len(sent.text.split())

        # Update the segment index if the current sentence goes beyond the current segment
        while (current_segment_index < len(segment_start_times) - 1 and
               current_word_count + sentence_word_count > len(combined_transcript[:combined_transcript.find(segmented_transcript[current_segment_index + 1]['transcript_text']) if current_segment_index + 1 < len(segmented_transcript) else len(combined_transcript)].split())):
            current_segment_index += 1

        # Add the sentence and its start time to the segmented transcript
        segmented_transcript.append({'start_time': segment_start_times[current_segment_index], 'transcript_text': sent.text})

        # Update the word count for the next sentence
        current_word_count += sentence_word_count

    return segmented_transcript


def process_videos(video_info, object_detection_api_token):
    # Set path for video download
    download_path = '/Users/runzhou/Desktop/rag/data'
    directory = '/Users/runzhou/Desktop/rag/data/video_frame_starttime'
    
    # Initialize an empty DataFrame
    df = pd.DataFrame()

    for video_id, info in video_info.items():
        # Clear the directory for captured frames
        clear_directory(directory)

        file_path = download_video(video_id, download_path)
        print("Processing file: {x}".format(x=file_path))
        if file_path:
            # Combine the transcript segments and keep track of start times
            combined_transcript, segment_start_times = combine_transcripts_with_timing(info['video_transcript'])

            # Segment the combined transcript and assign start times
            segmented_transcript = segment_transcript_with_timing(combined_transcript, segment_start_times, nlp)

            # Preprocess the transcript to get the most relevant sentences
            relevant_segments = check_segmented_transcript(segmented_transcript, nlp, top_n=5)
            #print(relevant_segments)

            # Get captured frames for the start times of the relevant segments
            start_times = [seg['start_time'] for seg in relevant_segments]
            captured_frames = get_captured_frames(file_path, start_times, directory)
            print("{x} captured frames.".format(x=len(captured_frames)))

            # Detect objects in the captured frames using Amazon Rekognition
            object_detected_amazon = detect_objects(captured_frames, 'amazon', object_detection_api_token)
            print(f"Object detection completed for {len(object_detected_amazon)} frames.")


            # Append processed rows to the DataFrame
            for i, segment in enumerate(relevant_segments):
                row = {
                    'video_id': video_id,
                    'segment_start_time': segment['start_time'],
                    'segmented_transcript': segment['transcript_text'],
                    'captured_frame_path': captured_frames[i] if i < len(captured_frames) else None,
                    'objects_detected_amazon': object_detected_amazon[i] if i < len(object_detected_amazon) else None
                }
                df = df.append(row, ignore_ind=True)
        
        return df


#video_info_sample = video_info.get('nHMFNj3i5j0')

object_detection_api_token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiYjVkMTM0MmMtYTAzNC00ZmNhLTg4NmEtOGNhYzk2YmYxYjA2IiwidHlwZSI6ImFwaV90b2tlbiJ9.AI0apCBMmPPLg9Vo4BftZ7PCr3YCgyG1KIk2EDxTNV0'
df = process_videos(video_info, object_detection_api_token)
print(df)


Cleared directory: /Users/runzhou/Desktop/rag/data/video_frame_starttime
Downloaded: How To Paint Cabinets | A Step-by-Step Guide
File path: /Users/runzhou/Desktop/rag/data/How To Paint Cabinets  A Step-by-Step Guide.mp4
Processing file: /Users/runzhou/Desktop/rag/data/How To Paint Cabinets  A Step-by-Step Guide.mp4
5 captured frames.
Object detection completed for 5 frames.
Cleared directory: /Users/runzhou/Desktop/rag/data/video_frame_starttime
Downloaded: An oh-so-satisfying steam clean 😌
File path: /Users/runzhou/Desktop/rag/data/An oh-so-satisfying steam clean 😌.mp4
Processing file: /Users/runzhou/Desktop/rag/data/An oh-so-satisfying steam clean 😌.mp4
1 captured frames.
Object detection completed for 1 frames.
Cleared directory: /Users/runzhou/Desktop/rag/data/video_frame_starttime
Downloaded: How To Cut Tile #diy #kitchenremodel #howtotile
File path: /Users/runzhou/Desktop/rag/data/How To Cut Tile diy kitchenremodel howtotile.mp4
Processing file: /Users/runzhou/Desktop/rag/data/H

In [286]:
len(df)

0

## TODO: Get more information for the YouTube videos, using video_id or url and YouTube API maybe

### Thumbnail

In [48]:
import requests

youtube_video_id = 'bsOH6-bWMuU'

for i in range(4):
    url = f"https://img.youtube.com/vi/{youtube_video_id}/{i}.jpg"
    response = requests.get(url)
    if response.status_code == 200:
        with open(f"image_{i}.jpg", "wb") as f:
            f.write(response.content)
        print(f"Image {i} downloaded successfully.")
    else:
        print(f"Failed to download image {i}.")

Image 0 downloaded successfully.
Image 1 downloaded successfully.
Image 2 downloaded successfully.
Image 3 downloaded successfully.


In [49]:
video_response = youtube.videos().list(
    part='snippet',
    id=youtube_video_id
).execute()

thumbnails = video_response['items'][0]['snippet']['thumbnails']

for thumbnail_type, thumbnail_info in thumbnails.items():
    print(f"Thumbnail type: {thumbnail_type}")
    print(f"Thumbnail URL: {thumbnail_info['url']}")

Thumbnail type: default
Thumbnail URL: https://i.ytimg.com/vi/bsOH6-bWMuU/default.jpg
Thumbnail type: medium
Thumbnail URL: https://i.ytimg.com/vi/bsOH6-bWMuU/mqdefault.jpg
Thumbnail type: high
Thumbnail URL: https://i.ytimg.com/vi/bsOH6-bWMuU/hqdefault.jpg
Thumbnail type: standard
Thumbnail URL: https://i.ytimg.com/vi/bsOH6-bWMuU/sddefault.jpg
Thumbnail type: maxres
Thumbnail URL: https://i.ytimg.com/vi/bsOH6-bWMuU/maxresdefault.jpg


In [50]:
for i in ['default', 'mqdefault', 'hqdefault', 'sddefault', 'maxresdefault']:
    url = f"https://img.youtube.com/vi/{youtube_video_id}/{i}.jpg"
    response = requests.get(url)
    if response.status_code == 200:
        with open(f"image_{i}.jpg", "wb") as f:
            f.write(response.content)
        print(f"Image {i} downloaded successfully.")
    else:
        print(f"Failed to download image {i}.")

Image default downloaded successfully.
Image mqdefault downloaded successfully.
Image hqdefault downloaded successfully.
Image sddefault downloaded successfully.
Image maxresdefault downloaded successfully.


In [None]:
# Download the videos using pytube (FULL)
download_path = '/Users/richard/Desktop/RAG Hackthon/data'
for video_id in video_ids:
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    yt = YouTube(video_url)
    stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    if stream:
        stream.download(download_path)
        print(f'Downloaded: {yt.title}')
    else:
        print(f'Could not download: {yt.title}')

## The embedded vectors

In [117]:
collection.get().get('embeddings')

In [118]:
print(collection.peek().get('embeddings'))

[[0.0, 75.0, 148.0, 37.0, 0.0, 0.0, 74.0, 28.0, 194.0, 118.0, 230.0, 85.0, 58.0, 164.0, 155.0, 195.0, 70.0, 99.0, 63.0, 247.0, 106.0, 225.0, 163.0, 130.0, 110.0, 7.0, 239.0, 131.0, 51.0, 140.0, 128.0, 194.0, 78.0, 72.0, 115.0, 115.0, 112.0, 214.0, 123.0, 81.0, 109.0, 146.0, 102.0, 187.0, 155.0, 125.0, 159.0, 130.0, 164.0, 84.0, 165.0, 112.0, 106.0, 96.0, 93.0, 108.0, 121.0, 120.0, 88.0, 117.0, 99.0, 112.0, 158.0, 114.0, 118.0, 149.0, 178.0, 157.0, 119.0, 94.0, 149.0, 140.0, 137.0, 112.0, 98.0, 81.0, 159.0, 141.0, 120.0, 112.0, 131.0, 120.0, 175.0, 70.0, 112.0, 112.0, 145.0, 159.0, 79.0, 144.0, 205.0, 93.0, 168.0, 137.0, 150.0, 134.0, 142.0, 124.0, 102.0, 166.0, 128.0, 73.0, 128.0, 176.0, 147.0, 128.0, 108.0, 76.0, 116.0, 186.0, 136.0, 120.0, 125.0, 160.0, 154.0, 134.0, 89.0, 78.0, 110.0, 149.0, 146.0, 70.0, 159.0, 102.0, 127.0, 138.0, 154.0, 149.0, 106.0, 169.0, 155.0, 112.0, 200.0, 125.0, 95.0, 110.0, 136.0, 188.0, 137.0, 192.0, 151.0, 136.0, 120.0, 126.0, 141.0, 140.0, 160.0, 111.0, 

In [69]:
df.head(10)

Unnamed: 0,labels,text_labels,url,WikiDescription,text_segment_labels,segment_labels,segment_start_times,segment_end_times,mean_rgb,mean_audio
0,"[2, 76, 281, 446]","[Vehicle, Boat, Motorboat, Yacht]",https://www.youtube.com/watch?v=KYVAv4gq00k,[A vehicle is a mobile machine that transports...,"[Luxury yacht, Luxury yacht, Luxury yacht, Lux...","[1314, 1314, 1314, 1314, 1314]","[50, 165, 230, 115, 210]","[55, 170, 235, 120, 215]","[0, 75, 148, 37, 0, 0, 74, 28, 194, 118, 230, ...","[175, 43, 116, 109, 154, 93, 75, 198, 172, 171..."
1,[191],[Hotel],https://www.youtube.com/watch?v=Ph-dQZnHChc,[A hotel is an establishment that provides pai...,"[Hotel, Hotel, Hotel]","[191, 191, 191]","[200, 255, 230]","[205, 260, 235]","[89, 195, 86, 213, 96, 100, 151, 183, 103, 151...","[163, 55, 116, 98, 91, 183, 151, 157, 177, 117..."
2,"[11, 1366, 1684, 1872]","[Food, Grocery store, Supermarket, Retail]",https://www.youtube.com/watch?v=jGve-X-fJug,[Food is any substance consumed to provide nut...,"[Grocery store, Grocery store, Grocery store, ...","[1366, 1366, 1366, 1366, 1366]","[85, 220, 230, 125, 45]","[90, 225, 235, 130, 50]","[0, 72, 173, 30, 0, 0, 40, 33, 199, 132, 199, ...","[175, 47, 132, 85, 169, 168, 156, 167, 157, 13..."
3,"[2, 7, 116, 235]","[Vehicle, Car, Sport utility vehicle, nan]",https://www.youtube.com/watch?v=GnbzWaaIGRY,[A vehicle is a mobile machine that transports...,"[Sport utility vehicle, Sport utility vehicle,...","[116, 116, 116, 235, 116, 235, 235, 116, 235, ...","[265, 220, 65, 180, 105, 230, 105, 85, 135, 165]","[270, 225, 70, 185, 110, 235, 110, 90, 140, 170]","[95, 203, 71, 171, 124, 165, 110, 172, 123, 17...","[157, 37, 131, 92, 127, 93, 194, 217, 140, 103..."
4,"[461, 1325]","[Printer (computing), Ink]",https://www.youtube.com/watch?v=200,"[In computing, a printer is a peripheral which...","[Ink, Ink, Ink, Ink, Ink]","[1325, 1325, 1325, 1325, 1325]","[50, 45, 85, 35, 110]","[55, 50, 90, 40, 115]","[86, 33, 92, 131, 206, 63, 92, 178, 43, 139, 5...","[133, 86, 84, 137, 155, 55, 95, 83, 153, 123, ..."
5,[607],[Board game],https://www.youtube.com/watch?v=200,[A board game is a tabletop game that involves...,"[Board game, Board game, Board game, Board gam...","[607, 607, 607, 607, 607]","[95, 215, 140, 60, 145]","[100, 220, 145, 65, 150]","[116, 161, 174, 145, 99, 88, 64, 117, 147, 104...","[105, 70, 109, 126, 116, 29, 122, 114, 181, 18..."
6,"[0, 1, 234, 1303]","[Game, Video game, Super Smash Bros., Yoshi]",https://www.youtube.com/watch?v=DTQpR-AqKnM,"[A game is structured form of play, usually un...","[Yoshi, Yoshi, Yoshi, Yoshi, Yoshi]","[1303, 1303, 1303, 1303, 1303]","[185, 45, 190, 105, 195]","[190, 50, 195, 110, 200]","[181, 16, 82, 162, 190, 255, 88, 28, 202, 213,...","[156, 218, 194, 148, 121, 42, 8, 73, 68, 35, 9..."
7,"[17, 19]","[Motorsport, Racing]",https://www.youtube.com/watch?v=tTfptynuWdA,[Motorsport or motorsports is a global term us...,"[Running, Running, Running, Running, Running]","[210, 210, 210, 210, 210]","[40, 65, 80, 30, 55]","[45, 70, 85, 35, 60]","[120, 193, 70, 224, 104, 170, 113, 125, 118, 1...","[65, 70, 167, 165, 166, 107, 173, 41, 113, 238..."


In [96]:
from nnsplit import NNSplit

ModuleNotFoundError: No module named 'nnsplit'

In [97]:
transcript_data = video_info['bsOH6-bWMuU']

# get all transcript_text
transcript_texts = [item['transcript_text'] for item in transcript_data['video_transcript']]

# merge to one string
full_transcript = ' '.join(transcript_texts)
print(full_transcript)
#
# splitter = NNSplit.load("en")
#
# # returns `Split` objects
# splits = splitter.split(["This is a test This is another test"])[0]
#
# for idx, sentence in enumerate(splits):
#     print(f"Sentence {idx + 1}: {sentence}")

do you ever wonder how dirty your couch actually is I was shocked when I found out because all of that is so nasty thankfully I used this steam cleaner to get all of the areas that I realized were much dirtier than I expected I picked up the Bissell Little Green Pro heat that was super manageable and easy to use it even came with its own spot and stain remover solution to mix with hot water and spray on your stained area once I started cleaning I instantly saw results and was surprised by how quickly it worked I really like that the bristle brush really got the solution into those deeper Spains it was so s satisfying to see the steam cleaner work its magic and now I have a much cleaner couch because of it bye-bye dirty couch


labels                                                             [191]
text_labels                                                      [Hotel]
url                          https://www.youtube.com/watch?v=Ph-dQZnHChc
WikiDescription        [A hotel is an establishment that provides pai...
text_segment_labels                                [Hotel, Hotel, Hotel]
segment_labels                                           [191, 191, 191]
segment_start_times                                      [200, 255, 230]
segment_end_times                                        [205, 260, 235]
mean_rgb               [89, 195, 86, 213, 96, 100, 151, 183, 103, 151...
mean_audio             [163, 55, 116, 98, 91, 183, 151, 157, 177, 117...
Name: 1, dtype: object