In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import os
import networkx as nx
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
import tensorflow as tf
from IPython.display import YouTubeVideo
plt.style.use('ggplot')

ModuleNotFoundError: No module named 'networkx'

In [None]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/video/"
# Inspect data directory
print(os.listdir(DATA_PATH))

# Explore video-level data: every record in this shard is parsed below as a
# serialized tf.train.Example.
video_lvl_data_path = DATA_PATH + "train0274.tfrecord"
raw_dataset = tf.data.TFRecordDataset(video_lvl_data_path)

# Parallel lists: position i in each list belongs to the same record.
vid_ids = []
labels = []
rgb = []
audio = []

# Iterate the TFRecordDataset created above instead of the deprecated
# tf.compat.v1 tf_record_iterator; .numpy() returns the record's raw bytes.
for raw_record in raw_dataset:
    example = tf.train.Example.FromString(raw_record.numpy())
    features = example.features.feature
    vid_ids.append(features['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    # Copy the protobuf repeated fields into plain Python lists.
    labels.append(list(features['labels'].int64_list.value))
    rgb.append(list(features['mean_rgb'].float_list.value))
    audio.append(list(features['mean_audio'].float_list.value))

print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
# Index 1 only exists when the shard holds at least two records.
if len(vid_ids) > 1:
    print('Picking a youtube video id:', vid_ids[1])

In [None]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/frame/"
# Inspect frame-level data directory
print(os.listdir(DATA_PATH))

import tensorflow as tf

# Explore frame-level data: every record is parsed below as a serialized
# tf.train.SequenceExample (context = per-video fields, feature_lists =
# per-frame byte strings).
frame_lvl_data_path = DATA_PATH + "train0274.tfrecord"
raw_dataset = tf.data.TFRecordDataset(frame_lvl_data_path)

# Parallel lists: position i in each list belongs to the same record.
vid_ids = []
labels = []
rgb = []
audio = []

# Iterate the TFRecordDataset created above instead of the deprecated
# tf.compat.v1 tf_record_iterator; .numpy() returns the record's raw bytes.
for raw_record in raw_dataset:
    seq_example = tf.train.SequenceExample.FromString(raw_record.numpy())
    context = seq_example.context.feature
    frame_features = seq_example.feature_lists.feature_list

    vid_ids.append(context['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    # Copy the protobuf repeated field into a plain Python list.
    labels.append(list(context['labels'].int64_list.value))

    # Decode RGB features. Only the FIRST frame (feature[0]) is kept per video.
    # The bytes are decoded as uint8, matching how the segment-level cell of
    # this notebook decodes the same 'rgb'/'audio' features; decoding them as
    # float32 would fuse every four bytes into one meaningless float.
    rgb_encoded = frame_features['rgb'].feature[0].bytes_list.value[0]
    rgb_decoded = tf.io.decode_raw(rgb_encoded, tf.uint8)
    rgb.append(rgb_decoded.numpy().tolist())

    # Decode audio features (first frame only, same uint8 encoding).
    audio_encoded = frame_features['audio'].feature[0].bytes_list.value[0]
    audio_decoded = tf.io.decode_raw(audio_encoded, tf.uint8)
    audio.append(audio_decoded.numpy().tolist())

print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
# Index 1 only exists when the shard holds at least two records.
if len(vid_ids) > 1:
    print('Picking a youtube video id:', vid_ids[1])

In [None]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/yt8m/segment/"

# Inspect segment-level data directory
print(os.listdir(DATA_PATH))

# Explore segment-level data: every record is parsed below as a serialized
# tf.train.SequenceExample (context = per-video fields, feature_lists =
# per-frame byte strings).
segment_lvl_data_path = DATA_PATH + "validate3489.tfrecord"
raw_dataset = tf.data.TFRecordDataset(segment_lvl_data_path)

# Parallel lists: position i in each list belongs to the same record.
vid_ids = []
labels = []
rgb = []
audio = []
segment_start_times = []
segment_end_times = []
segment_scores = []
segment_labels = []


# Iterate the TFRecordDataset created above instead of the deprecated
# tf.compat.v1 tf_record_iterator; .numpy() returns the record's raw bytes.
for raw_record in raw_dataset:
    # Parse each record once. The context of a SequenceExample carries the
    # per-video fields, so a second parse as tf.train.Example is unnecessary.
    seq_example = tf.train.SequenceExample.FromString(raw_record.numpy())
    context = seq_example.context.feature
    frame_features = seq_example.feature_lists.feature_list

    # Context features
    vid_ids.append(context['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    labels.append(list(context['labels'].int64_list.value))

    # Segment labels & scores (copied into plain Python lists)
    segment_start_times.append(list(context['segment_start_times'].int64_list.value))
    segment_end_times.append(list(context['segment_end_times'].int64_list.value))
    segment_labels.append(list(context['segment_labels'].int64_list.value))
    # Previously declared but never filled.
    # NOTE(review): assumes 'segment_scores' is stored as a float list, as in
    # the YouTube-8M segment schema - confirm against the data.
    segment_scores.append(list(context['segment_scores'].float_list.value))

    # Decode RGB features. Only the FIRST frame (feature[0]) is kept per video.
    rgb_encoded = frame_features['rgb'].feature[0].bytes_list.value[0]
    rgb_decoded = tf.io.decode_raw(rgb_encoded, tf.uint8)
    rgb.append(rgb_decoded.numpy().tolist())

    # Decode audio features (first frame only)
    audio_encoded = frame_features['audio'].feature[0].bytes_list.value[0]
    audio_decoded = tf.io.decode_raw(audio_encoded, tf.uint8)
    audio.append(audio_decoded.numpy().tolist())


print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))
print('Number of mean RGB in this tfrecord: ', len(rgb))
print('Number of mean audio in this tfrecord: ', len(audio))
# Index 1 only exists when the shard holds at least two records.
if len(vid_ids) > 1:
    print('Picking a youtube video id:', vid_ids[1])


In [None]:
# Author's notes on the expected feature sizes (what the leading "8" refers to
# is not shown in this notebook - TODO confirm):
#rgb: 8 x 1024
#audio: 8 x 128
# Length of the first stored audio feature vector.
len(audio[0])

In [None]:
# Translate video ID to Youtube URL
import requests

def translate_video_id(encoded_id, timeout=10):
    """Resolve an anonymized dataset video ID to a standard YouTube video ID.

    The mapping is fetched from data.yt8m.org; a successful response body is
    expected to contain ``i("<encoded_id>","<youtube_id>");``.

    Args:
        encoded_id: Dataset video ID string. Its first two characters select
            the directory of the lookup URL.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The YouTube video ID as a string, or ``None`` when the lookup fails
        (network error, non-200 status, or a body without a usable ID).
        ``None`` is falsy, so callers can test the result with
        ``if video_id:``; the former truthy ``'200'`` sentinel could not be
        detected that way and ended up inside generated URLs.
    """
    # Construct the URL to retrieve the mapping
    url = f"http://data.yt8m.org/2/j/i/{encoded_id[:2]}/{encoded_id}.js"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed
        # lookup instead of aborting the caller's loop.
        print(f"Error retrieving video ID for {encoded_id}: {exc}")
        return None

    # Check if the response is successful and has the expected format
    if response.status_code == 200 and 'i("' in response.text:
        # Splitting on double quotes puts the YouTube ID at position 3;
        # guard the index so a truncated body cannot raise IndexError.
        parts = response.text.split('"')
        if len(parts) > 3 and parts[3]:
            return parts[3]

    print(f"Error retrieving video ID for {encoded_id}: {response.text}")
    return None

# Build a watch-page URL for every dataset ID whose lookup produced a value.
youtube_urls = []
for encoded_id in vid_ids:
    video_id = translate_video_id(encoded_id)
    if not video_id:
        continue  # nothing to link to for this ID
    youtube_url = f"https://www.youtube.com/watch?v={video_id}"
    youtube_urls.append(youtube_url)

# Print the collected YouTube URLs, one per line
for url in youtube_urls:
    print(url)


In [None]:
DATA_PATH = "/Users/runzhou/Desktop/youtube/data/"
# Read in vocabulary for human-rated mapping of numeric label to text label for the video
vocab = pd.read_csv(DATA_PATH + 'vocabulary.csv')

# Index the vocabulary once so each label is a dictionary lookup instead of a
# full scan of the table. drop_duplicates keeps the first row per Index, which
# is the row the previous `.values[0]` lookup selected.
vocab_by_index = vocab.drop_duplicates(subset='Index').set_index('Index')
label_names = vocab_by_index['Name'].to_dict()
label_wiki = vocab_by_index['WikiDescription'].to_dict()

# Collect one record per video and build the DataFrame in a single step;
# concatenating a one-row frame per iteration copies all earlier rows each time.
records = []
for i, encoded_id in enumerate(vid_ids):
    video_id = translate_video_id(encoded_id)
    # Assigned on every iteration so a failed lookup can never reuse the URL
    # of the previous video (or hit an undefined name on the first row).
    youtube_url = f"https://www.youtube.com/watch?v={video_id}" if video_id else None

    # A label missing from the vocabulary raises KeyError here.
    text_labels = [label_names[label] for label in labels[i]]
    wiki_descriptions = [label_wiki[label] for label in labels[i]]
    text_segment_labels = [label_names[label] for label in segment_labels[i]]
    records.append({'labels': labels[i],
                    'text_labels': text_labels,
                    'url': youtube_url,
                    'WikiDescription': wiki_descriptions,
                    'text_segment_labels': text_segment_labels,
                    'segment_labels': segment_labels[i],
                    'segment_start_times': segment_start_times[i],
                    'segment_end_times': segment_end_times[i],
                    'mean_rgb': rgb[i],
                    'mean_audio': audio[i]})

# Segment-rated frame-level contextual data and embeddings, one row per video
df = pd.DataFrame(records)

# Display the DataFrame
print(df)

In [None]:
# Preview the first five rows of the assembled DataFrame.
df.head(5)

#### Build a generic RAG system

In [None]:
#!sudo apt-get install build-essential libssl-dev python3-dev
#!python -m pip install --upgrade pip setuptools wheel
#!python -m pip install grpcio

#!python -m pip install chromadb

#!python --version

In [None]:
# Preview the first ten rows of df, the frame the ChromaDB insertion reads from.
df.head(10)

In [None]:
import chromadb

# Initialize the ChromaDB client
chroma_client = chromadb.Client()

# Create a collection to store the embeddings and their metadata.
collection = chroma_client.get_or_create_collection(name='rgb_collection')

# Gather one id / embedding / metadata entry per DataFrame row so everything
# is inserted with a single add() call instead of one call per row.
embedding_ids = []
embeddings = []
metadatas = []
for i, row in df.iterrows():
    wiki_descriptions = row['WikiDescription']

    # The row index doubles as the unique ID. Named embedding_ids rather than
    # `id` so the builtin id() is not shadowed for the rest of the notebook.
    embedding_ids.append(f'{i}')
    embeddings.append(row['mean_rgb'])

    # Only 'url' and the first wiki description are stored; the list-valued
    # columns (labels, segment labels, segment times) are left out.
    # NOTE(review): presumably because Chroma metadata values must be scalars
    # - confirm. Empty strings stand in for a missing URL or a video without
    # any label, so every entry keeps the same two keys.
    metadatas.append({
        'url': row['url'] or '',
        'wiki': wiki_descriptions[0] if len(wiki_descriptions) > 0 else '',
    })

# Store the embeddings in ChromaDB (skipped when df has no rows).
if embedding_ids:
    collection.add(
        ids=embedding_ids,
        embeddings=embeddings,
        metadatas=metadatas
    )

print('Embeddings stored in ChromaDB successfully.')


In [None]:
# Sanity check after insertion: display a sample of the items in the collection.
collection.peek()



In [None]:
# Nearest-neighbour search: use the vector at rgb[3] as the query embedding and
# request the 2 closest stored items. A metadata filter can be supplied through
# `where`; an example is left commented out inside the call.
collection.query(
    query_embeddings=rgb[3],
    n_results=2
    #where={"style": "style2"}
)

In [None]:
# Show the row at position 6 of df.
# NOTE(review): presumably one of the neighbours returned by the query - confirm.
df.iloc[6]

In [None]:
# Show the row at position 2 of df.
# NOTE(review): presumably one of the neighbours returned by the query - confirm.
df.iloc[2]

In [None]:
#!pip install youtube_transcript_api

In [None]:
# Display the list of YouTube watch URLs built earlier in the notebook.
youtube_urls

# Example URL noted by the author:
# https://www.youtube.com/watch?v=h06wB5JlyNg


In [None]:
#!pip install google-api-python-client
#!pip install pytube