### Check Video ID in Elasticsearch Index

In [25]:
import json
from tqdm import tqdm
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

# YouTube video id used by the example cells that follow.
VIDEO_ID = 'zjkBMFhNj_g'
# Elasticsearch index that stores the transcript documents and their embeddings.
INDEX_NAME = "video-transcripts-vect"
# sentence-transformers model used by index_doc to embed every text field.
VECTOR_MODEL = 'multi-qa-MiniLM-L6-cos-v1'
# Dimension declared for each dense_vector field in INDEX_SETTINGS; it has to
# equal the length of the vectors produced by VECTOR_MODEL.
VECTOR_DIMS = 384

# Client for the Elasticsearch instance listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Loaded once here and reused for every model.encode() call in index_doc.
model = SentenceTransformer(VECTOR_MODEL)

# Index definition: one shard, no replicas, a keyword `uid`, four analysed text
# fields and five embedding fields. Every embedding field uses the same
# dense_vector definition, so the entries are generated from the field names
# rather than written out one by one.
_TEXT_FIELDS = ("text", "smry_text", "clean_text", "keywords")
_VECTOR_FIELDS = (
    "text_vector",
    "smry_vector",
    "cleantext_vector",
    "kwords_vector",
    "kwords_smry_vector",
)

INDEX_SETTINGS = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Exact-match identifier; queried with a wildcard on its prefix.
            "uid": {"type": "keyword"},
            **{field: {"type": "text"} for field in _TEXT_FIELDS},
            # A fresh dict per field so no two mappings share one object.
            **{
                field: {
                    "type": "dense_vector",
                    "dims": VECTOR_DIMS,
                    "index": True,
                    "similarity": "cosine",
                }
                for field in _VECTOR_FIELDS
            },
        }
    },
}


def load_jsonfile(filename):
    '''
    Read a JSON file and return its parsed content.

    filename : filepath and filename(.json file) as a single string

    Returns the object produced by json.load (a list or dict for typical
    files). The file is always decoded as UTF-8, the encoding JSON is
    exchanged in, instead of the platform default, so transcripts containing
    non-ASCII characters load the same way on every operating system.

    Raises whatever open() / json.load() raise for a missing or malformed file.
    '''
    with open(filename, 'rt', encoding='utf-8') as f:
        return json.load(f)


def index_doc(filename, index_name):
    """
    Embed every document of a transcript JSON file and index it.

    Args:
        filename (str): Path of the JSON file; it is read with load_jsonfile
            and is expected to hold an iterable of dicts with the keys
            'text', 'smry_text', 'clean_text' and 'keywords'.
        index_name (str): Elasticsearch index the documents are written to.

    Returns:
        None

    The work is done in two passes, each with its own progress bar: first all
    embeddings are computed and stored on the documents (in place), then the
    documents are sent to Elasticsearch one request at a time.
    """
    records = load_jsonfile(filename)

    # Pass 1: compute the five embeddings for each record.
    for record in tqdm(records):
        raw, summary, cleaned, keywords = (
            record[key] for key in ('text', 'smry_text', 'clean_text', 'keywords')
        )
        # Target vector field -> text it is derived from (order is preserved).
        sources = {
            'text_vector': raw,
            'smry_vector': summary,
            'cleantext_vector': cleaned,
            'kwords_vector': keywords,
            'kwords_smry_vector': keywords + ' ' + summary,
        }
        for vector_field, source_text in sources.items():
            record[vector_field] = model.encode(source_text)

    # Pass 2: write the enriched records to the index.
    for record in tqdm(records):
        es_client.index(index=index_name, document=record)

In [26]:
def is_video_id_indexed(video_id, index_name):
    """
    Checks if the video_id is already indexed in the Elasticsearch index.

    A video counts as indexed when at least one document has a 'uid' that
    starts with "<video_id>__".

    Args:
        video_id (str): The video ID to check in Elasticsearch.
        index_name (str): The Elasticsearch index to search in (required;
            there is no default).

    Returns:
        bool: True if the video_id is indexed, False otherwise.
    """
    search_query = {
        # Only the hit count is needed, so ask for no documents back; the
        # stored transcript text and vectors would otherwise be returned
        # just to be thrown away.
        "size": 0,
        "query": {
            "wildcard": {
                "uid": {
                    "value": f"{video_id}__*"  # Using wildcard to match 'video_id' part in 'uid'
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return response['hits']['total']['value'] > 0


def process_and_index_video(video_id, index_name):
    """
    Run the transcript pipeline for one video and index the result.

    The processing step is still a placeholder, so the summary JSON for the
    video must already exist at
    ../data/summary_transcripts/tscribe_vid_<video_id>.json.

    Args:
        video_id (str): The video ID to process.
        index_name (str): Elasticsearch index the documents are written to.

    Returns:
        None
    """
    print(f"INFO: processing tarnscript data for video_id: {video_id}")
    # Placeholder: Implement the logic to extract, process, and save the video transcript as json document

    print(f"INFO: indexing data for video_id: {video_id}")
    transcript_json = "../data/summary_transcripts/tscribe_vid_" + video_id + ".json"
    index_doc(transcript_json, index_name)


def check_and_index_video(video_id, index_name):
    """
    Make sure the data of a video is present in an Elasticsearch index.

    Three cases are handled:
      * the index is missing  -> create it from INDEX_SETTINGS, then process
        and index the video;
      * the index exists and already holds the video -> only report it;
      * the index exists without the video -> process and index the video.

    Args:
        video_id (str): The video ID to check and index if not found.
        index_name (str): The Elasticsearch index to check in.

    Returns:
        None
    """
    index_exists = es_client.indices.exists(index=index_name)

    if not index_exists:
        print("INFO: index does not exist.")
        es_client.indices.create(index=index_name, body=INDEX_SETTINGS)
        print(f"INFO: index '{index_name}' created.")
        print(f"INFO: processing and indexing data for video_id {video_id}...")
        process_and_index_video(video_id, index_name)
        return

    if is_video_id_indexed(video_id, index_name):
        print(f"INFO: data for video_id {video_id} is already indexed.")
        return

    print(f"INFO: data for video_id {video_id} not found. processing and indexing...")
    process_and_index_video(video_id, index_name)

In [28]:
# es_client.indices.delete(index=INDEX_NAME)
# print(f"INFO: index '{INDEX_NAME}' deleted.")

In [29]:
# Example usage with no index: the index is created first, then the video's
# transcript documents are embedded and indexed.
check_and_index_video(VIDEO_ID, INDEX_NAME)

INFO: index does not exist.
INFO: index 'video-transcripts-vect' created.
INFO: processing and indexing data for video_id zjkBMFhNj_g...
INFO: processing tarnscript data for video_id: zjkBMFhNj_g
INFO: indexing data for video_id: zjkBMFhNj_g


100%|██████████| 56/56 [00:18<00:00,  2.97it/s]
100%|██████████| 56/56 [00:08<00:00,  6.24it/s]


In [30]:
# Example usage with existing video_id in index: nothing is re-indexed, only
# an informational message is printed.
check_and_index_video(VIDEO_ID, INDEX_NAME)

INFO: data for video_id zjkBMFhNj_g is already indexed.


In [31]:
def search_video_id(video_id, index_name):
    """
    Return the raw Elasticsearch response for the documents of one video.

    Runs a wildcard query on 'uid' for values starting with "<video_id>__".
    No size is passed, so Elasticsearch's default page size applies to the
    returned hits; the total is available under response['hits']['total'].

    Args:
        video_id (str): The video ID whose documents are searched.
        index_name (str): The Elasticsearch index to search in.

    Returns:
        The response object of es_client.search, unmodified.
    """
    uid_pattern = f"{video_id}__*"  # Using wildcard to match 'video_id' part in 'uid'
    wildcard_clause = {"uid": {"value": uid_pattern}}

    return es_client.search(
        index=index_name,
        body={"query": {"wildcard": wildcard_clause}},
    )

In [32]:
# Keep the raw search response so the next cell can inspect it.
tmpp = search_video_id(VIDEO_ID, INDEX_NAME)

In [33]:
# Show the 'hits' section of the response: total count, max score and the
# matching documents with their _source.
tmpp['hits']

{'total': {'value': 56, 'relation': 'eq'},
 'max_score': 1.0,
 'hits': [{'_index': 'video-transcripts-vect',
   '_id': 'wXWAaJIB2wc2yqBi-Y1m',
   '_score': 1.0,
   '_source': {'uid': 'zjkBMFhNj_g__B1__S0.16',
    'text': " hi everyone so recently I gave a 30-minute talk on large language models just kind of like an intro talk um unfortunately that talk was not recorded but a lot of people came to me after the talk and they told me that uh they really liked the talk so I would just I thought I would just re-record it and basically put it up on YouTube so here we go the busy person's intro to large language models director Scott okay so let's begin first of all what is a large language model really well a large language model is just two files right um there will be two files in this hypothetical directory so for example working with a specific example of the Llama 270b model this is a large language model released by meta Ai and this is basically the Llama series of language models the 

### Test run "get_transcript.py" script (with above functionalities; before adding "ingest_data.py")

In [8]:
from elasticsearch import Elasticsearch
# Reset state before the script test: drop the index so that
# get_transcript.py has to recreate it.
# NOTE(review): presumably this raises if the index is already gone, which
# would break a re-run of this cell - confirm against the client version in use.
es_client = Elasticsearch('http://localhost:9200')
es_client.indices.delete(index="video-transcripts-vect")
print(f"INFO: index deleted.")

INFO: index deleted.


In [9]:
!python ../scripts/get_transcript.py --video_id zjkBMFhNj_g --index_name video-transcripts-vect --filepath ../data/summary_transcripts

DEBUG: multi-qa-MiniLM-L6-cos-v1, 384
INFO: index does not exist.
INFO: index 'video-transcripts-vect' created.
INFO: processing and indexing data for video_id zjkBMFhNj_g...
INFO: processing tarnscript data for video_id: zjkBMFhNj_g
INFO: indexing data for video_id: zjkBMFhNj_g
INFO: creating vector embeddings
INFO: adding documents to index
INFO: added 56 documents to index




### Test run "get_transcript.py" script (after adding "ingest_data.py")

In [7]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
# Rebinds VIDEO_ID to a second video for the ingest test.
VIDEO_ID = 't6Wc7OMks4U'
# Language codes passed to the API as the accepted transcript languages.
LANG = ['en', 'en-US', ]
srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID, languages=LANG)
# One row per transcript entry, with text / start / duration columns.
df_srt = pd.DataFrame(srt)
df_srt.head(3)

Unnamed: 0,text,start,duration
0,Every piece of art tells a story.,0.0,2.48
1,"Usually, more than one.",2.48,1.56
2,I’ll show you.,4.04,2.24


In [5]:
# List every transcript available for the video: language name, language
# code, whether it is auto-generated and whether it can be translated.
# NOTE(review): this cell's execution count (5) is lower than that of the cell
# defining VIDEO_ID above (7) - re-run top to bottom to confirm it still works.
transcript_list = YouTubeTranscriptApi.list_transcripts(VIDEO_ID)
for transcript in transcript_list:
    print(transcript.language,transcript.language_code, transcript.is_generated, transcript.is_translatable)

English (United States) en-US False True


In [8]:
!python ../scripts/get_transcript.py --video_id t6Wc7OMks4U --index_name video-transcripts-vect --filepath ../data/summary_transcripts



INFO: data for video_id t6Wc7OMks4U not found. processing and indexing...
INFO: processing tarnscript data for video_id: t6Wc7OMks4U
INFO: extracting raw video transcript t6Wc7OMks4U
INFO: raw transcript extracted with 1864 words
INFO: chunking transcript
INFO: initiated block creation of video transcript
INFO: video length 12 | block size 5 | stride 1 | max blocks 116
INFO: generated block 5 | start 4 | stop 9 | rows combined 62
INFO: reached max blocks limit
INFO: generated block 8 | start 7 | stop 12 | rows combined 1
INFO: reached end of video
INFO: original data (163, 3) | block data (8, 3)
INFO: generating summary and uid
INFO: initiated summary generation
INFO: total text blocks 8
INFO: generating summaries
INFO: summary generation finished
INFO: temp file created with summary
INFO: generating clean text
INFO: initiated text cleaning
INFO: total text blocks 8
INFO: cleaning texts
INFO: text cleaning finished
INFO: temp file created with cleaned text
INFO: generating keywords
INF

### Multi video transcript

In [12]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
# Baseline for the multi-video experiment: fetch one transcript and look at
# the container type and number of entries returned.
VIDEO_ID = 't6Wc7OMks4U'
LANG = ['en', 'en-US', ]
srt = YouTubeTranscriptApi.get_transcript(VIDEO_ID, languages=LANG)
type(srt), len(srt)

(list, 163)

In [14]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
# VIDEO_ID is a list of ids here (it was a single id string in earlier cells).
VIDEO_ID = ['t6Wc7OMks4U', 'Cd2ch4XV84s']
LANG = ['en', 'en-US', ]
# get_transcripts (plural) accepts several ids in one call; the result is a
# 2-tuple rather than a list, so it cannot go straight into a DataFrame.
srt = YouTubeTranscriptApi.get_transcripts(VIDEO_ID, languages=LANG)
# df_srt = pd.DataFrame(srt)
# df_srt.head(3)
type(srt), len(srt)

(tuple, 2)

In [20]:
# srt[0] is indexed by video id; compare how many transcript entries were
# fetched for each of the two videos.
len(srt[0][VIDEO_ID[0]]), len(srt[0][VIDEO_ID[1]])

(163, 88)