In [24]:
import numpy as np
from youtube_transcript_api import YouTubeTranscriptApi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from gensim.models import Word2Vec


In [25]:

def extract_transcript_with_timestamps(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_with_timestamps = ''
        for segment in transcript_list:
            start_time = segment['start']
            end_time = segment['start'] + segment['duration']
            transcript_with_timestamps += f"[{format_time(start_time)} - {format_time(end_time)}] {segment['text']}\n"
        return transcript_with_timestamps, transcript_list
    except Exception as e:
        print("An error occurred:", e)
        return None, None


In [26]:

def format_time(seconds):
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02}:{:02}:{:02}'.format(int(hours), int(minutes), int(seconds))


In [27]:
# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

In [28]:

# # Load Word2Vec model
# word2vec_model = Word2Vec.load("GoogleNews-vectors-negative300.bin")

In [29]:
# Function to predict timestamps for a given subtopic
def predict_timestamps(transcript_list, subtopic):
    # Extract text from transcript segments
    segments_text = [segment['text'] for segment in transcript_list]

    # Perform named entity recognition (NER) on the subtopic
    doc = nlp(subtopic)
    ner_subtopic = [ent.text for ent in doc.ents]

    # Keyword extraction for the subtopic
    keywords = [token.text for token in doc if not token.is_stop]

    # Combine NER entities and keywords
    subtopic_tokens = set(ner_subtopic + keywords)

    # Vectorize the segments and the subtopic tokens
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(segments_text)
    subtopic_vec = vectorizer.transform([" ".join(subtopic_tokens)])

    # Compute cosine similarity between subtopic and each segment
    similarities = cosine_similarity(X, subtopic_vec)

    # Get index of the segment with highest similarity
    closest_segment_index = np.argmax(similarities)

    # Get the start time of the closest segment
    closest_segment = transcript_list[closest_segment_index]
    start_time = closest_segment['start']
    return start_time


In [30]:

# Replace 'VIDEO_ID' with the actual ID of the YouTube video
video_id = 'PYS3UZFPJWI'
transcript_with_timestamps, transcript_list = extract_transcript_with_timestamps(video_id)

print("Transcript with Timestamps:")
print(transcript_with_timestamps)


Transcript with Timestamps:
[00:00:05 - 00:00:11] hey guys it's kendall i am so excited to
[00:00:09 - 00:00:13] be back for my second beauty secrets
[00:00:11 - 00:00:15] video with vogue i did my last one
[00:00:13 - 00:00:17] almost four years ago and i was just a
[00:00:15 - 00:00:19] little baby and
[00:00:17 - 00:00:21] my routine has completely changed since
[00:00:19 - 00:00:24] then so i'm excited to show you guys my
[00:00:21 - 00:00:26] new routine i obviously look really
[00:00:24 - 00:00:29] really crazy i have this avocado mask on
[00:00:26 - 00:00:30] that i make myself in my kitchen
[00:00:29 - 00:00:32] and
[00:00:30 - 00:00:35] um it's super fun to make and super easy
[00:00:32 - 00:00:37] i put an avocado in a bowl and then mash
[00:00:35 - 00:00:38] it up with some oatmeal
[00:00:37 - 00:00:40] this
[00:00:38 - 00:00:42] superfood honey
[00:00:40 - 00:00:45] i got it from courtney so i'm assuming
[00:00:42 - 00:00:48] it's really good and natural and then
[00:00:45 

In [31]:
if transcript_with_timestamps and transcript_list:

    # Example subtopic
    subtopic = "lip Blush"
    start_time = predict_timestamps(transcript_list, subtopic)
    print(f"Start Time for {subtopic} :", format_time(start_time))
else:
    print("Transcript extraction failed.")


Start Time for lip Blush : 00:16:04


In [32]:
def generate_youtube_link(video_id, start_time_seconds):
    base_url = f"https://www.youtube.com/watch?v={video_id}"
    time_parameter = f"&t={start_time_seconds}s"
    return base_url + time_parameter

# Example usage:
video_id = 'PYS3UZFPJWI'
start_time_seconds = 964  # 00:16:04 in seconds
youtube_link = generate_youtube_link(video_id, start_time_seconds)
print("YouTube Link:", youtube_link)


YouTube Link: https://www.youtube.com/watch?v=PYS3UZFPJWI&t=964s


In [33]:
def generate_youtube_link(video_id, start_time_seconds):
    base_url = f"https://www.youtube.com/watch?v={video_id}"
    time_parameter = f"&t={start_time_seconds}s"
    return base_url + time_parameter


In [34]:
if transcript_with_timestamps and transcript_list:
    # Example subtopic
    subtopic = "lip Blush"
    start_time = predict_timestamps(transcript_list, subtopic)
    print(f"Start Time for {subtopic} :", format_time(start_time))

    # Generate YouTube link
    video_id = 'PYS3UZFPJWI'  # Replace with your video ID
    youtube_link = generate_youtube_link(video_id, start_time)
    print("YouTube Link:", youtube_link)
else:
    print("Transcript extraction failed.")


Start Time for lip Blush : 00:16:04
YouTube Link: https://www.youtube.com/watch?v=PYS3UZFPJWI&t=964.959s
