In [None]:
# Install required dependencies
!pip install sentence-transformers pinecone-client youtube-transcript-api spacy nltk

import os
import re
import time
from collections import Counter

import numpy as np
import spacy
import torch
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# Check for GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize Pinecone.
# SECURITY: the API key is read from the environment instead of being
# hard-coded in the source. Any key that was previously committed in this
# file must be treated as leaked and revoked in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before running this script."
    )
pc = Pinecone(api_key=pinecone_api_key)

# Create/connect to index
index_name = "video-embeddings"

# Check if index exists, create if not
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # all-MiniLM-L6-v2 outputs 384-dimensional embeddings
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Poll once per second until the freshly created index reports ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pc.Index(index_name)

# Load model on GPU if available
model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# Load SpaCy model for NLP tasks
nlp = spacy.load("en_core_web_sm")

def extract_transcript(video_id):
    """Fetch the transcript of a YouTube video as plain text.

    Args:
        video_id: Identifier of the YouTube video to fetch.

    Returns:
        The transcript rendered by ``TextFormatter``, or ``None`` when any
        error occurs while fetching or formatting (the error is printed).
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        return TextFormatter().format_transcript(segments)
    except Exception as e:
        # Best-effort: report the problem and let the caller handle None.
        print(f"Error fetching transcript: {e}")
    return None

def preprocess_text(text):
    """Normalise raw text before analysis and embedding.

    The text is lowercased, then three regex passes are applied in order:
    strip every character that is neither a word character nor whitespace,
    strip runs of digits, and collapse whitespace runs to a single space.
    Leading and trailing whitespace is trimmed at the end.

    Args:
        text: Input string; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    cleaned = text.lower()

    # (pattern, replacement) pairs, applied sequentially
    substitutions = (
        (r'[^\w\s]', ''),  # punctuation / special characters
        (r'\d+', ''),      # digits
        (r'\s+', ' '),     # redundant whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def pos_tagging(text):
    """Run the module-level spaCy pipeline on ``text`` and summarise it.

    Args:
        text: The text to analyse.

    Returns:
        A dict with four keys:
            ``pos_distribution`` - count of tokens per ``token.pos_`` value;
            ``entities`` - list of ``(text, label)`` pairs from ``doc.ents``;
            ``noun_phrases`` - text of every span in ``doc.noun_chunks``;
            ``common_verbs`` - the 10 most frequent lemmas among tokens
            tagged ``VERB``, mapped to their counts.
    """
    parsed = nlp(text)

    tag_frequencies = Counter(token.pos_ for token in parsed)
    verb_frequencies = Counter(
        token.lemma_ for token in parsed if token.pos_ == "VERB"
    )
    named_entities = [(span.text, span.label_) for span in parsed.ents]
    phrases = [span.text for span in parsed.noun_chunks]

    return {
        "pos_distribution": dict(tag_frequencies),
        "entities": named_entities,
        "noun_phrases": phrases,
        "common_verbs": dict(verb_frequencies.most_common(10)),
    }

def get_embedding(text):
    """Encode ``text`` with the module-level sentence-transformer model.

    Args:
        text: The text to embed.

    Returns:
        The result of ``model.encode(text)`` converted with ``.tolist()``.
    """
    encoded = model.encode(text)
    return encoded.tolist()

def chunk_text(text, chunk_size=150, overlap=20):
    """Split text into overlapping word-based chunks.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the last word, so no trailing chunk is produced that would
    consist solely of words already contained in the previous chunk.

    Args:
        text: The text to split (words are separated on whitespace).
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings. When the text has at most ``chunk_size``
        words, the original ``text`` is returned unchanged as the only item.

    Raises:
        ValueError: If the text needs splitting and ``overlap`` is not
            smaller than ``chunk_size`` (the window could not advance).
    """
    words = text.split()
    if len(words) <= chunk_size:
        return [text]

    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would either crash range() or silently
        # return no chunks at all, losing the whole text.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This window already covers the final word; any further window
        # would only repeat words from the overlap region.
        if start + chunk_size >= len(words):
            break

    return chunks

def generate_embeddings(video_id):
    """Build chunk embeddings and a POS analysis for a YouTube video.

    The transcript is fetched, cleaned with ``preprocess_text``, analysed
    with ``pos_tagging`` and split with ``chunk_text``; every chunk longer
    than 10 characters (after stripping) is embedded with ``get_embedding``.

    Args:
        video_id: Identifier of the YouTube video to process.

    Returns:
        A tuple ``(embeddings, pos_analysis)`` where ``embeddings`` is a
        ``float32`` NumPy array built from the per-chunk embeddings and
        ``pos_analysis`` is the dict returned by ``pos_tagging``. Returns
        ``(None, None)`` when no transcript could be obtained.
    """
    raw_text = extract_transcript(video_id)
    if not raw_text:
        return None, None

    cleaned = preprocess_text(raw_text)
    analysis = pos_tagging(cleaned)

    # Word-based chunking keeps each embedding input at a bounded size
    pieces = chunk_text(cleaned)
    print(f"Created {len(pieces)} text chunks from transcript")

    # Skip chunks that are effectively empty (10 characters or fewer)
    vectors = [
        get_embedding(piece)
        for piece in pieces
        if len(piece.strip()) > 10
    ]
    print(f"Generated {len(vectors)} embeddings")

    return np.array(vectors, dtype=np.float32), analysis

def store_embeddings_in_pinecone(embeddings, video_id, pos_analysis=None):
    """Upsert chunk embeddings for a video into the module-level index.

    Each vector gets the id ``"<video_id>_<n>"`` and metadata holding the
    video id and chunk number. Stringified POS analysis fields are attached
    to the first vector only, to avoid duplicating them on every chunk.
    Vectors are sent in batches of 100, with progress printed per batch.

    Args:
        embeddings: Sequence of vectors; each item must provide ``.tolist()``.
            ``None`` or an empty sequence makes the call a no-op.
        video_id: Identifier used in vector ids and metadata.
        pos_analysis: Optional dict as returned by ``pos_tagging``; when
            truthy, a summary is printed after the upload.

    Returns:
        None.
    """
    if embeddings is None or len(embeddings) == 0:
        return

    records = []
    for position, vector in enumerate(embeddings):
        meta = {"video_id": video_id, "chunk_id": position}

        # Only the first chunk carries the (stringified) analysis
        if position == 0 and pos_analysis:
            meta["pos_distribution"] = str(pos_analysis["pos_distribution"])
            meta["common_entities"] = str(pos_analysis["entities"][:10])
            meta["common_verbs"] = str(pos_analysis["common_verbs"])

        records.append({
            "id": f"{video_id}_{position}",
            "values": vector.tolist(),
            "metadata": meta,
        })

    # Upsert in batches to avoid size limitations
    batch_size = 100
    total_batches = (len(records) - 1) // batch_size + 1
    offsets = range(0, len(records), batch_size)
    for batch_number, offset in enumerate(offsets, start=1):
        index.upsert(vectors=records[offset:offset + batch_size])
        print(f"Upserted batch {batch_number}/{total_batches} to Pinecone")

    print(f"Successfully upserted {len(records)} embeddings into Pinecone.")

    if not pos_analysis:
        return

    print("\nPOS Analysis Summary:")

    top_pos = dict(Counter(pos_analysis['pos_distribution']).most_common(5))
    print(f"- Most common parts of speech: {top_pos}")

    entities = pos_analysis['entities']
    print(f"- Top entities detected: {entities[:5] if entities else 'None'}")

    phrases = pos_analysis['noun_phrases']
    print(f"- Sample noun phrases: {phrases[:5] if phrases else 'None'}")

    verbs = pos_analysis['common_verbs']
    print(f"- Most common verbs: {list(verbs.keys())[:5] if verbs else 'None'}")

def main():
    """Process one hard-coded YouTube video end to end.

    Generates embeddings for the video's transcript and, when at least one
    embedding was produced, stores them in Pinecone; otherwise prints a
    failure message.
    """
    yt_video_id = "CqOfi41LfDw"  # Replace with actual YouTube video ID

    print(f"Processing YouTube video: {yt_video_id}")
    print("Extracting transcript and generating embeddings...")

    embeddings, pos_analysis = generate_embeddings(yt_video_id)

    # Nothing to store: transcript missing or every chunk was filtered out
    if embeddings is None or len(embeddings) == 0:
        print("Failed to generate embeddings")
        return

    print(f"Generated {len(embeddings)} embedding chunks")
    store_embeddings_in_pinecone(embeddings, yt_video_id, pos_analysis)


if __name__ == "__main__":
    main()


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


