In [26]:
import os

from pinecone import Pinecone
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder

# Initialize Pinecone.
# The API key is read from the environment instead of being embedded in the
# source, so the credential never lands in version control or a shared notebook.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as compromised and rotated.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this code."
    )
pc = Pinecone(api_key=_pinecone_api_key, region="us-east-1")
index_name = "video-embeddings"  # name of the index queried by search_transcript()
index = pc.Index(index_name)

# Checkpoint names, spelled out once so the two model roles are easy to tell apart.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Sentence-embedding model used by get_embedding() to vectorize text.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# Cross-encoder used by rerank_results() to score (query, text) pairs.
reranker = CrossEncoder(RERANKER_MODEL_NAME)

def get_embedding(text):
    """Encode *text* with the module-level Sentence Transformer ``model``.

    Returns the output of ``model.encode(text)`` converted with ``.tolist()``,
    i.e. as plain Python values rather than an array object.
    """
    vector = model.encode(text)
    return vector.tolist()

def bm25_search(query, corpus, top_k=5):
    """Rank *corpus* documents against *query* with BM25 keyword scoring.

    Documents and the query are tokenized with ``str.split()`` (whitespace
    only, case-sensitive), scored with ``BM25Okapi``, and the highest-scoring
    ``top_k`` documents are returned.

    Args:
        query: Query string.
        corpus: Iterable of document strings.
        top_k: Maximum number of results to return.  Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A list of ``(document, score)`` tuples sorted by descending score.
        Empty when the corpus is empty or ``top_k`` is not positive.
    """
    # Materialize once: the corpus is walked twice below (tokenizing and
    # pairing with scores), which a one-shot iterator would not survive.
    documents = list(corpus)

    # BM25Okapi cannot be built from an empty corpus (its average document
    # length computation divides by the corpus size), so short-circuit here.
    if not documents or top_k <= 0:
        return []

    bm25 = BM25Okapi([doc.split() for doc in documents])
    scores = bm25.get_scores(query.split())
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

def rerank_results(query, retrieved_texts):
    """Re-order *retrieved_texts* by cross-encoder relevance to *query*.

    Each candidate is paired with the query, scored by the module-level
    ``reranker``, and the candidates are returned from highest to lowest
    score.

    Args:
        query: Query string.
        retrieved_texts: Iterable of candidate strings to re-rank.

    Returns:
        A new list with the same candidates, best match first.  Candidates
        with equal scores keep their incoming relative order.  Empty input
        yields an empty list without calling the model.
    """
    candidates = list(retrieved_texts)

    # Nothing to score; also avoids handing an empty batch to the model,
    # which CrossEncoder.predict is not guaranteed to accept.
    if not candidates:
        return []

    scores = reranker.predict([(query, text) for text in candidates])

    # Sort on the score alone so ties never fall through to comparing the
    # candidates themselves; sorted() is stable, so ties keep input order.
    scored = sorted(zip(scores, candidates), key=lambda pair: pair[0], reverse=True)
    return [text for _, text in scored]

def search_transcript(user_query, top_k=10, text_key=None):
    """Return Pinecone matches for *user_query*, re-ranked by the cross-encoder.

    The query is embedded with ``get_embedding``, the ``top_k`` nearest
    vectors are fetched from the module-level Pinecone ``index``, and the
    candidates are re-ordered with ``rerank_results``.

    NOTE(review): only the dense (semantic) Pinecone query runs here;
    ``bm25_search`` is not called, so this is not a hybrid search.

    Args:
        user_query: Natural-language query string.
        top_k: Number of nearest neighbours to request from Pinecone.
        text_key: Optional name of the metadata field that holds each
            chunk's text.  With the default ``None`` no metadata is
            requested and the candidates are the vector IDs themselves.
            NOTE(review): IDs (e.g. ``ukzFI9rgwfU_1``) carry no readable
            content, so the cross-encoder ordering is only meaningful when
            ``text_key`` is supplied -- confirm which metadata field, if
            any, was populated when the vectors were upserted.

    Returns:
        A list of strings ordered from most to least relevant: vector IDs
        when ``text_key`` is ``None``, otherwise the value stored under
        ``text_key`` (falling back to the ID for matches that lack it).
        Empty when Pinecone returns no matches.
    """
    query_embedding = get_embedding(user_query)

    use_metadata = text_key is not None
    if use_metadata:
        result = index.query(
            vector=query_embedding, top_k=top_k, include_metadata=True
        )
    else:
        result = index.query(vector=query_embedding, top_k=top_k)

    candidates = []
    for match in result['matches']:
        candidate = match['id']
        if use_metadata:
            # Metadata may be absent on individual vectors; keep the ID then.
            metadata = match.get('metadata') or {}
            candidate = metadata.get(text_key) or candidate
        candidates.append(candidate)

    # No matches: skip the re-ranker instead of passing it an empty batch.
    if not candidates:
        return []

    return rerank_results(user_query, candidates)


In [28]:
# Run one sample query end-to-end and print the hits in ranked order.
user_query = "What are machines?"
retrieved_results = search_transcript(user_query)

print("\n🔹 Most Relevant Transcript Sections:")
for rank, section in enumerate(retrieved_results, start=1):
    print(f"{rank}. {section}\n")



🔹 Most Relevant Transcript Sections:
1. ukzFI9rgwfU_1

2. ukzFI9rgwfU_33

3. VMj-3S1tku0_44

4. i_LwzRVP7bg_158

5. VMj-3S1tku0_307

6. i_LwzRVP7bg_16

7. i_LwzRVP7bg_14

8. bmmQA8A-yUA_20

9. bmmQA8A-yUA_25

10. bmmQA8A-yUA_24

