In [1]:
import pandas as pd

# Load datasets
meta_df = pd.read_csv("/content/Master_Task1_withTranscriptFlag.csv")

# Try reading with a different engine and error handling
try:
    trans_df = pd.read_csv("/content/Master_task2_Cleaned_main.csv", engine='python', on_bad_lines='skip')
except Exception as e:
    print(f"Error reading CSV: {e}")
    # If still fails, try reading with a different delimiter or quoting
    trans_df = pd.read_csv("/content/Master_task2_Cleaned_main.csv", engine='python', sep='\t', on_bad_lines='skip')


# Ensure consistent column name for merging
trans_df.rename(columns={"video_id": "id"}, inplace=True)

# Merge on video ID
merged_df = pd.merge(meta_df, trans_df, on="id", how="inner")

# Remove empty or missing transcripts
merged_df["transcript"] = merged_df["transcript"].fillna("").astype(str)
merged_df = merged_df[merged_df["transcript"].str.strip() != ""]

print(f"✅ Merged dataset shape: {merged_df.shape}")

# Save outputs
merged_df.to_csv("Merged_VideoData.csv", index=False)
merged_df.to_parquet("Merged_VideoData.parquet", index=False)

print("💾 Saved merged dataset as CSV and Parquet.")

✅ Merged dataset shape: (607, 26)
💾 Saved merged dataset as CSV and Parquet.


In [2]:

import pandas as pd
from sentence_transformers import SentenceTransformer

# Load merged dataset
merged_df = pd.read_csv("/content/Merged_VideoData.csv")

# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Combine title and transcript for embeddings
merged_df["combined_text"] = merged_df["title"] + " " + merged_df["transcript"]

# Generate embeddings
embeddings = model.encode(merged_df["combined_text"].tolist(), show_progress_bar=True)

# Add embeddings to DataFrame
merged_df["embedding"] = embeddings.tolist()

# Save outputs
merged_df.to_csv("Merged_Embeddings.csv", index=False)
merged_df.to_parquet("Merged_Embeddings.parquet", index=False)

print(f"✅ Embeddings generated and saved for {len(merged_df)} videos.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Embeddings generated and saved for 607 videos.


In [1]:
!pip install chromadb



In [2]:
import pandas as pd
import numpy as np
import chromadb

# ============================
# Load dataset from Parquet
# ============================
merged_df = pd.read_parquet("/content/Merged_Embeddings.parquet")

# Initialize persistent ChromaDB client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(name="youtube_videos")

# Remove duplicate IDs
merged_df.drop_duplicates(subset=['id'], inplace=True)

# Convert embedding column safely (if stored as string)
def parse_embedding(x):
    if isinstance(x, str):
        return np.array(eval(x))
    return np.array(x)

merged_df["embedding"] = merged_df["embedding"].apply(parse_embedding)

# Stack embeddings into a single array
embeddings = np.vstack(merged_df["embedding"].values)

# Add data to ChromaDB
collection.add(
    ids=merged_df["id"].astype(str).tolist(),
    embeddings=embeddings,
    metadatas=merged_df[["title", "transcript"]].to_dict(orient="records"),
    documents=merged_df["combined_text"].astype(str).tolist()
)

print(f"✅ Stored {len(merged_df)} videos in ChromaDB collection 'youtube_videos'.")
print("🎯 Data is ready for semantic search queries.")


✅ Stored 531 videos in ChromaDB collection 'youtube_videos'.
🎯 Data is ready for semantic search queries.


In [3]:
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# ===============================
# 1️⃣ Query Input Handling
# ===============================
def get_user_query():
    query = input("🔍 Enter your search query: ").strip()
    if not query:
        raise ValueError("❌ Query cannot be empty. Please enter a valid search term.")
    # Optional preprocessing
    query = ''.join(c for c in query if c.isalnum() or c.isspace())
    return query


# ===============================
# 2️⃣ Query Embedding Generation
# ===============================
def generate_query_embedding(query):
    print("⚙️ Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embedding = model.encode(query, convert_to_numpy=True)
    return embedding


# ===============================
# 3️⃣ Perform Semantic Search
# ===============================
def search_chromadb(query_embedding, top_k=5):
    print("🔎 Connecting to ChromaDB...")
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="youtube_videos")

    # Perform the semantic search
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k,
        include=["metadatas", "documents", "distances"]
    )
    return results


# ===============================
# 4️⃣ Format and Filter Results
# ===============================
def format_results(results, min_score=0.2):
    formatted = []
    for i in range(len(results["ids"][0])):
        score = 1 / (1 + results["distances"][0][i])
  # Convert distance to similarity
        if score >= min_score:
            data = {
                "rank": i + 1,
                "title": results["metadatas"][0][i].get("title", "N/A"),
                "transcript": results["metadatas"][0][i].get("transcript", "")[:200] + "...",
                "similarity_score": round(score, 3)
            }
            formatted.append(data)
    return formatted


# ===============================
# 5️⃣ Display Results
# ===============================
def display_results(formatted_results):
    if not formatted_results:
        print("❌ No relevant results found.")
        return
    print("\n🎯 Top Search Results:")
    for r in formatted_results:
        print(f"\nRank {r['rank']}")
        print(f"Title: {r['title']}")
        print(f"Similarity Score: {r['similarity_score']}")
        print(f"Transcript (Preview): {r['transcript']}")


# ===============================
# 🚀 Main Script
# ===============================
if __name__ == "__main__":
    try:
        query = get_user_query()
        query_embedding = generate_query_embedding(query)
        results = search_chromadb(query_embedding, top_k=5)
        formatted_results = format_results(results, min_score=0.2)
        display_results(formatted_results)
    except Exception as e:
        print(f"⚠️ Error: {e}")


🔍 Enter your search query: ai tutorial
⚙️ Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


🔎 Connecting to ChromaDB...

🎯 Top Search Results:

Rank 1
Title: how to use ai as an accelerator, not a crutch, with freelance engineer ankur tyagi podcast 186
Similarity Score: 0.565
Transcript (Preview): welcome back to the free code camp podcast. i'm quincy larson, teacher and founder of free code camp. and this is a new season of the podcast. i'm going to be experimenting with the format a little bi...

Rank 2
Title: how to build advanced ai agents course for beginners livekit, exa, langchain
Similarity Score: 0.544
Transcript (Preview): welcome to this comprehensive course where we will build three cutting edge ai agents from scratch. first, you'll create a sophisticated sales agent that can engage in natural real time conversations ...

Rank 3
Title: production grade ai project tutorial build deploy
Similarity Score: 0.541
Transcript (Preview): this course will teach you to build a powerful enterprisegrade ai system that prepares highquality training data for tasks like creating

In [4]:
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# ===============================
# 1️⃣ Generate query embedding
# ===============================
def generate_query_embedding(query_text):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode(query_text, convert_to_numpy=True)
    return query_embedding

# ===============================
# 2️⃣ Search top 5 results
# ===============================
def search_top_videos(query_text, top_k=5):
    # Load model & encode query
    embedding = generate_query_embedding(query_text)

    # Connect to ChromaDB
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="youtube_videos")

    # Perform semantic search
    results = collection.query(
        query_embeddings=embedding.tolist(),
        n_results=top_k,
        include=["metadatas", "documents", "distances"]
    )

    # Format results
    formatted_results = []
    for i in range(len(results["ids"][0])):
        score = 1 / (1 + results["distances"][0][i])
  # Convert distance to similarity
        formatted_results.append({
            "rank": i + 1,
            "title": results["metadatas"][0][i].get("title", "N/A"),
            "similarity_score": round(score, 3),
            "transcript": results["metadatas"][0][i].get("transcript", "")[:200] + "..."
        })

    return formatted_results

# ===============================
# 3️⃣ Display results
# ===============================
def display_results(results):
    if not results:
        print("❌ No relevant videos found.")
        return

    print("\n🎯 Top 5 Most Relevant Videos:")
    for r in results:
        print(f"\nRank {r['rank']}")
        print(f"Title: {r['title']}")
        print(f"Similarity Score: {r['similarity_score']}")
        print(f"Transcript Preview: {r['transcript']}")

# ===============================
# 🚀 Run the search
# ===============================
if __name__ == "__main__":
    user_query = input("🔍 Enter your search query: ").strip()
    if not user_query:
        print("❌ Please enter a valid query.")
    else:
        top_results = search_top_videos(user_query, top_k=5)
        display_results(top_results)


🔍 Enter your search query: python 

🎯 Top 5 Most Relevant Videos:

Rank 1
Title: how to go from 0 to 100 in python part 2
Similarity Score: 0.496
Transcript Preview: if you want to become a python developer as quickly as possible, then you have to skip through the fluff and focus on exactly what matters. now, once you've done that and you're really comfortable wri...

Rank 2
Title: 99 of devs don t know what this means in python
Similarity Score: 0.493
Transcript Preview: i'm willing to bet that most of you have seen this code before and probably have no idea what it actually does. so in this short video, i'm going to break that down. so let's go ahead and get started....

Rank 3
Title: why learning python won t land you a job in tech
Similarity Score: 0.488
Transcript Preview: please don't hate me, but if you want to land a job, you should probably stop learning python. when everyone knows python, it's no longer valuable. now, sure, it's good to know, but you're not going t...

Rank 4