**Import Libraries**

In [4]:
import sqlite3
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np



KeyboardInterrupt



In [2]:
pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence_transformers
  Obtaining dependency information for sentence_transformers from https://files.pythonhosted.org/packages/8b/c8/990e22a465e4771338da434d799578865d6d7ef1fdb50bd844b7ecdcfa19/sentence_transformers-3.3.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl.metadata
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.1 kB 640.0 kB/s eta 0:00:01
     -------------------------- ----------- 30.7/44.1 kB 259.2 kB/s eta 0



**Load and Preprocess Subtitle Data**

In [None]:
# Step 2.1: Connect to the Database
db_path = "path_to_your_database.db"  # Replace with the actual path
conn = sqlite3.connect(db_path)

# Step 2.2: Read Subtitle Data
query = "SELECT * FROM subtitles_table"  # Replace with the actual table name
try:
    # Load the whole table into memory; nothing below needs the database again.
    df = pd.read_sql_query(query, conn)
finally:
    # Release the file handle even if the query fails (e.g. wrong table name),
    # so re-running this cell does not accumulate open connections.
    conn.close()

# Step 2.3: Clean Subtitle Text
def clean_text(text):
    """Strip cue markup from one raw subtitle string.

    Removes, in order: anything inside square brackets (e.g. ``[Music]``),
    ``H:M:S`` timestamps including an optional ``,mmm``/``.mmm`` fraction and
    an optional ``--> H:M:S[,mmm]`` range tail, and finally collapses runs of
    whitespace to single spaces.

    Args:
        text: Raw subtitle text. Non-string values (``None``, NaN, ...) are
            treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Missing values would otherwise make re.sub raise TypeError and abort the
    # whole DataFrame.apply call.
    if not isinstance(text, str):
        return ""

    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    # Match the full cue range so no ",000 --> ,000" residue is left behind.
    text = re.sub(
        r'\d+:\d+:\d+(?:[.,]\d+)?(?:\s*-->\s*\d+:\d+:\d+(?:[.,]\d+)?)?',
        '',
        text,
    )
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Run every raw subtitle through clean_text and keep the result in a new column.
# Replace 'subtitle_text' with the correct column name
df['cleaned_text'] = df['subtitle_text'].map(clean_text)


**Generate Text Embeddings**

In [None]:
# TF-IDF Vectorization (sparse lexical representation used by the keyword retrieval cell)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Sentence-embedding model used by the ChromaDB, query-embedding and
# semantic-search cells below. "all-MiniLM-L6-v2" is a small general-purpose
# checkpoint; substitute any other Sentence Transformers model name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Words per chunk when splitting a subtitle for the vector store.
CHUNK_SIZE_WORDS = 200


def chunk_text(text, chunk_size=CHUNK_SIZE_WORDS):
    """Split ``text`` into consecutive windows of at most ``chunk_size`` words.

    Returns a list of strings; an empty or whitespace-only ``text`` yields ``[]``.
    """
    words = text.split()
    return [" ".join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)]


# List of text chunks per row, consumed by the ChromaDB cell.
df['chunks'] = df['cleaned_text'].apply(chunk_text)

# One dense vector per row, consumed by the semantic-search cell.
# NOTE(review): very long subtitles may be truncated to the model's maximum
# sequence length; confirm whether per-chunk vectors are needed instead.
df['embeddings'] = list(model.encode(df['cleaned_text'].tolist()))


**Store Embeddings in a Database (Optional: Using ChromaDB)**

In [None]:
# Assuming ChromaDB setup
# Install using `pip install chromadb`
import chromadb
client = chromadb.Client()

# Fetch the collection if it already exists so re-running this cell does not
# fail on a duplicate collection name.
collection = client.get_or_create_collection(name="subtitles")

# Add Embeddings and Metadata to Collection
for index, row in df.iterrows():
    chunks = list(row['chunks'])
    if not chunks:
        # Nothing to store for this row.
        continue

    # Encode all of the row's chunks in one batch instead of one call per chunk.
    chunk_embeddings = model.encode(chunks)

    collection.upsert(
        # Chroma requires one unique string id per record.
        # Assumes row['id'] is unique per row; verify against the table schema.
        ids=[f"{row['id']}-{chunk_no}" for chunk_no in range(len(chunks))],
        documents=chunks,
        metadatas=[{"id": row['id']} for _ in chunks],  # Add relevant metadata
        # Plain Python lists are accepted by every Chroma version.
        embeddings=[vector.tolist() for vector in chunk_embeddings],
    )


**Process User Query**

In [None]:
import speech_recognition as sr

def audio_to_text(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the recognised text.

    The file is opened with ``sr.AudioFile``, read in full with
    ``Recognizer.record`` and passed to ``Recognizer.recognize_google``.
    Errors raised by the ``speech_recognition`` calls are not caught here.

    NOTE(review): recognize_google presumably needs network access to a remote
    speech service; confirm before using offline.
    """
    speech_engine = sr.Recognizer()
    with sr.AudioFile(audio_path) as audio_file:
        recording = speech_engine.record(audio_file)
    transcript = speech_engine.recognize_google(recording)
    return transcript


**Generate Query Embedding**

In [None]:
# Query text to search for.
# NOTE(review): the query is used as-is; the subtitle corpus is passed through
# clean_text but the query is not. Confirm whether it should be.
user_query = "Example user query"  # Replace with actual query or output from audio_to_text
# Encode the query with the same `model` object that encodes the subtitle chunks.
query_embedding = model.encode(user_query)


**Retrieve Relevant Subtitles**

In [None]:
# Compute Cosine Similarity
query_tfidf = tfidf_vectorizer.transform([user_query])
cosine_sim = cosine_similarity(query_tfidf, tfidf_matrix)

# Get Top N Results
top_n = 5
top_indices = cosine_sim[0].argsort()[-top_n:][::-1]
results = df.iloc[top_indices]
print(results)


**Using Semantic Search and Cosine Similarity**

In [None]:
# Compute Similarity for Semantic Embeddings
similarities = [cosine_similarity([query_embedding], [embedding])[0][0] for embedding in df['embeddings']]
df['similarity'] = similarities

# Get Top N Results
top_results = df.nlargest(5, 'similarity')
print(top_results[['cleaned_text', 'similarity']])


**Display Results**

In [None]:
# Print one four-line record per hit: id, cleaned text, score, then a divider.
separator = "-" * 50
for _, hit in top_results.iterrows():
    print(
        f"Video ID: {hit['id']}",  # Replace with actual column
        f"Subtitle: {hit['cleaned_text']}",
        f"Similarity Score: {hit['similarity']}",
        separator,
        sep="\n",
    )
