In [None]:
!pip install sentence-transformers faiss-cpu pandas


^C


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.0-cp310-cp310-win_amd64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_tra

In [None]:
import pandas as pd
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# Build a FAISS similarity index over the quotes' 'text' column and persist
# the index, the row metadata, and the raw embeddings next to the notebook.

# === 1. Load Cleaned CSV ===
df = pd.read_csv("cleaned_movie_quotes.csv")  # Update with your actual filename

# Fail early with a readable message instead of a bare KeyError further down.
if "text" not in df.columns:
    raise KeyError("Input CSV must contain a 'text' column")

# Missing cells are read as float NaN, which the encoder cannot tokenize.
# Drop them and renumber the rows so that DataFrame row i is FAISS vector id i.
df = df.dropna(subset=["text"]).reset_index(drop=True)
if df.empty:
    raise ValueError("No rows with non-empty 'text' found; nothing to index")

# === 2. Load Embedding Model ===
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast + accurate

# === 3. Generate Embeddings for the 'text' Column ===
print("Generating embeddings...")
# astype(str) guards against non-string cells (e.g. a purely numeric quote).
texts = df['text'].astype(str).tolist()
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
# FAISS only accepts C-contiguous float32 matrices; this is a no-op when the
# encoder already returned one.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# === 4. Build FAISS Index ===
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
index.add(embeddings)
# Search results are positional ids, so the metadata must line up 1:1.
assert index.ntotal == len(df), "FAISS index and metadata are out of sync"
print(f"FAISS index built with {index.ntotal} vectors.")

# === 5. Save FAISS index and metadata ===
faiss.write_index(index, "quote_index.faiss")
df.to_pickle("quote_metadata.pkl")  # Stores text + metadata
np.save("quote_embeddings.npy", embeddings)

print("Index and metadata saved.")
