In [None]:
!pip install sentence-transformers faiss-cpu pandas

In [None]:
import pandas as pd
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# === 1. Load Cleaned CSV ===
df = pd.read_csv("/content/cleaned_movie_quotes_final_i_think.csv")  # Update with your actual filename

# Drop rows with missing text, then renumber the rows 0..n-1 so that each row's
# label equals its position. FAISS assigns ids sequentially in insertion order,
# so after this step "row i" and "vector i" mean the same thing for both
# positional (.iloc) and label-based (.loc) lookups.
df = df.dropna(subset=['text']).reset_index(drop=True)

# Ensure all text is string type
df['text'] = df['text'].astype(str)

if df.empty:
    # Fail early with a clear message instead of an opaque shape error below.
    raise ValueError("No rows with non-missing 'text' were found; nothing to index.")

# === 2. Load Embedding Model ===
# The same model name must be used later to embed queries against this index.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === 3. Generate Embeddings for the 'text' Column ===
print("Generating embeddings...")
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True, convert_to_numpy=True)
# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dimension).
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# === 4. Build FAISS Index ===
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
# Every metadata row must have exactly one vector, otherwise search results
# would be mapped to the wrong quotes.
assert index.ntotal == len(df), "FAISS index size does not match the number of metadata rows"
print(f"FAISS index built with {index.ntotal} vectors.")

# === 5. Save FAISS index and metadata ===
faiss.write_index(index, "quote_index.faiss")
df.to_pickle("quote_metadata.pkl")  # Stores text + metadata
np.save("quote_embeddings.npy", embeddings)

print("Index and metadata saved.")


In [None]:
# Report how many quotes remain after rows without text were removed.
print(f"Remaining rows after dropping missing text: {len(df)}")


In [None]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# === 1. Load the index, metadata, and model ===
# These three files are the artifacts written by the index-building cell earlier
# in this notebook; on a fresh runtime that cell has to run first.
index = faiss.read_index("quote_index.faiss")
# NOTE(review): unpickling can execute arbitrary code - only load a pickle that
# this notebook produced itself.
metadata = pd.read_pickle("quote_metadata.pkl")
embeddings = np.load("quote_embeddings.npy")  # Optional, not required for search
# Queries are embedded with the same model name that was used to build the index.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === 2. Define retrieval function ===
def retrieve_similar_quotes(query, top_k=5):
    """Print the quotes most similar to ``query``.

    The query is embedded with the module-level ``model``, searched against the
    module-level FAISS ``index``, and each hit is looked up by position in
    ``metadata`` and printed.

    Note: a later cell in this notebook defines another function with the same
    name that returns formatted strings instead of printing.

    Args:
        query: Free-text query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        None. Results are written to stdout.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    print(f"\nTop {top_k} results for: \"{query}\"")
    rank = 0
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when fewer than top_k vectors are
        # available. Indexing with -1 would silently print the LAST metadata
        # row as if it were a match, so padded slots are skipped.
        if idx < 0:
            continue
        rank += 1
        row = metadata.iloc[idx]
        print(f"\nResult #{rank}")
        print(f"Text   : {row['text']}")
        print(f"Char   : {row['character_name']}")
        print(f"Title  : {row['title']} ({row['year']})")
        print(f"Genre  : {row['genres']}")
        print(f"Gender : {row['gender']}")
        # This is the distance reported by the index, so smaller means more
        # similar (the index is built as IndexFlatL2 elsewhere in this notebook).
        print(f"Score  : {distance:.4f}")

    if rank == 0:
        print("\nNo matching quotes found.")

# === 3. Try a test query ===
# Smoke test with the default number of neighbours spelled out.
retrieve_similar_quotes(query="I'm the king of the world!", top_k=5)


In [None]:
# Second example query, default number of neighbours spelled out.
retrieve_similar_quotes(query="I'm done.", top_k=5)

In [None]:
# Third example query, default number of neighbours spelled out.
retrieve_similar_quotes(query="I'm not a bad person.", top_k=5)

# RAG: Retrieval-Augmented Generation with Gemini

In [None]:
!pip install google-generativeai


In [None]:
import os
from getpass import getpass

import faiss
import google.generativeai as genai
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# === 1. Initialize Gemini API key ===
# SECURITY: never store an API key in the notebook itself. The key is read from
# the GOOGLE_API_KEY environment variable; if that is not set, it is requested
# interactively (getpass does not echo it into the cell output).
# The key that used to be hard-coded in this cell has been exposed to everyone
# with access to the notebook and must be revoked and replaced.
gemini_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
if not gemini_api_key:
    raise RuntimeError("A Gemini API key is required (set GOOGLE_API_KEY or enter it when prompted).")
genai.configure(api_key=gemini_api_key)

# Initialize the GenerativeModel
model_genai = genai.GenerativeModel('gemini-1.5-flash') # Use the correct model name

# === 2. Load cleaned data and embeddings ===
# NOTE(review): unpickling can execute arbitrary code - only load a pickle that
# this notebook produced itself.
df = pd.read_pickle("quote_metadata.pkl")        # your cleaned data with metadata
embeddings = np.load("quote_embeddings.npy")     # precomputed embeddings
index = faiss.read_index("quote_index.faiss")    # FAISS index

# Search results are mapped to rows of `df` by position, so a stale or
# mismatched pair of files would return the wrong quotes without any error.
if index.ntotal != len(df):
    raise RuntimeError(
        f"FAISS index has {index.ntotal} vectors but the metadata has {len(df)} rows; "
        "rebuild the index so both files come from the same run."
    )

# === 3. Load embedding model for queries ===
model_embedding = SentenceTransformer('all-MiniLM-L6-v2') # Renamed to avoid confusion with genai model

def retrieve_similar_quotes(query, top_k=5):
    """Return formatted strings for the quotes most similar to ``query``.

    The query is embedded with the module-level ``model_embedding``, searched
    against the module-level FAISS ``index``, and each hit is looked up by
    position in ``df``.

    Note: this redefines the printing function of the same name from an earlier
    cell in this notebook.

    Args:
        query: Free-text query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of at most ``top_k`` strings of the form
        ``"<text>" — <character> (<title>) [Genres: <genres>]``, best match
        first. Missing ``character_name``/``title`` columns fall back to
        ``'Unknown'`` and a missing ``genres`` column to ``[]``.
    """
    # Embed the query
    query_vec = model_embedding.encode([query], convert_to_numpy=True)

    # Search in FAISS index (distances are not needed here)
    _, neighbor_ids = index.search(query_vec, top_k)

    results = []
    for idx in neighbor_ids[0]:
        # FAISS pads the result with label -1 when fewer than top_k vectors are
        # available. df.iloc[-1] would silently return the LAST row and feed an
        # unrelated quote to the LLM, so padded slots are skipped.
        if idx < 0:
            continue
        row = df.iloc[idx]
        text = row['text']
        char = row.get('character_name', 'Unknown')
        title = row.get('title', 'Unknown')
        genres = row.get('genres', [])
        results.append(f'"{text}" — {char} ({title}) [Genres: {genres}]')
    return results

def generate_response_with_context(user_query, retrieved_quotes):
    """Ask the Gemini model to answer ``user_query`` using retrieved quotes.

    The quotes are joined one per line and placed before the user's question in
    a single prompt, which is sent to the module-level ``model_genai`` with
    ``temperature=0.7`` and ``max_output_tokens=256``.

    Args:
        user_query: The user's question.
        retrieved_quotes: Iterable of already formatted quote strings.

    Returns:
        The generated text. If the response contains no usable text (accessing
        ``response.text`` raises ``ValueError``, e.g. when the output was
        blocked), a bracketed explanatory message is returned instead so that
        an interactive loop calling this function keeps running.
    """
    # Combine retrieved quotes as context for Gemini
    context_str = "\n".join(retrieved_quotes)
    full_prompt = f"Here are some relevant movie quotes:\n{context_str}\n\nUser question: {user_query}\nAnswer:"

    generation_config = genai.GenerationConfig(
        temperature=0.7,
        max_output_tokens=256,
    )
    response = model_genai.generate_content(
        full_prompt,
        generation_config=generation_config,
    )

    try:
        # .text is a convenience accessor that raises ValueError when the
        # response does not contain a text part.
        return response.text
    except ValueError as exc:
        return f"[No response generated: {exc}]"

def run_quote_chat(top_k=5):
    """Run the interactive retrieve-then-generate loop until the user quits.

    For every non-empty query this calls ``retrieve_similar_quotes`` and
    ``generate_response_with_context`` and prints both the retrieved quotes and
    the generated answer.

    The loop ends when the user types ``exit`` (any letter case), when stdin is
    closed (``EOFError``), or when the prompt is interrupted
    (``KeyboardInterrupt``). Blank input is ignored rather than being embedded
    and sent to the LLM.

    Args:
        top_k: Number of quotes to retrieve for each query.
    """
    print("Welcome to the Movie Quote RAG system with Gemini LLM.")
    while True:
        try:
            user_query = input("\nEnter your query (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave quietly instead of ending the cell with a traceback.
            print()
            break
        if user_query.lower() == 'exit':
            break
        if not user_query:
            # Nothing to search for; ask again.
            continue

        # Step 1: Retrieve similar quotes
        retrieved = retrieve_similar_quotes(user_query, top_k=top_k)
        print(f"\nTop {top_k} similar quotes:")
        for i, quote in enumerate(retrieved, 1):
            print(f"{i}. {quote}")

        # Step 2: Generate answer with Gemini LLM
        answer = generate_response_with_context(user_query, retrieved)
        print("\nGenerated response from Gemini LLM:\n", answer)


if __name__ == "__main__":
    run_quote_chat()

In [None]:
# Interactive retrieve-then-generate loop. This cell repeats the loop that the
# previous cell runs under its `if __name__ == "__main__":` guard.
print("Welcome to the Movie Quote RAG system with Gemini LLM.")
while True:
    try:
        user_query = input("\nEnter your query (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the prompt was interrupted: stop without a traceback.
        print()
        break
    if user_query.lower() == 'exit':
        break
    if not user_query:
        # Blank input: do not embed it or send it to the LLM; ask again.
        continue

    # Step 1: Retrieve similar quotes
    retrieved = retrieve_similar_quotes(user_query, top_k=5)
    print("\nTop 5 similar quotes:")
    for i, quote in enumerate(retrieved, 1):
        print(f"{i}. {quote}")

    # Step 2: Generate answer with Gemini LLM
    answer = generate_response_with_context(user_query, retrieved)
    print("\nGenerated response from Gemini LLM:\n", answer)