In [1]:
!pip install sentence-transformers faiss-cpu pandas

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [3]:
import pandas as pd
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# === 1. Load Cleaned CSV ===
df = pd.read_csv("/content/cleaned_movie_quotes_final_i_think.csv")  # Update with your actual filename

# Drop rows with missing text and make sure every remaining entry is a string.
# `.copy()` gives the filtered frame its own data, so the column assignment
# below no longer triggers pandas' SettingWithCopyWarning.
df = df.dropna(subset=['text']).copy()
df['text'] = df['text'].astype(str)

# === 2. Load Embedding Model ===
# The same model name is used again at query time so vectors are comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === 3. Generate Embeddings for the 'text' Column ===
print("Generating embeddings...")
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True, convert_to_numpy=True)
# One vector per kept row: search hits are mapped back to the metadata by row
# position (`.iloc`), so the two must stay aligned.
assert embeddings.shape[0] == len(df), "embedding count does not match row count"

# === 4. Build FAISS Index ===
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")

# === 5. Save FAISS index and metadata ===
faiss.write_index(index, "quote_index.faiss")
df.to_pickle("quote_metadata.pkl")  # Stores text + metadata
np.save("quote_embeddings.npy", embeddings)

print("Index and metadata saved.")


  df = pd.read_csv("/content/cleaned_movie_quotes_final_i_think.csv")  # Update with your actual filename
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].astype(str)


Generating embeddings...


Batches:   0%|          | 0/9509 [00:00<?, ?it/s]

FAISS index built with 304269 vectors.
Index and metadata saved.


In [4]:
# Sanity check: number of rows left in `df` after the missing-text filter.
print("Remaining rows after dropping missing text:", df.shape[0])


Remaining rows after dropping missing text: 304269


In [5]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# === 1. Load the index, metadata, and model ===
# Query encoder: same model name that produced the stored vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Artifacts written by the indexing cell.
index = faiss.read_index("quote_index.faiss")
metadata = pd.read_pickle("quote_metadata.pkl")
# Raw vectors are loaded for convenience only; searching goes through `index`.
embeddings = np.load("quote_embeddings.npy")

# === 2. Define retrieval function ===
def retrieve_similar_quotes(query, top_k=5):
    """Print the indexed quotes closest to ``query``.

    Uses the module-level ``model`` to embed the query, ``index`` to search,
    and ``metadata`` to look up each hit by row position.

    Args:
        query: Free-text query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        None. Results are printed. "Score" is the distance reported by the
        FAISS index (built as ``IndexFlatL2`` in the indexing cell), so a
        smaller value means a closer match.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    print(f"\nTop {top_k} results for: \"{query}\"")
    for rank, (dist, idx) in enumerate(zip(distances[0], indices[0]), start=1):
        if idx < 0:
            # FAISS pads the result with -1 when fewer than top_k vectors are
            # available; `iloc[-1]` would silently show the last row instead.
            continue
        row = metadata.iloc[idx]
        print(f"\nResult #{rank}")
        print(f"Text   : {row['text']}")
        print(f"Char   : {row['character_name']}")
        print(f"Title  : {row['title']} ({row['year']})")
        print(f"Genre  : {row['genres']}")
        print(f"Gender : {row['gender']}")
        print(f"Score  : {dist:.4f}")

# === 3. Try a test query ===
# The query keeps its apostrophe and "!"; the top hit printed below is the same
# line stored lowercased and without punctuation.
retrieve_similar_quotes("I'm the king of the world!")



Top 5 results for: "I'm the king of the world!"

Result #1
Text   : im the king of the world
Char   : JULIE
Title  : i still know what you did last summer (1998)
Genre  : ['horror', 'mystery', 'thriller']
Gender : f
Score  : 0.2170

Result #2
Text   : no im the king of the world
Char   : KARLA
Title  : i still know what you did last summer (1998)
Genre  : ['horror', 'mystery', 'thriller']
Gender : f
Score  : 0.3822

Result #3
Text   : i am your king
Char   : ARTHUR
Title  : monty python and the holy grail (1975)
Genre  : ['adventure', 'comedy']
Gender : m
Score  : 0.4937

Result #4
Text   : well  i am king
Char   : ARTHUR
Title  : monty python and the holy grail (1975)
Genre  : ['adventure', 'comedy']
Gender : m
Score  : 0.5374

Result #5
Text   : im im not the king
Char   : CHARLES
Title  : the messenger (2009)
Genre  : ['drama', 'romance', 'war']
Gender : m
Score  : 0.5577


In [6]:
# Very short query: checks retrieval when there is little text to embed.
retrieve_similar_quotes("I'm done.")


Top 5 results for: "I'm done."

Result #1
Text   : im done
Char   : DANA
Title  : true lies (1994)
Genre  : ['action', 'thriller']
Gender : unknown
Score  : 0.6939

Result #2
Text   : you done
Char   : LORETTA
Title  : the sting (1973)
Genre  : ['comedy', 'crime', 'drama']
Gender : f
Score  : 0.8062

Result #3
Text   : you done
Char   : PONY
Title  : suburbia (1996)
Genre  : ['comedy', 'drama']
Gender : m
Score  : 0.8062

Result #4
Text   : well im done  are you done
Char   : JEFFREY
Title  : buffy the vampire slayer (1992)
Genre  : ['horror', 'comedy', 'action']
Gender : m
Score  : 0.8539

Result #5
Text   : done what
Char   : HILDY
Title  : his girl friday (1940)
Genre  : ['comedy', 'drama', 'romance']
Gender : f
Score  : 0.8906


In [7]:
# Negated statement: in the printed results, quotes with the opposite meaning
# (e.g. "im a bad person") also rank near the top.
retrieve_similar_quotes("I'm not a bad person.")


Top 5 results for: "I'm not a bad person."

Result #1
Text   : i am not a bad man
Char   : PORTER
Title  : the third man (1949)
Genre  : ['film-noir', 'mystery', 'thriller']
Gender : m
Score  : 0.3631

Result #2
Text   : im a bad person
Char   : JIMMY
Title  : magnolia (1999)
Genre  : ['drama']
Gender : m
Score  : 0.4389

Result #3
Text   : you know im a good person
Char   : ROB
Title  : high fidelity (2000)
Genre  : ['comedy', 'drama', 'music', 'romance']
Gender : m
Score  : 0.6756

Result #4
Text   : im always bad
Char   : AGNES
Title  : agnes of god (1985)
Genre  : ['drama', 'mystery', 'thriller']
Gender : f
Score  : 0.6964

Result #5
Text   : im a bad girl im a bad girl
Char   : CAMMI
Title  : sideways (2004)
Genre  : ['comedy', 'drama', 'romance']
Gender : unknown
Score  : 0.7291


# RAG: Retrieval-Augmented Generation with Gemini

In [13]:
!pip install google-generativeai




In [None]:
import os
from getpass import getpass

import faiss
import google.generativeai as genai
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# === 1. Initialize Gemini API key ===
# Read the key from the GOOGLE_API_KEY environment variable, or prompt for it
# without echoing, instead of embedding it in the notebook.
# NOTE: any key that was ever stored in plain text in a shared notebook should
# be treated as leaked and rotated.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: "))

# Initialize the GenerativeModel
model_genai = genai.GenerativeModel('gemini-1.5-flash')

# === 2. Load cleaned data and embeddings ===
df = pd.read_pickle("quote_metadata.pkl")        # cleaned quotes with metadata
embeddings = np.load("quote_embeddings.npy")     # precomputed embeddings
index = faiss.read_index("quote_index.faiss")    # FAISS index

# Search hits are mapped to `df` by row position, so a stale or mismatched pair
# of files would return the wrong quotes; fail early instead.
assert index.ntotal == len(df), "FAISS index and metadata have different sizes"

# === 3. Load embedding model for queries ===
# Kept under its own name so it is not confused with the Gemini model above.
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_similar_quotes(query, top_k=5):
    """Return the indexed quotes closest to ``query`` as formatted strings.

    This definition replaces the earlier print-based function of the same name
    defined in the search cell; this one returns its results instead.

    Uses the module-level ``model_embedding`` to embed the query, ``index`` to
    search, and ``df`` to look up each hit by row position.

    Args:
        query: Free-text query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of at most ``top_k`` strings of the form
        ``"text" — character (title) [Genres: genres]``. Missing optional
        columns fall back to ``'Unknown'`` (or ``[]`` for genres).
    """
    # Embed the query
    query_vec = model_embedding.encode([query], convert_to_numpy=True)

    # Search in FAISS index (distances are not needed here)
    _, neighbor_ids = index.search(query_vec, top_k)

    # Retrieve corresponding quotes from df
    results = []
    for idx in neighbor_ids[0]:
        if idx < 0:
            # FAISS pads the result with -1 when fewer than top_k vectors are
            # available; `iloc[-1]` would silently return the last row instead.
            continue
        row = df.iloc[idx]
        text = row['text']
        char = row.get('character_name', 'Unknown')
        title = row.get('title', 'Unknown')
        genres = row.get('genres', [])
        results.append(f'"{text}" — {char} ({title}) [Genres: {genres}]')
    return results

def generate_response_with_context(user_query, retrieved_quotes):
    """Ask the Gemini model to answer ``user_query`` using the retrieved quotes.

    Args:
        user_query: The user's question.
        retrieved_quotes: Iterable of formatted quote strings; they are joined
            with newlines and placed ahead of the question in the prompt.

    Returns:
        The generated text, or a short placeholder message when the response
        carries no usable text.
    """
    # Combine retrieved quotes as context for Gemini
    context_str = "\n".join(retrieved_quotes)
    full_prompt = f"Here are some relevant movie quotes:\n{context_str}\n\nUser question: {user_query}\nAnswer:"

    generation_config = genai.GenerationConfig(
        temperature=0.7,
        max_output_tokens=256,
    )
    response = model_genai.generate_content(full_prompt, generation_config=generation_config)

    try:
        # `.text` is a convenience accessor; it raises ValueError when the
        # response has no valid text part (for example a blocked prompt).
        return response.text
    except ValueError:
        # Keep the interactive loop alive instead of ending in a traceback.
        return "(No text was returned for this query; the response may have been blocked or empty.)"

if __name__ == "__main__":
    # Interactive loop: read a query, show the retrieved quotes, then show the
    # Gemini answer built from them. Typing 'exit' ends the loop.
    print("Welcome to the Movie Quote RAG system with Gemini LLM.")
    while True:
        try:
            user_query = input("\nEnter your query (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the session cleanly
            # instead of leaving a traceback.
            break
        if user_query.lower() == 'exit':
            break
        if not user_query:
            # Nothing to search for; do not spend an embedding + LLM call on it.
            continue

        # Step 1: Retrieve similar quotes
        retrieved = retrieve_similar_quotes(user_query, top_k=5)
        print("\nTop 5 similar quotes:")
        for i, quote in enumerate(retrieved, 1):
            print(f"{i}. {quote}")

        # Step 2: Generate answer with Gemini LLM
        answer = generate_response_with_context(user_query, retrieved)
        print("\nGenerated response from Gemini LLM:\n", answer)

Welcome to the Movie Quote RAG system with Gemini LLM.

Enter your query (or 'exit' to quit): Find a funny quote about work.

Top 5 similar quotes:
1. "talk about work never" — MILO (antitrust) [Genres: ['drama', 'thriller']]
2. "work work work is that all that you do" — THEO (scary movie 2) [Genres: ['comedy']]
3. "you find any work" — LULA (wild at heart) [Genres: ['crime', 'romance', 'thriller']]
4. "lets talk about the work that you care so much about" — FRANK BRAND (simone) [Genres: ['short', 'drama', 'horror', 'thriller']]
5. "actually its not about work its advice about" — BEN (the ice storm) [Genres: ['drama']]

Generated response from Gemini LLM:
 "work work work is that all that you do" — THEO (scary movie 2)


Enter your query (or 'exit' to quit): a funny and yet deep quote about life

Top 5 similar quotes:
1. "life i said life" — ANNIE (annie hall) [Genres: ['comedy', 'drama', 'romance']]
2. "life is only life when it is bounded by death  the inheritance is death the gift i

In [19]:
# Interactive loop: read a query, show the retrieved quotes, then show the
# Gemini answer built from them. Typing 'exit' ends the loop.
print("Welcome to the Movie Quote RAG system with Gemini LLM.")
while True:
    try:
        user_query = input("\nEnter your query (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell ends the session cleanly instead
        # of leaving a traceback.
        break
    if user_query.lower() == 'exit':
        break
    if not user_query:
        # Nothing to search for; do not spend an embedding + LLM call on it.
        continue

    # Step 1: Retrieve similar quotes
    retrieved = retrieve_similar_quotes(user_query, top_k=5)
    print("\nTop 5 similar quotes:")
    for i, quote in enumerate(retrieved, 1):
        print(f"{i}. {quote}")

    # Step 2: Generate answer with Gemini LLM
    answer = generate_response_with_context(user_query, retrieved)
    print("\nGenerated response from Gemini LLM:\n", answer)

Welcome to the Movie Quote RAG system with Gemini LLM.

Enter your query (or 'exit' to quit): a romantic quote

Top 5 similar quotes:
1. "very romantic" — JOSIE (never been kissed) [Genres: ['comedy', 'drama', 'romance']]
2. "how romantic" — PHILBY (the time machine) [Genres: ['sci-fi', 'adventure', 'action']]
3. "how romantic" — IKE (runaway bride) [Genres: ['comedy', 'romance']]
4. "that is so romantic" — JULIANNE (my best friend's wedding) [Genres: ['comedy', 'romance']]
5. "just a romantic thats you" — VIVIAN (pretty woman) [Genres: ['comedy', 'romance']]

Generated response from Gemini LLM:
 "Very romantic," "How romantic," and "That is so romantic" are all good options, depending on the desired tone.  "Just a romantic that's you" is also a possibility, but it's less a statement of romanticism and more a description of someone.


Enter your query (or 'exit' to quit): a deep quote about friendship that suit for two childhood friends

Top 5 similar quotes:
1. "quite any childhood fr