In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Initialize the same embedding model used in your notebook
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [10]:
from langchain_community.vectorstores import FAISS
import pandas as pd

In [11]:
from langchain_core.documents import Document

In [12]:
df = pd.read_csv("df1_cleaned.csv")

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [27]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=100
)
# 2. Process your DataFrame into Split Documents
documents = []
for _, row in df.iterrows():
    # Split the long plot into smaller chunks
    chunks = text_splitter.split_text(str(row['clean_plot']))
    
    for chunk in chunks:
        documents.append(
            Document(
                page_content=chunk,
                metadata={
                    "title": row['Title'], 
                    "year": row['Release Year']
                }
            )
        )

In [29]:
# Create the vector store using the fixed documents list
vector_store = FAISS.from_documents(documents, embeddings_model)

# Save it to the artifacts folder
vector_store.save_local("artifacts/movie_faiss_v3")

In [54]:
query = "A movie about space exploration and black holes"
docs = vector_store.similarity_search(query, k=3)

for doc in docs:
    print(f"Title: {doc.metadata['title']} ({doc.metadata['year']})")
    print(f"Snippet: {doc.page_content[:150]}...\n")

Title: Doraemon the Movie: Nobita's New Great Adventure into the Underworld - The Seven Magic Users (2007)
Snippet: frustrated by the multitude of problems at school and at home, nobita wonders if his life would be easier if magic really existed. he then asks doraem...

Title: Interstellar (2014)
Snippet: the event horizon. the tesseract, its purpose completed, collapses and ejects cooper and tars. cooper wakes on a huge station, orbiting saturn. he reu...

Title: Interstellar (2014)
Snippet: the event horizon. the tesseract, its purpose completed, collapses and ejects cooper and tars. cooper wakes on a huge station, orbiting saturn. he reu...



In [55]:
movie_plot = df[df['Title'].str.contains("Interstellar", case=False)]['clean_plot'].value_counts().idxmax()

print(movie_plot)

in the mid-21st century, crop blights and dust storms threaten humanity's survival. joseph cooper, a widowed engineer and former nasa pilot, runs a farm with his father-in-law donald, son tom, and daughter murphy. living in a post-truth society, cooper is reprimanded for telling murphy that the apollo missions were not fake; he encourages her to carefully observe and record what she sees. they discover that dust patterns, which murphy first attributes to a ghost, result from gravity variations, and translate into geographic coordinates. these lead them to a secret nasa facility headed by cooper's former supervisor, professor john brand, who explains that 48 years earlier a wormhole appeared near saturn, opening a path to a distant galaxy with twelve potentially habitable planets located near a black hole named gargantua. volunteers had previously traveled through the wormhole to evaluate the planets, with miller, edmunds, and mann reporting back desirable results. brand explains he has

In [56]:
df[df['Title'].str.contains("Interstellar", case=False)].value_counts()

Release Year  Title         Origin/Ethnicity  Director           Cast                                                                       Genre            Wiki Page                                          Plot                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [57]:
# Change your search type to MMR for better variety
retriever = vector_store.as_retriever(
    search_kwargs={'k': 5, 'fetch_k': 20}
)

# Use .invoke() instead of .get_relevant_documents()
results = retriever.invoke("A space adventure with a sad ending")

# To see your results
for doc in results:
    print(f"Movie: {doc.metadata['title']}")

Movie: Bobbikins
Movie: Heavy Metal
Movie: The Little Prince
Movie: Cocoon
Movie: Redirected


In [None]:

# Change your search type to MMR for better variety
retriever = vector_store.as_retriever(
    search_kwargs={'k': 5, 'fetch_k': 20}
)

# Use .invoke() instead of .get_relevant_documents()
results = retriever.invoke("A movie about space exploration and black holes")

# To see your results
for doc in results:
    print(f"Movie: {doc.metadata['title']}")

Movie: Doraemon the Movie: Nobita's New Great Adventure into the Underworld - The Seven Magic Users
Movie: Interstellar
Movie: Interstellar
Movie: Heavy Metal
Movie: Reptilian


In [61]:
movie_plot = df[df['Title'].str.contains("Reptilian", case=False)]['clean_plot'].value_counts().idxmax()

print(movie_plot)



In [63]:
df[df['Title'].str.contains("Heavy Metal", case=False)].value_counts()

Release Year  Title             Origin/Ethnicity  Director                          Cast                                                           Genre             Wiki Page                                         Plot                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             