# RAG System

## Dependencies

In [151]:
%pip install --upgrade pip --quiet
%pip install pandas==2.2.3 --quiet
%pip install openai==1.62.0 --quiet
%pip install langchain==0.3.18 --quiet
%pip install faiss-cpu==1.10.0 --quiet
%pip install ipywidgets==8.1.5 --quiet
%pip install langchain-community==0.3.17 --quiet
%pip uninstall langchain-community==0.3.17 -y --quiet
%pip uninstall tiktoken==0.8.0 -y --quiet
%pip install langchain-openai==0.3.6 --quiet
%pip install python-dotenv==1.0.1 --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [152]:
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

load_dotenv()

True

## Loading the dataset into a DataFrame

In [167]:
def load_dataset(file_path):
    """Loads the dataset from a CSV file."""
    df = pd.read_csv(file_path)
    return df

## Data preprocess
1. Drop records where Title or Plot are missing (NaN)
2. Limits the fields we want to use to Title and Plot, merging them into a single field called Content. One entry is created for each movie

In [154]:
def preprocess_data(df):
    """Prepares the dataset by extracting titles and plots."""
    df = df[['Title', 'Plot']].dropna()
    df["Content"] = df.apply(lambda row: f"Title: {row['Title']}\nPlot: {row['Plot']}", axis=1)
    return df

## Text chunking
Splits long texts into chunks of fixed size. The overlap will be used to ensure that some text is repeated between consecutive chunks, this will maintain context

In [155]:
def split_text_into_chunks(texts, chunk_size=500, chunk_overlap=50):
    """Splits the text into smaller chunks for better processing."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return text_splitter.create_documents(texts)

## Helper function
Checks if the data is already stored in embeddings, this will help us to not reprocess the data unnecessarily

In [156]:
def index_exists(index_folder):
    """Checks if FAISS index folder exists and is not empty."""
    return os.path.exists(index_folder) and os.listdir(index_folder)

## Helper function to store embeddings in local folder
Store embeddings into a folder specified by `index_folder`

In [157]:
def create_and_save_faiss_index(documents, embeddings, index_folder):
    """Creates FAISS index and saves it locally."""
    vector_db = FAISS.from_documents(documents, embeddings)
    vector_db.save_local(index_folder)
    print(f"FAISS index saved to {index_folder}")

## Load FAISS index

In [158]:
def load_faiss_index(index_folder, embeddings):
    """Loads FAISS index from local folder."""
    vector_db = FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
    print(f"FAISS index loaded from {index_folder}")
    return vector_db

## Create the LLM model with OpenAI

In [159]:
def create_llm_model(model_name="gpt-3.5-turbo-0125", temperature=0):
    """Initializes the OpenAI language model for text generation."""
    return ChatOpenAI(
        model_name=model_name,
        temperature=temperature
    )

## Create a RAG pipeline

In [160]:
def create_rag_pipeline(llm, vector_db):
    """Creates a Retrieval-Augmented Generation (RAG) pipeline."""
    retriever = vector_db.as_retriever()
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

## Function to ask movie-related questions

In [161]:
def ask_movie_question(qa_chain, question):
    """Queries the RAG pipeline with a movie-related question and a custom system prompt."""
    from langchain.prompts.chat import ChatPromptTemplate

    SYSTEM_PROMPT_TEMPLATE = """
    You are a movie expert with deep knowledge of film plots and cinematic history.
    Provide detailed and accurate answers based on the movie plot data.
    Always include the movie title in your response.
    If you don't have actual information about the movie, reply: This movie is not in the dataset provided
    """

    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", SYSTEM_PROMPT_TEMPLATE.strip()),
            ("human", "Respond to the question: {question}")
        ]
    )

    full_prompt = prompt_template.format_messages(question=question)

    response = qa_chain.invoke({"query": question, "input_messages": full_prompt})
    return response

## Connecting all together

In [162]:
if __name__ == "__main__":
    INDEX_FOLDER = "faiss_movie_embeddings"
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
    if index_exists(INDEX_FOLDER):
        vector_db = load_faiss_index(INDEX_FOLDER, embeddings)
    else:
        dataset_path = "wiki_movie_plots_reduced.csv"
        df = load_dataset(dataset_path)
        df = preprocess_data(df)
        documents = split_text_into_chunks(df["Content"].tolist())
        create_and_save_faiss_index(documents, embeddings, INDEX_FOLDER)
        vector_db = load_faiss_index(INDEX_FOLDER, embeddings)

    # Create the LLM model and the RAG pipeline
    llm = create_llm_model(model_name="gpt-3.5-turbo-0125")
    qa_chain = create_rag_pipeline(llm, vector_db)

FAISS index saved to faiss_movie_embeddings
FAISS index loaded from faiss_movie_embeddings


## Query 1. Movie that is not part of the dataset

In [163]:
    question = "What is the plot of the movie Inception?"
    answer = ask_movie_question(qa_chain, question)
    print(answer)

{'query': 'What is the plot of the movie Inception?', 'input_messages': [SystemMessage(content="You are a movie expert with deep knowledge of film plots and cinematic history.\n    Provide detailed and accurate answers based on the movie plot data.\n    Always include the movie title in your response.\n    If you don't have actual information about the movie, reply: This movie is not in the dataset provided", additional_kwargs={}, response_metadata={}), HumanMessage(content='Respond to the question: What is the plot of the movie Inception?', additional_kwargs={}, response_metadata={})], 'result': 'I\'m sorry, but the plot you provided is not from the movie "Inception." Would you like a summary of the actual plot of "Inception"?'}


## Query 2. Movie that is part of the dataset

In [164]:
    # Example query
    question = "What is the plot of the movie Underworld?"
    answer = ask_movie_question(qa_chain, question)
    print(answer)

{'query': 'What is the plot of the movie Underworld?', 'input_messages': [SystemMessage(content="You are a movie expert with deep knowledge of film plots and cinematic history.\n    Provide detailed and accurate answers based on the movie plot data.\n    Always include the movie title in your response.\n    If you don't have actual information about the movie, reply: This movie is not in the dataset provided", additional_kwargs={}, response_metadata={}), HumanMessage(content='Respond to the question: What is the plot of the movie Underworld?', additional_kwargs={}, response_metadata={})], 'result': 'The plot of "Underworld: Blood Wars" revolves around the remaining vampire covens being threatened by the Lycans. Both species are searching for Selene for different reasons, with the vampires seeking justice and the Lycans wanting to use her to locate Eve, whose blood can create vampire-werewolf hybrids. The story involves battles, betrayals, and Selene gaining new abilities to fight again

## Query 3. Create a response, provided some context

In [165]:
    # Example query
    question = "What movie can you suggest me if I like vampire movies?"
    answer = ask_movie_question(qa_chain, question)
    print(answer)

{'query': 'What movie can you suggest me if I like vampire movies?', 'input_messages': [SystemMessage(content="You are a movie expert with deep knowledge of film plots and cinematic history.\n    Provide detailed and accurate answers based on the movie plot data.\n    Always include the movie title in your response.\n    If you don't have actual information about the movie, reply: This movie is not in the dataset provided", additional_kwargs={}, response_metadata={}), HumanMessage(content='Respond to the question: What movie can you suggest me if I like vampire movies?', additional_kwargs={}, response_metadata={})], 'result': 'If you enjoy vampire movies, you might like "Underworld: Blood Wars" based on the plot provided. It\'s a part of the Underworld series that focuses on the conflict between vampires and Lycans.'}


## Query 4. Create a response, provided some context (2)

In [166]:
    # Example query
    question = "I hate romantic movies, what do I have to avoid?"
    answer = ask_movie_question(qa_chain, question)
    print(answer)

{'query': 'I hate romantic movies, what do I have to avoid?', 'input_messages': [SystemMessage(content="You are a movie expert with deep knowledge of film plots and cinematic history.\n    Provide detailed and accurate answers based on the movie plot data.\n    Always include the movie title in your response.\n    If you don't have actual information about the movie, reply: This movie is not in the dataset provided", additional_kwargs={}, response_metadata={}), HumanMessage(content='Respond to the question: I hate romantic movies, what do I have to avoid?', additional_kwargs={}, response_metadata={})], 'result': 'You should avoid "Solo" as it is a romantic drama movie.'}
