# Solution

#### Load Data

In [1]:
import pandas as pd

# Load the IMDb dataset (adjust path accordingly)
df = pd.read_csv("C:/Users/saisu/Documents/Learning/RAG_project_imdb/data/imdb_top_1000.csv")
df.head(2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411


In [None]:
# Convert 'Gross' to numerical format (removing commas)
import pandas as pd

# Load the IMDb dataset (adjust path accordingly)
df = pd.read_csv("C:/Users/saisu/Documents/Learning/RAG_project_imdb/data/imdb_top_1000.csv")
df.head(2)
df.head(2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0


#### Create Text Representation for Vectorization

In [12]:
from langchain.schema import Document

# Convert each row into a LangChain Document format
documents = [
    Document(
        metadata={"title": row["Series_Title"], "year": row["Released_Year"], "genre": row["Genre"], "rating": row["IMDB_Rating"]},
        page_content=f"Movie: {row['Series_Title']}, Released: {row['Released_Year']}, Genre: {row['Genre']}, Rating: {row['IMDB_Rating']}, "
                     f"Director: {row['Director']}, Starring: {row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}. "
                     f"Overview: {row['Overview']}"
    ) 
    for _, row in df.iterrows()
]
documents[:5]

[Document(metadata={'title': 'The Shawshank Redemption', 'year': '1994', 'genre': 'Drama', 'rating': 9.3}, page_content='Movie: The Shawshank Redemption, Released: 1994, Genre: Drama, Rating: 9.3, Director: Frank Darabont, Starring: Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler. Overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'),
 Document(metadata={'title': 'The Godfather', 'year': '1972', 'genre': 'Crime, Drama', 'rating': 9.2}, page_content="Movie: The Godfather, Released: 1972, Genre: Crime, Drama, Rating: 9.2, Director: Francis Ford Coppola, Starring: Marlon Brando, Al Pacino, James Caan, Diane Keaton. Overview: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son."),
 Document(metadata={'title': 'The Dark Knight', 'year': '2008', 'genre': 'Action, Crime, Drama', 'rating': 9.0}, page_content='Movie: The Dark Knight, Released: 2008, Genr

#### Initialize Qdrant

In [13]:
from langchain.vectorstores import Qdrant
import qdrant_client
import os
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
QDRANT_HOST=os.getenv("QDRANT_HOST")
API_KEY=os.getenv("API_KEY")

client=qdrant_client.QdrantClient(url=QDRANT_HOST, api_key=API_KEY)

In [15]:
# Create collection
QDRANT_COLLECTION_NAME="imdb"

In [16]:
vectors_config=qdrant_client.http.models.VectorParams(
    size=1536,
    distance=qdrant_client.http.models.Distance.COSINE
)

client.recreate_collection(
    collection_name=QDRANT_COLLECTION_NAME,
    vectors_config=vectors_config
)

  client.recreate_collection(


True

In [17]:
from langchain.embeddings import OpenAIEmbeddings

os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")


embeddings=OpenAIEmbeddings()

vector_store=Qdrant(
    client=client,
    collection_name=os.getenv("QDRANT_COLLECTION_NAME"),
    embeddings=embeddings
)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define chunking strategy
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust based on average sentence length
    chunk_overlap=50,  # Overlap to maintain context
    separators=["\n"]  # Sentence-based splitting
)

# Create chunked documents as strings
chunked_documents = []
for doc in documents:
    chunks = text_splitter.split_text(doc)  # Directly split the string content
    chunked_documents.extend(chunks)  # Add chunks to the list

# Display some chunked samples
print(chunked_documents[:5])


TypeError: expected string or bytes-like object, got 'Document'

[]

#### Embed Movie Data and Store in Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Initialize Qdrant client
client = QdrantClient(":memory:")  # Use in-memory or change to a real Qdrant instance

# Create a collection
client.recreate_collection(
    collection_name="movies",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)
)

In [5]:
import os
from qdrant_client import QdrantClient
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings

os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

# Initialize Qdrant Client (Ensure it's properly set up)
client = QdrantClient(":memory:")  # Use in-memory; for persistent storage, use Qdrant Cloud or a local server.

# Create collection (Ensure vector size matches the embedding model used)
client.recreate_collection(
    collection_name="movies",
    vectors_config={"size": 1536, "distance": "Cosine"}  # Adjust size based on the embedding model
)

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings()

# Convert documents into a vector store (Do NOT pass `client` in kwargs)
vector_db = Qdrant.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="movies",
    location=":memory:"  # Use "localhost" or Qdrant Cloud for persistence
)


  client.recreate_collection(
  embeddings = OpenAIEmbeddings()


#### Process Natural Language Queries

In [6]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Load LLM (GPT-based)
llm = ChatOpenAI(model="gpt-4o-mini")

# Retrieval system
retriever = vector_db.as_retriever()

# Define QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)


  llm = ChatOpenAI(model="gpt-4o-mini")


#### Run Sample Query

In [None]:
# query = "Find me a crime thriller movie starring Al Pacino."
# response = qa_chain.run(query)
# print(response)

  response = qa_chain.run(query)


You can watch "Carlito's Way" (1993), which is a crime thriller starring Al Pacino.


#### Implement Follow-Up Interaction

In [None]:
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain_with_memory = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory
)

# Follow-up interaction
# print(qa_chain_with_memory.run("Find me a comedy movie from the 1990s."))
# print(qa_chain_with_memory.run("Give me another suggestion."))

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
  print(qa_chain_with_memory.run("Find me a comedy movie from the 1990s."))


A comedy movie from the 1990s is "Clerks," released in 1994. It has a rating of 7.7 and was directed by Kevin Smith. The film follows two convenience clerks named Dante and Randal as they navigate their day, annoy customers, discuss movies, and play hockey on the store roof.
I don't know.


In [8]:
print(qa_chain_with_memory.run("When The Shawshank Redemption was released"))

The Shawshank Redemption was released in 1994.
