In [1]:
import os
import pandas as pd
import qdrant_client
from dotenv import load_dotenv
from langchain.schema import Document
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings
from qdrant_client.http.models import VectorParams, Distance
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load environment variables
load_dotenv()

QDRANT_HOST = os.getenv("QDRANT_HOST")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_COLLECTION_NAME = "imdb"

In [3]:
# Load IMDb dataset
df = pd.read_csv("C:/Users/saisu/Documents/Learning/RAG_project_imdb/data/imdb_top_1000.csv")


In [4]:
# Convert rows into LangChain Document format
documents = [
    Document(
        metadata={"title": row["Series_Title"], "year": row["Released_Year"], "genre": row["Genre"], "rating": row["IMDB_Rating"]},
        page_content=(
            f"Movie: {row['Series_Title']}, Released: {row['Released_Year']}, Genre: {row['Genre']}, "
            f"Rating: {row['IMDB_Rating']}, Director: {row['Director']},  Overview: {row['Overview']}"
            f"Starring: {row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}."
        )
    )
    for _, row in df.iterrows()
]

In [5]:
# Initialize Qdrant Client
client = qdrant_client.QdrantClient(url=QDRANT_HOST, api_key=QDRANT_API_KEY, timeout=120)

# Check if collection exists, then create it
if not client.collection_exists(QDRANT_COLLECTION_NAME):
    client.create_collection(
        collection_name=QDRANT_COLLECTION_NAME,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )


In [6]:
# Initialize embeddings
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
embeddings = OpenAIEmbeddings()

# Connect vector store
vector_store = Qdrant(
    client=client,
    collection_name=QDRANT_COLLECTION_NAME,
    embeddings=embeddings
)

  embeddings = OpenAIEmbeddings()
  vector_store = Qdrant(


In [7]:
# Define text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n"]
)

# Process documents and split text correctly
chunked_documents = []
for doc in documents:
    chunks = text_splitter.split_text(doc.page_content)  # Use page_content instead of passing Document object
    chunked_documents.extend(chunks)

# Display some chunked samples
print(chunked_documents[:5])

['Movie: The Shawshank Redemption, Released: 1994, Genre: Drama, Rating: 9.3, Director: Frank Darabont,  Overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.Starring: Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler.', "Movie: The Godfather, Released: 1972, Genre: Crime, Drama, Rating: 9.2, Director: Francis Ford Coppola,  Overview: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.Starring: Marlon Brando, Al Pacino, James Caan, Diane Keaton.", 'Movie: The Dark Knight, Released: 2008, Genre: Action, Crime, Drama, Rating: 9.0, Director: Christopher Nolan,  Overview: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.Starring: Christian Bale, Heath Ledger, Aaron Eckhart, Michael Caine.', 'Movie: The Godfather: P

In [8]:
vector_store.add_texts(chunked_documents)

['b1ddf64b1de348ed950b4ca880881cd7',
 'c8eba45f14a24d47ad1446b5c59738c1',
 '021b37cd078b45c892d64b3d45bcad7f',
 '6b90bd42330f4757a1be2d3226f893fd',
 'e8d37f4b939a482a8869c1652fbc8b97',
 'ae8b84ef5e08430e873e883e00129850',
 'eba8891b745b4b699d977c70b7d42f45',
 'ec1b0a80756f43a2995f056c765579df',
 'cc7849e758ef424b8d780d3804a01c6c',
 '0a314f11387a4ed7b9d6e0c57aa3ef0a',
 'ffba7f3508394870ac50a5f78daa45b9',
 '91e5b92f0e7c445099de96b22fb39244',
 '91df3eafbd5d4a038109b29b7ff04e43',
 '58f7aeb365fd49b3881aa0346486fb6d',
 'abcdbb51570b4d51ac223912a0be647f',
 'df7e152dc3b14c95a8f7cdb3f0d7e2db',
 '8c52fbe53e8544d3a35ddcc62251ddb7',
 'a43e2fcc60ea44f49b0c00f315623a20',
 'b9272c17d7e84eefa8af337d940d4f0a',
 '940f2ec9f38642438e56a74469570114',
 'b2fce69b9f2145feb2d6bb332ecc702f',
 '1e334aa816c34779acad4cea82f614be',
 'c39b1fb42ef94918af88e8d007cfa8ef',
 'd56de1a0c4004d558b4b96f0c3f380ed',
 '28f4d6a3e74e4fea8e9d32f5039e7cee',
 'aed3921c5180420ba14ab19da7d10ddd',
 '126859432df34b2397eb3cbf56cf9b90',
 

In [7]:
# plug vector store into retrieval chain

from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

qa=RetrievalQA.from_chain_type(
    llm=ChatGroq(model='deepseek-r1-distill-llama-70b', temperature=0.5),
    chain_type='stuff',
    retriever=vector_store.as_retriever()
)

In [8]:
import re

def remove_think_tags(text):
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

In [9]:
def get_response(query):
    response=qa.run(query)
    if "<think>" in response:
        response=remove_think_tags(response)
    return response

In [10]:
query="Inception, who are actors in it?"
print(get_response(query=query))

  response=qa.run(query)


The actors in Inception are Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page, and Ken Watanabe.
