In [1]:
import os
import uuid

import pandas as pd
import qdrant_client
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from qdrant_client.http.models import Distance, VectorParams

In [2]:
# Read key/value pairs from a .env file into the process environment.
# Variables that are already set in the environment are left untouched.
load_dotenv(override=False)

True

In [3]:
# Load IMDb dataset.
# The CSV location can be overridden with the IMDB_CSV_PATH environment
# variable (e.g. in .env) so the notebook is not tied to one machine's
# directory layout; the original absolute path is kept as the fallback.
IMDB_CSV_PATH = os.getenv(
    "IMDB_CSV_PATH",
    "C:/Users/saisu/Documents/Learning/RAG_project_imdb/data/imdb_top_1000.csv",
)
df = pd.read_csv(IMDB_CSV_PATH)


In [4]:
def _movie_to_document(movie):
    """Build one LangChain Document from a single row of the IMDb frame.

    The page content is a one-line textual summary of the movie; the metadata
    carries the title, release year, genre and rating of the same row.
    """
    summary = (
        f"Movie: {movie['Series_Title']}, Released: {movie['Released_Year']}, Genre: {movie['Genre']}, Rating: {movie['IMDB_Rating']}, "
        f"Director: {movie['Director']}, Starring: {movie['Star1']}, {movie['Star2']}, {movie['Star3']}, {movie['Star4']}. "
        f"Overview: {movie['Overview']}"
    )
    details = {
        "title": movie["Series_Title"],
        "year": movie["Released_Year"],
        "genre": movie["Genre"],
        "rating": movie["IMDB_Rating"],
    }
    return Document(metadata=details, page_content=summary)


# One Document per row of the dataset, in row order
documents = [_movie_to_document(movie) for _, movie in df.iterrows()]

In [5]:
# Connection settings for Qdrant.
# Host URL and API key are read from the environment (None when unset);
# the collection name is fixed.
QDRANT_COLLECTION_NAME = "imdb"
QDRANT_HOST = os.environ.get("QDRANT_HOST")
API_KEY = os.environ.get("API_KEY")

In [6]:
# Per-request timeout (seconds) for the Qdrant client. The upload cell of this
# notebook failed with "The write operation timed out", so an explicit, more
# generous value is used instead of relying on the client's default.
QDRANT_TIMEOUT_SECONDS = 60

# Size of the vectors stored in the collection.
# NOTE(review): must equal the output dimension of the embedding model used
# for this collection — confirm if the embedding model is changed.
EMBEDDING_DIMENSION = 1536

client = qdrant_client.QdrantClient(
    url=QDRANT_HOST,
    api_key=API_KEY,
    timeout=QDRANT_TIMEOUT_SECONDS,
)

# Check if collection exists, then create if it doesn't
if not client.collection_exists(QDRANT_COLLECTION_NAME):
    client.create_collection(
        collection_name=QDRANT_COLLECTION_NAME,
        vectors_config=VectorParams(size=EMBEDDING_DIMENSION, distance=Distance.COSINE)
    )

In [7]:
# Load OpenAI Embeddings.
# Re-assigning os.environ["OPENAI_API_KEY"] from os.getenv() changes nothing
# when the key is set and fails with an unhelpful TypeError when it is not,
# so check for the key explicitly and report a clear error instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )
embeddings = OpenAIEmbeddings()

# Vector store wrapper bound to the Qdrant client and collection
vector_store = Qdrant(
    client=client,
    collection_name=QDRANT_COLLECTION_NAME,
    embeddings=embeddings
)

  embeddings = OpenAIEmbeddings()
  vector_store = Qdrant(


In [8]:
# Define text chunking strategy.
# Newlines remain the preferred split point; the extra fallbacks (space, then
# individual characters) let the splitter keep breaking up long text that
# contains no newline, so chunks stay within chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n", " ", ""]
)

In [9]:
# Run every document's text through the splitter and gather all resulting
# chunks in one flat list, preserving document order.
# NOTE(review): only page_content is passed to the splitter, so the chunks are
# plain strings and the Document metadata is not carried along.
chunked_documents = [
    piece
    for source_doc in documents
    for piece in text_splitter.split_text(source_doc.page_content)
]

# Preview the first few chunks
print(chunked_documents[:5])

['Movie: The Shawshank Redemption, Released: 1994, Genre: Drama, Rating: 9.3, Director: Frank Darabont, Starring: Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler. Overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.', "Movie: The Godfather, Released: 1972, Genre: Crime, Drama, Rating: 9.2, Director: Francis Ford Coppola, Starring: Marlon Brando, Al Pacino, James Caan, Diane Keaton. Overview: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.", 'Movie: The Dark Knight, Released: 2008, Genre: Action, Crime, Drama, Rating: 9.0, Director: Christopher Nolan, Starring: Christian Bale, Heath Ledger, Aaron Eckhart, Michael Caine. Overview: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.', 'Movie: The Godfather: P

In [10]:
# Derive each point id from the chunk text so that re-running this cell (for
# example after an upload that timed out part-way) writes to the same ids
# again instead of inserting a second copy of every chunk.
chunk_ids = [str(uuid.uuid5(uuid.NAMESPACE_URL, chunk)) for chunk in chunked_documents]

# Upload in small batches: each write request carries fewer vectors, which
# keeps individual requests short. The single-call upload previously failed
# with "The write operation timed out".
UPSERT_BATCH_SIZE = 16
stored_ids = vector_store.add_texts(
    chunked_documents,
    ids=chunk_ids,
    batch_size=UPSERT_BATCH_SIZE,
)

# Show how many chunks were written rather than the full list of ids
len(stored_ids)

ResponseHandlingException: The write operation timed out