In [None]:
# Install dependencies as needed
!pip install sentence-transformers qdrant-client

import getpass
import json
import os
import time
import uuid

from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

# Upload the unified_dataset.json file via Colab file uploader.
# `uploaded` is keyed by the name each file was saved under.
from google.colab import files
uploaded = files.upload()

# Load the uploaded JSON file.
# The saved name is taken from `uploaded` rather than assumed: if the upload
# was stored under a different name (for example because a file with the
# expected name already existed), opening the hard-coded name would silently
# read a stale copy or fail. When nothing was uploaded, fall back to a file
# with the default name already present on disk.
DEFAULT_DATASET_NAME = 'unified_dataset.json'
if not uploaded or DEFAULT_DATASET_NAME in uploaded:
    uploaded_file_name = DEFAULT_DATASET_NAME
else:
    uploaded_file_name = next(iter(uploaded))

# NOTE: despite its name, `k_drama_df` is the plain object returned by
# json.load (indexed below as k_drama_df['dramas']), not a DataFrame.
with open(uploaded_file_name, 'r', encoding='utf-8') as file:
    k_drama_df = json.load(file)

# Initialize the embedding model
model = SentenceTransformer('all-mpnet-base-v2')

# Connect to Qdrant.
# Credentials must never live in the notebook source: the API key is read
# from the QDRANT_API_KEY environment variable, with an interactive prompt as
# a fallback so the cell still runs in a fresh session. The endpoint can be
# overridden with QDRANT_URL and defaults to the cluster this notebook was
# written against.
# SECURITY: a key that was ever committed in this file must be treated as
# compromised and rotated in the Qdrant console.
qdrant_url = os.environ.get(
    'QDRANT_URL',
    'https://8108fa10-87c0-489a-a138-e5742baa513d.europe-west3-0.gcp.cloud.qdrant.io:6333',
)
qdrant_api_key = os.environ.get('QDRANT_API_KEY') or getpass.getpass('Qdrant API key: ')
if not qdrant_api_key:
    raise RuntimeError('No Qdrant API key provided; set QDRANT_API_KEY or enter it when prompted.')

qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

# Define collection name
collection_name = 'story_plots'

# Create the collection on first use. The check makes this step idempotent:
# when the collection already exists (e.g. when resuming an ingestion run)
# nothing is modified. The vector size is taken from the embedding model so
# the collection schema cannot drift from the model that fills it.
existing_collections = [col.name for col in qdrant_client.get_collections().collections]
if collection_name not in existing_collections:
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )
    print(f"Collection '{collection_name}' created.")
else:
    print(f"Collection '{collection_name}' already exists.")

# Ingest data to Qdrant in batches.
#
# Each entry of k_drama_df['dramas'] is read as
#   {'metadata': {'title': ..., 'genres': [...]}, 'description': ...}
# ('genres' is optional and defaults to an empty list). For every batch the
# descriptions are embedded in a single model call and the resulting points
# are upserted in a single request.
#
# NOTE(review): point IDs are random (uuid4), so re-running this cell over
# entries that were already ingested inserts duplicates instead of
# overwriting them; START_INDEX is the manual resume offset that works around
# this. Deriving the ID from the drama name (e.g. uuid.uuid5) would make the
# upsert idempotent, but would not match IDs already in the collection.
BATCH_SIZE = 100    # entries embedded and upserted per request
START_INDEX = 600   # skip entries before this position (resume offset)
points_batch = []
batch_count = 0
start_time = time.time()

dramas = list(k_drama_df['dramas'].items())[START_INDEX:]

for batch_start in range(0, len(dramas), BATCH_SIZE):
    batch = dramas[batch_start:batch_start + BATCH_SIZE]
    descriptions = [drama_data['description'] for _, drama_data in batch]

    # One encode call per batch instead of one per description: the model
    # processes the texts together, which is far cheaper than repeated
    # single-item calls. The result has one embedding row per description.
    embeddings = model.encode(descriptions)

    # Prepare data to be inserted in Qdrant
    points_batch = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding.tolist(),
            payload={
                'genre': drama_data['metadata'].get('genres', []),
                'title': drama_data['metadata']['title'],
                'plot_summary': description,
            },
        )
        for (_, drama_data), description, embedding in zip(batch, descriptions, embeddings)
    ]

    qdrant_client.upsert(
        collection_name=collection_name,
        points=points_batch
    )
    batch_count += 1

    # Only a trailing, partially filled batch is reported as the final one.
    label = 'Batch' if len(points_batch) == BATCH_SIZE else 'Final Batch'
    print(f"✅ {label} {batch_count} inserted successfully.")

end_time = time.time()
print(f'\n🎉 K-drama ingestion completed! Total Batches: {batch_count}. Time taken: {end_time - start_time:.2f} seconds')




Saving unified_dataset.json to unified_dataset.json
✅ Batch 1 inserted successfully.
✅ Batch 2 inserted successfully.
✅ Batch 3 inserted successfully.
✅ Batch 4 inserted successfully.
✅ Batch 5 inserted successfully.
✅ Batch 6 inserted successfully.
✅ Batch 7 inserted successfully.
✅ Batch 8 inserted successfully.
✅ Batch 9 inserted successfully.
✅ Batch 10 inserted successfully.
✅ Batch 11 inserted successfully.
✅ Final Batch 12 inserted successfully.

🎉 K-drama ingestion completed! Total Batches: 12. Time taken: 1878.59 seconds
