# In [None]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import json
import logging
from typing import List, Dict
import os
from datetime import datetime

# Configure logging: basicConfig attaches a default handler to the root logger
# at INFO level (it does nothing if the root logger already has handlers), and
# `logger` is the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def parse_date(date_str: str) -> int:
    """Convert a "MMM DD, YYYY" date string to a Unix timestamp.

    Leading and trailing whitespace is ignored, so values such as
    " Jan 05, 2024\n" parse the same as "Jan 05, 2024". The parsed value is
    a naive datetime, so ``timestamp()`` interprets it in the local timezone
    of the machine running the code.

    Args:
        date_str: Date text in ``%b %d, %Y`` form, e.g. "Jan 05, 2024".

    Returns:
        Seconds since the epoch as an int, or 0 when the value is not a
        string or cannot be parsed (a warning is logged in both cases).
    """
    # Non-string input (e.g. a JSON null) cannot be parsed; report it the same
    # way as any other unparseable value instead of relying on a broad except.
    if not isinstance(date_str, str):
        logger.warning(f"Error parsing date {date_str}: expected a string")
        return 0

    try:
        # strptime rejects surrounding whitespace, so strip it first.
        date_obj = datetime.strptime(date_str.strip(), "%b %d, %Y")
        return int(date_obj.timestamp())
    except (ValueError, OverflowError, OSError) as e:
        # ValueError: text does not match the format.
        # OverflowError/OSError: the platform cannot represent the date as a
        # timestamp.
        logger.warning(f"Error parsing date {date_str}: {e}")
        return 0

def load_json_data(file_path: str = r"C:\Users\sweth\Downloads\bwh.json") -> List[Dict]:
    """Read a UTF-8 JSON file and return its decoded contents.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The decoded JSON value. It is annotated as a list of dicts, but the
        shape is not validated here.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        json.JSONDecodeError: If the file does not contain valid JSON.
        Exception: Any other failure. Every error is logged before being
            re-raised unchanged.
    """
    try:
        path_is_missing = not os.path.exists(file_path)
        if path_is_missing:
            raise FileNotFoundError(f"JSON file not found at: {file_path}")

        with open(file_path, mode='r', encoding='utf-8') as handle:
            records = json.load(handle)

        logger.info(f"Successfully loaded {len(records)} records from JSON file")
        return records
    except json.JSONDecodeError as decode_error:
        # Malformed JSON gets its own message so it is easy to tell apart
        # from file-system problems in the log.
        logger.error(f"Error decoding JSON file: {decode_error}")
        raise
    except Exception as load_error:
        logger.error(f"Error loading JSON file: {load_error}")
        raise

def _metadata_field(post: Dict, key: str, default):
    """Return ``post[key]``, substituting ``default`` when it is absent or None."""
    value = post.get(key)
    return default if value is None else value


def _build_records(posts: List[Dict]):
    """Turn raw post dicts into parallel ``(documents, metadatas)`` lists.

    Posts whose "Content" is missing, not a string, or blank after stripping
    are skipped. Null metadata fields are replaced with the same defaults used
    for missing keys, so no None value is handed to ChromaDB.
    """
    documents = []
    metadatas = []

    for post in posts:
        raw_content = post.get("Content")
        # A JSON null or non-string Content has no text to embed; skip it
        # rather than crashing the whole load on .strip().
        content = raw_content.strip() if isinstance(raw_content, str) else ""
        if not content:
            continue

        date_text = _metadata_field(post, "Date", "")

        documents.append(content)
        metadatas.append({
            "Username": _metadata_field(post, "Username", ""),
            "PostTitle": _metadata_field(post, "PostTitle", ""),
            "Content": content,
            "Date": date_text,
            "Likes": _metadata_field(post, "Likes", "0"),
            # Numeric form of Date (0 when it cannot be parsed).
            "timestamp": parse_date(date_text),
        })

    return documents, metadatas


def load_data_to_chromadb() -> int:
    """Rebuild the "my_collection" ChromaDB collection from the JSON posts.

    Loads the "all-MiniLM-L6-v2" sentence-embedding model, opens the
    persistent ChromaDB store at ./chromadb, drops and recreates
    "my_collection" (cosine space), reads the posts with ``load_json_data()``
    and inserts them in batches of 100. Each batch is embedded with a single
    ``model.encode`` call. Ids are "post_<n>", where n is the position of the
    post among the non-empty posts.

    Returns:
        The number of posts written to the collection.

    Raises:
        Exception: Any failure in the process is logged and re-raised.
    """
    try:
        # Initialize the embedding model
        logger.info("Initializing embedding model...")
        model = SentenceTransformer("all-MiniLM-L6-v2")

        # Initialize ChromaDB
        logger.info("Setting up ChromaDB...")
        client = chromadb.PersistentClient(path="./chromadb", settings=Settings(is_persistent=True))

        # Reset collection if it exists. Deleting is best-effort (the
        # collection may simply not exist yet), but only ordinary errors are
        # tolerated and the reason is kept in the log; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        try:
            client.delete_collection("my_collection")
            logger.info("Deleted existing collection")
        except Exception as e:
            logger.debug(f"Existing collection was not deleted (it may not exist yet): {e}")

        collection = client.create_collection(
            name="my_collection",
            metadata={"hnsw:space": "cosine"}
        )

        # Load JSON data
        logger.info("Loading JSON data...")
        posts = load_json_data()

        # Prepare data for ChromaDB
        documents, metadatas = _build_records(posts)

        # Embed and add data to the collection in batches
        batch_size = 100
        total_posts = len(documents)

        for start in range(0, total_posts, batch_size):
            end_idx = min(start + batch_size, total_posts)
            batch_documents = documents[start:end_idx]

            # One encode call per batch instead of one per post; the result
            # holds one embedding per input text, in input order.
            batch_embeddings = model.encode(batch_documents).tolist()

            collection.add(
                embeddings=batch_embeddings,
                metadatas=metadatas[start:end_idx],
                documents=batch_documents,
                ids=[f"post_{j}" for j in range(start, end_idx)]
            )

            logger.info(f"Added batch {start//batch_size + 1} ({end_idx}/{total_posts} posts)")

        logger.info(f"Successfully loaded {total_posts} posts into ChromaDB")
        return total_posts

    except Exception as e:
        logger.error(f"Error in data loading process: {e}")
        raise

if __name__ == "__main__":
    # Script entry point: run the load and report the outcome on stdout.
    try:
        num_posts = load_data_to_chromadb()
    except Exception as e:
        print(f"Failed to load data: {e}")
        # Exit with a non-zero status so shells and schedulers can detect the
        # failure instead of seeing a successful run.
        raise SystemExit(1)
    print(f"Successfully loaded {num_posts} posts into the database.")
