# **CIROH AI Bot - Database Population and Embedding Generation**

In [None]:
# Import libraries
import requests
from database import DatabaseManager
from dotenv import load_dotenv
import os
import pandas as pd
import time
import openai
import json

In [None]:
# Static configuration: root URL of the CIROH documentation site.
BASE_URL = "https://docs.ciroh.org"

# Environment-driven configuration.
# load_dotenv() comes from python-dotenv; it is expected to merge the entries
# of a local .env file into the process environment before they are read.
load_dotenv()

# Both values are None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")

In [None]:
# Initialize the manager
# One DatabaseManager instance is shared by the cells below; each of them
# enters it with `with db_manager as db:` whenever it needs database access.
db_manager = DatabaseManager()

def initialize_schema(schema_file='schema.sql'):
    """
    Ensure the core tables exist in the configured schema.

    Lists the tables present in ``db.schema`` through ``information_schema``
    and compares them, case-insensitively, with the required table names.
    If at least one is missing, the SQL script at ``schema_file`` is read and
    passed to ``db.execute_query`` in a single call.

    Args:
        schema_file: Path to the SQL script that creates the schema.
            The file is decoded as UTF-8.

    Returns:
        None. Progress and problems are reported with ``print``. When
        ``schema_file`` does not exist, an error is printed and the script
        is not executed.
    """
    # Tables to verify existence
    required_tables = ['TBLArtifactTypes', 'TBLArtifacts', 'TBLChunkTypes', 'TBLChunks']

    # Query to list the tables that exist in the given schema
    check_query = """
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = %s;
    """

    with db_manager as db:
        print(f"Checking schema: {db.schema}...")
        existing_tables = db.execute_query(check_query, (db.schema,), fetch=True)

        # A set gives constant-time membership tests; names are lowercased on
        # both sides so the comparison does not depend on identifier casing.
        existing_table_names = {t['table_name'].lower() for t in existing_tables}
        missing_tables = [
            name
            for name in (t.lower() for t in required_tables)
            if name not in existing_table_names
        ]

        if not missing_tables:
            print("All required tables are present. Ready to work.")
            return

        print(f"Missing tables detected: {missing_tables}. Initializing schema...")

        # Open the file directly and handle its absence, instead of testing
        # for existence first (the file could disappear between the two steps).
        # The encoding is explicit so the script is decoded the same way on
        # every platform rather than with the locale default.
        try:
            with open(schema_file, 'r', encoding='utf-8') as f:
                schema_sql = f.read()
        except FileNotFoundError:
            print(f"Error: {schema_file} not found.")
            return

        # Execute the full schema script through the same `db` handle that
        # was used for the existence check.
        db.execute_query(schema_sql)
        print("Schema initialized successfully.")

In [None]:
# Create the core tables from the default 'schema.sql' if any of them is
# missing, so the cells that populate the catalogs have tables to write to.
initialize_schema()

### **Populate Database Catalogs**

In [None]:
# Artifact type catalog. Each row is a 1-tuple because the rows are handed
# to db.execute_batch() as parameter tuples.
artifact_types = [
    (type_name,)
    for type_name in (
        'DocuHub Page',
        'Publication',
        'Dataset',
        'GitHub Repository',
        'Course',
        'Presentation',
    )
]

# Chunk type names grouped by the artifact type they belong to.
# The order of the groups and of the names inside each group determines the
# order of the rows generated further down.
chunk_names_by_artifact_type = {
    # Publications (papers)
    'Publication': [
        'Abstract',
        'Background',
        'Research Goal',
        'Problem',
        'Method',
        'Data',
        'Result',
        'Discussion-Limitation',
        'Conclusion',
    ],
    # Datasets (HydroShare)
    'Dataset': [
        'Abstract',
        'Spatial Coverage',
        'Temporal Coverage',
        'Variable Metadata',
        'Data Services Info',
        'File Description',
        'Collection Contents',
        'Subject Keywords',
        'Related Resources Context',
    ],
    # GitHub repositories
    'GitHub Repository': [
        'Project Overview',
        'Installation Setup',
        'Usage Examples',
        'Repository Structure',
        'Contributing Guidelines',
        'License Citation',
    ],
    # General / DocuHub pages
    'DocuHub Page': [
        'Section',
        'Subsection',
        'Subsubsection',
    ],
    # Courses (HydroLearn)
    'Course': [
        'Problem Statement',
        'Module Overview',
        'Topic Covered',
        'Prerequisites',
        'Learning Objective',
        'Suggested Implementation',
        'Target Audience',
        'Author',
        'Tools Needed',
        'Expected Effort',
        'Citation',
    ],
    # Presentations (HydroShare)
    'Presentation': [
        'Abstract',
        'Subject Keywords',
        'Collection Contents',
        'Related Resources Context',
    ],
}

with db_manager as db:
    # 1. Insert the artifact types as one batch; ON CONFLICT DO NOTHING keeps
    #    rows that are already there.
    print("Inserting Artifact Types...")
    db.execute_batch(
        "INSERT INTO TBLArtifactTypes (TypeName) VALUES %s ON CONFLICT DO NOTHING;",
        artifact_types,
    )

    # 2. Read the catalog back to learn the id assigned to each type name.
    res = db.execute_query("SELECT idArtifactType, TypeName FROM TBLArtifactTypes;", fetch=True)

    if res:
        # Rows are accessed with lowercase keys: PostgreSQL folds unquoted
        # column names to lowercase.
        type_map = {row['typename']: row['idartifacttype'] for row in res}
        print(f"Mapping successful: {list(type_map)}")

        # 3. Expand the grouped names into (idArtifactType, TypeName) rows.
        chunk_types = [
            (type_map[artifact_name], chunk_name)
            for artifact_name, chunk_names in chunk_names_by_artifact_type.items()
            for chunk_name in chunk_names
        ]

        # 4. Insert the chunk types as one batch, again skipping conflicts.
        print(f"Inserting {len(chunk_types)} Chunk Types...")
        db.execute_batch(
            "INSERT INTO TBLChunkTypes (idArtifactType, TypeName) VALUES %s ON CONFLICT DO NOTHING;",
            chunk_types,
        )

        print("Catalogs populated successfully.")
    else:
        print("Error: TBLArtifactTypes is empty. Check your database connection or schema.")