In [3]:
import json

import pandas as pd

# Path of the job-profile CSV export; later cells pass it to the document builders.
csvPath = "../data/job profiles/2025-02-07_profiles.csv"

# Load the export into a DataFrame.
df = pd.read_csv(csvPath)

In [None]:
import os
import sys
from typing import List

# Put the parent of the current working directory on the import path so the
# `notebooks` package can be resolved by the import below.
sys.path.append(os.path.abspath('..'))

from langchain_core.documents import Document
from notebooks.utils import get_job_profile_documents

# Build the documents with the shared helper. Note that `documents` is
# reassigned further down by `process_job_profiles(csvPath)`.
documents = get_job_profile_documents(csvPath)

In [10]:
# PROFILE PER DOCUMENT
# CSV columns holding a JSON array, mapped to the heading used in the page content.
_ARRAY_SECTIONS = {
    "behavioural_competencies": "Behavioural Competencies",
    "education": "Education",
    "job_experience": "Job Experience",
    "professional_registration_requirements": "Professional Registration Requirements",
    "preferences": "Preferences",
    "knowledge_skills_abilities": "Knowledge, Skills, and Abilities",
    "willingness_statements": "Willingness Statements",
    "security_screenings": "Security Screenings",
    "accountabilities": "Accountabilities",
}


def _decode_json_cell(cell):
    """Return the decoded JSON held in a CSV cell, or None if it is absent or invalid."""
    if not isinstance(cell, str):
        # Empty cells arrive from pandas as NaN (a float); non-text cannot be JSON.
        return None
    try:
        return json.loads(cell)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None


def _blank_if_missing(value):
    """Map a missing scalar (NaN/None) to "" so it never leaks into text or metadata."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value


def _summarise_json_field(field, payload):
    """Flatten one decoded JSON metadata field to text; "" when its shape is unexpected."""
    try:
        if field == "organizations":
            # De-duplicate on (name, code) and render as "name (code)".
            pairs = sorted({(item["name"], item["code"]) for item in payload})
            return ", ".join(f"{name} ({code})" for name, code in pairs)
        if field in ("classifications", "scopes"):
            # De-duplicate names and keep a stable, sorted order.
            return ", ".join(sorted({item["name"] for item in payload}))
        # 'role' and 'role_type' are single objects carrying a name.
        return payload["name"]
    except (KeyError, TypeError):
        # Covers payload=None, wrong container types and items missing keys.
        return ""


def _section_lines(field, items):
    """Render the bullet lines for one array section, or None if the items are malformed."""
    try:
        if field == "behavioural_competencies":
            return [f"• {item['name']}: {item['description']}" for item in items]
        return [f"• {item['text']}" for item in items]
    except (KeyError, TypeError):
        return None


def process_job_profiles(csv_path: str) -> List[Document]:
    """Process the job profiles CSV and create one document per job profile.

    Each row becomes a ``Document`` whose page content lists the title,
    classifications and organizations followed by one bulleted section per
    non-empty array column, and whose metadata carries the scalar columns plus
    the flattened JSON columns.

    Malformed or unexpectedly shaped JSON in a cell is treated as empty rather
    than aborting the run, and empty CSV cells (NaN) are stored as "" instead
    of leaking "nan" into the content or metadata.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list with one ``Document`` per CSV row, in row order.
    """
    df = pd.read_csv(csv_path)
    documents = []

    for idx, row in df.iterrows():
        flattened = {
            field: _summarise_json_field(field, _decode_json_cell(row.get(field)))
            for field in ("role", "role_type", "scopes", "classifications", "organizations")
        }

        metadata = {
            "title": _blank_if_missing(row.get("title", "")),
            "number": _blank_if_missing(row.get("number", "")),
            "type": _blank_if_missing(row.get("type", "")),
            "context": _blank_if_missing(row.get("context", "")),
            "views": _blank_if_missing(row.get("views", "")),
            "role": flattened["role"],
            "role_type": flattened["role_type"],
            "scopes": flattened["scopes"],
            "classifications": flattened["classifications"],
            "organizations": flattened["organizations"],
            "created_at": _blank_if_missing(row.get("created_at", "")),
            "updated_at": _blank_if_missing(row.get("updated_at", "")),
            "row_index": idx,
        }

        content_sections = [
            f"Job Profile Title: {metadata['title']}",
            f"Classifications: {metadata['classifications']}",
            f"Organizations: {metadata['organizations']}",
        ]

        for field, section_title in _ARRAY_SECTIONS.items():
            items = _decode_json_cell(row.get(field))
            if not isinstance(items, list) or not items:
                continue
            # Build the bullets first so a malformed section never leaves an
            # orphaned heading behind.
            bullets = _section_lines(field, items)
            if bullets is None:
                continue
            # The title is repeated in every heading, presumably so each chunk
            # stays attributable after splitting -- TODO confirm intent.
            content_sections.append(f"\n{metadata['title']} {section_title}:")
            content_sections.extend(bullets)

        documents.append(
            Document(page_content="\n".join(content_sections), metadata=metadata)
        )

    return documents
# One document per CSV row, built from the export referenced by csvPath.
documents = process_job_profiles(csvPath)

In [None]:
def dump_documents_to_console(documents, limit=10):
    """Print a preview of the generated documents to the console.

    Args:
        documents: Sequence of objects exposing ``page_content`` and
            ``metadata`` (a dict).
        limit: Maximum number of documents to print; ``None`` prints all.
    """
    print(f"Total documents generated: {len(documents)}\n")
    for position, doc in enumerate(documents[:limit], start=1):
        print(f"Document {position}:")
        print("-" * 50)
        print(f"Content:\n{doc.page_content}\n")
        # default=str keeps the preview from aborting on metadata values json
        # cannot encode natively (e.g. numpy scalars, dates, timestamps).
        print(f"Metadata:\n{json.dumps(doc.metadata, indent=4, default=str)}")
        print("=" * 50)

# Preview the generated documents; raise or lower the cap as needed.
dump_documents_to_console(documents, limit=100)


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from transformers import AutoTokenizer

import chromadb

# Open the Chroma database stored under ../job_profiles_db2 and make sure the
# "job_profiles" collection exists before anything is written to it.
client = chromadb.PersistentClient("../job_profiles_db2")
collection = client.get_or_create_collection(
    "job_profiles",
    metadata={"hnsw:batch_size": 10000},
)

def create_vectorstore_with_batching(documents, batch_size=100):  # Reduced batch size
    """Embed ``documents`` and add them to the "job_profiles" collection in batches.

    Uses the module-level Chroma ``client`` and the "thenlper/gte-small"
    embedding model. Progress and the running collection count are printed
    after every batch.

    Args:
        documents: Sequence of documents to index.
        batch_size: Number of documents per ``add_documents`` call; must be >= 1.

    Returns:
        The Chroma vector store. Indexing is best-effort: if a batch fails the
        error is printed and the store is returned with whatever was added
        before the failure.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail fast: a zero step makes range() raise (previously reported as a
    # generic processing error) and a negative step silently indexes nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

    vectorstore = Chroma(
        client=client,
        embedding_function=embeddings,
        collection_name="job_profiles"
    )

    try:
        for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
            batch = documents[start:start + batch_size]
            print(f"Processing batch {batch_number}, size: {len(batch)}")

            vectorstore.add_documents(documents=batch)

            # Verification step: report how many entries the collection now holds.
            current_count = vectorstore._collection.count()
            print(f"Current document count: {current_count}")

    except Exception as e:
        # Deliberately best-effort: report the failure and return the partial store.
        print(f"Error during processing: {str(e)}")

    return vectorstore

# loader = CSVLoader(file_path="../data/job profiles/2025-02-07_profiles.csv", content_columns=["title", "overview"], encoding="utf-8-sig")
# documents = loader.load()


# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
#     separators=["\n\n", "\n", " ", ""]
# )

# Measure chunk length in tokens with the tokenizer of the model that
# create_vectorstore_with_batching uses for embeddings ("thenlper/gte-small").
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=512,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
)

# To index whole profiles without splitting, use `chunks = documents` instead.
chunks = text_splitter.split_documents(documents)

print(f"Documents adding: {len(chunks)}")
print('creating vector store..')
vectorstore = create_vectorstore_with_batching(chunks)

print(f"Collection count: {vectorstore._collection.count()}")

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Documents adding: 2097
creating vector store..


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing batch 1, size: 100
Current document count: 100
Processing batch 2, size: 100
Current document count: 200
Processing batch 3, size: 100
Current document count: 300
Processing batch 4, size: 100
Current document count: 400
Processing batch 5, size: 100
Current document count: 500
Processing batch 6, size: 100
Current document count: 600
Processing batch 7, size: 100
Current document count: 700
Processing batch 8, size: 100
Current document count: 800
Processing batch 9, size: 100
Current document count: 900
Processing batch 10, size: 100
Current document count: 1000
Processing batch 11, size: 100
Current document count: 1100
Processing batch 12, size: 100
Current document count: 1200
Processing batch 13, size: 100
Current document count: 1300
Processing batch 14, size: 100
Current document count: 1400
Processing batch 15, size: 100
Current document count: 1500
Processing batch 16, size: 100
Current document count: 1600
Processing batch 17, size: 100
Current document count: 170