In [1]:
import weaviate
from weaviate.classes.init import Auth
import requests, json, os

# Credentials are read from environment variables so that no secret is stored in
# the notebook. A missing variable raises KeyError naming the variable, which
# fails fast here instead of sending an empty credential to the service.
# NOTE: an API key was previously hardcoded in this cell; treat that key as
# compromised and rotate it.
wcd_url = os.environ["WEAVIATE_URL"]
wcd_api_key = os.environ["WEAVIATE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

# Connect to the Weaviate Cloud cluster. The OpenAI key is forwarded as a request
# header for the OpenAI-backed integrations configured on the collection.
# Call client.close() once the notebook is done with the connection.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,
    auth_credentials=Auth.api_key(wcd_api_key),
    headers={
        "X-OpenAI-Api-Key": openai_api_key
    }
)



In [12]:
from weaviate.classes.config import Configure

# Create the "Candidates" collection. Both integrations configured below are
# OpenAI-backed and rely on the "X-OpenAI-Api-Key" header passed when the client
# was created.
# NOTE(review): creating a collection whose name already exists is expected to
# fail, so re-running this cell against the same cluster presumably requires
# removing the old collection first — confirm.
candidates = client.collections.create(
    name="Candidates",
    vectorizer_config=Configure.Vectorizer.text2vec_openai(),  # OpenAI embeddings integration (vectorizer)
    generative_config=Configure.Generative.openai(),  # OpenAI generative AI integration
)

In [13]:
# Load the raw form submissions.
# assumes the file contains a JSON array of submission objects — TODO confirm
with open("form-submissions.json", "r", encoding="utf-8") as f:
    data = json.load(f)


def _candidate_properties(record):
    """Flatten one raw submission into the property dict stored in Weaviate.

    Optional fields that are missing *or* explicitly null in the JSON fall back
    to empty defaults, so a single sparse record cannot abort the whole import.

    Args:
        record: One submission dict from the loaded JSON data.

    Returns:
        A dict of properties ready to be passed to ``batch.add_object``.
    """
    # `or` (rather than a .get default) also covers keys present with a null value.
    work_experiences = record.get("work_experiences") or []
    education = record.get("education") or {}
    degrees = education.get("degrees") or []
    salary_by_type = record.get("annual_salary_expectation") or {}
    skills = record.get("skills") or []

    # The first list entry is used as the "primary" one.
    # presumably the lists are ordered most-recent / highest first — verify against the data
    primary_work_experience = work_experiences[0] if work_experiences else {}
    primary_degree = degrees[0] if degrees else {}

    # True when any listed degree is flagged as a top-25 or top-50 school.
    is_top_school = any(
        degree.get("isTop50", False) or degree.get("isTop25", False)
        for degree in degrees
    )

    return {
        "name": record.get("name", ""),
        "email": record.get("email", ""),
        "phone": record.get("phone", ""),
        "location": record.get("location", ""),
        "submitted_at": record.get("submitted_at", ""),
        "work_availability": record.get("work_availability") or [],
        "salary_expectation": salary_by_type.get("full-time", ""),
        "work_experiences": work_experiences,
        "current_company": primary_work_experience.get("company", ""),
        "current_role": primary_work_experience.get("roleName", ""),
        "education_highest_level": education.get("highest_level", ""),
        "education_degrees": degrees,
        "primary_degree_subject": primary_degree.get("subject", ""),
        "primary_degree_school": primary_degree.get("originalSchool", ""),
        "skills": skills,
        # Comma-separated copy of the skills list, kept for text searchability.
        "skills_text": ", ".join(skills),
        "is_top_school": is_top_school,
    }


queued_imports = 0
skipped_empty_names = 0
skipped_no_contact = 0

with candidates.batch.dynamic() as batch:
    for d in data:
        # VALIDATION 1: Name is required
        if not d.get("name"):
            skipped_empty_names += 1
            continue

        # VALIDATION 2: Either phone OR email is required
        if not (d.get("phone") or d.get("email")):
            skipped_no_contact += 1
            continue

        # Adding to the batch only queues the object; failures are reported
        # separately once the batch has been sent.
        batch.add_object(_candidate_properties(d))
        queued_imports += 1

# Objects reported as failed by the batch were queued but never stored, so they
# are excluded from the success count.
failed_objects = candidates.batch.failed_objects
successful_imports = queued_imports - len(failed_objects)

# Print summary
print("Import complete!")
print(f"Successfully imported: {successful_imports} candidates")
print(f"Skipped due to empty names: {skipped_empty_names}")
print(f"Skipped due to missing contact info: {skipped_no_contact}")

if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Import complete!
Successfully imported: 930 candidates
Skipped due to empty names: 45
Skipped due to missing contact info: 0
