In [1]:
!pip install requests




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
import requests
import json
from pathlib import Path

API_BASE_URL = "http://localhost:9200"
INGEST_ENDPOINT = f"{API_BASE_URL}/ingest"

# Load and test with just one candidate
data_dir = Path("../data")
sample_file = data_dir / "sample_candidates.json"


if sample_file.exists():
    with open(sample_file, 'r', encoding='utf-8') as f:
        candidates = json.load(f)
    
    # Test with first candidate only
    #print(f"Testing with 1 candidate (ID: {test_candidate[0].get('candidateId', 'N/A')})")
    #print(f"Sample structure - resumeData keys: {list(test_candidate[0].get('resumeData', {}).keys())}")
        
    for ix, candidate in enumerate(candidates[:1]):
        test_candidate = candidate
        if ix//10 == 0:
            print(f"Processing candidate {ix} of {len(candidates)}")
        try:
            response = requests.post(
                INGEST_ENDPOINT,
                json=test_candidate,
                params={"collection_name": "candidates_store"},
                headers={"Content-Type": "application/json"},
                timeout=60
            )
            
            response.raise_for_status()
            result = response.json()
            print(f"\n✓ Success! Test candidate ingested.")

            print(f"Result: {json.dumps(result, indent=2)}")
            
        except requests.exceptions.RequestException as e:
            print(f"\n✗ Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Status: {e.response.status_code}")
                try:
                    error = e.response.json()
                    print(f"Error: {json.dumps(error, indent=2)}")
                except:
                    print(f"Response: {e.response.text}")
else:
    print("Sample file not found")


Processing candidate 0 of 100

✓ Success! Test candidate ingested.
Result: {
  "status": "accepted",
  "message": "Ingestion started in background for 1 candidates",
  "collection_name": "candidates_store",
  "total_candidates": 1,
  "note": "Check logs for final results and detailed progress"
}


In [4]:
import requests
import json
from pathlib import Path

API_BASE_URL = "http://localhost:9100"
INGEST_ENDPOINT = f"{API_BASE_URL}/ingest"

# Load and test with all candidates in array
data_dir = Path("../data")
# sample_file = data_dir / "sample_candidates.json"
sample_file = data_dir / "candidates_1000.json"

if sample_file.exists():
    with open(sample_file, 'r', encoding='utf-8') as f:
        candidates = json.load(f)
    
    print(f"Testing with {len(candidates)} candidates")
    if candidates:
        print(f"First candidate ID: {candidates[0].get('candidateId', 'N/A')}")
        print(f"Sample structure - resumeData keys: {list(candidates[0].get('resumeData', {}).keys())}")
    
    try:
        response = requests.post(
            INGEST_ENDPOINT,
            json=candidates,
            params={"collection_name": "candidates_store"},
            headers={"Content-Type": "application/json"},
            timeout=60
        )
        
        response.raise_for_status()
        result = response.json()
        print(f"\n✓ Success! {len(candidates)} candidates ingested.")

        print(f"Result: {json.dumps(result, indent=2)}")
        
    except requests.exceptions.RequestException as e:
        print(f"\n✗ Error: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Status: {e.response.status_code}")
            try:
                error = e.response.json()
                print(f"Error: {json.dumps(error, indent=2)}")
            except:
                print(f"Response: {e.response.text}")
else:
    print("Sample file not found")


Testing with 1000 candidates
First candidate ID: 8284346
Sample structure - resumeData keys: ['ContactInformation', 'ProfessionalSummary', 'Objective', 'CoverLetter', 'PersonalAttributes', 'Education', 'EmploymentHistory', 'SkillsData', 'Skills', 'Certifications', 'Licenses', 'Associations', 'LanguageCompetencies', 'MilitaryExperience', 'SecurityCredentials', 'References', 'Achievements', 'Training', 'QualificationsSummary', 'Hobbies', 'Patents', 'Publications', 'SpeakingEngagements', 'ResumeMetadata', 'UserDefinedTags']

✗ Error: HTTPConnectionPool(host='localhost', port=9100): Read timed out. (read timeout=60)


In [None]:
import requests
import json

API_BASE_URL = "http://localhost:9100"
GET_COLLECTION_INFO_ENDPOINT = f"{API_BASE_URL}/collections/candidates_store"

try:
    response = requests.get(
        GET_COLLECTION_INFO_ENDPOINT,
        headers={"Content-Type": "application/json"},
        timeout=30
    )
    response.raise_for_status()
    result = response.json()
    print("✓ Success! Collection info fetched.")
    print(f"Result: {json.dumps(result, indent=2)}")
except requests.exceptions.RequestException as e:
    print(f"\n✗ Error: {e}")
    if hasattr(e, 'response') and e.response is not None:
        print(f"Status: {e.response.status_code}")
        try:
            error = e.response.json()
            print(f"Error: {json.dumps(error, indent=2)}")
        except:
            print(f"Response: {e.response.text}")


In [22]:
filter = result['candidate_ids']

In [20]:
for c in candidates[:5]:
    print(c['candidateId'])

8284346
8566860
5692722
8566304
5866975


In [23]:
API_BASE_URL = "http://localhost:9100"
INGEST_ENDPOINT = f"{API_BASE_URL}/ingest"

new_candidates = []
for candidate in candidates:
    if candidate['candidateId'] not in filter:
        new_candidates.append(candidate)

print(len(new_candidates))

try:
    response = requests.post(
        INGEST_ENDPOINT,
        json=new_candidates,
        params={"collection_name": "candidates_store"},
        headers={"Content-Type": "application/json"},
        timeout=60
    )
    
    response.raise_for_status()
    result = response.json()
    print(f"\n✓ Success! {len(candidates)} candidates ingested.")

    print(f"Result: {json.dumps(result, indent=2)}")
    
except requests.exceptions.RequestException as e:
    print(f"\n✗ Error: {e}")
    if hasattr(e, 'response') and e.response is not None:
        print(f"Status: {e.response.status_code}")
        try:
            error = e.response.json()
            print(f"Error: {json.dumps(error, indent=2)}")
        except:
            print(f"Response: {e.response.text}")

0

✗ Error: 400 Client Error: Bad Request for url: http://localhost:9100/ingest?collection_name=candidates_store
Status: 400
Error: {
  "detail": "No candidates provided"
}


In [51]:
data_dir = Path("../data")
sample_file = data_dir / "final_data.json"
if sample_file.exists():
    with open(sample_file, 'r', encoding='utf-8') as f:
        all_candidates = json.load(f)

In [52]:
len(all_candidates)

99996

In [50]:
API_BASE_URL = "http://localhost:9200"
INGEST_ENDPOINT = f"{API_BASE_URL}/ingest"

response = requests.post(
            INGEST_ENDPOINT,
            json=candidates[0],
            params={"collection_name": "candidates_store"},
            headers={"Content-Type": "application/json"},
            timeout=60
        )

In [74]:
API_BASE_URL = "http://localhost:9200"
INGEST_ENDPOINT = f"{API_BASE_URL}/ingest"
candidates = all_candidates[:1000]
if True:
    print(f"Testing with {len(candidates)} candidates")
    if candidates:
        print(f"First candidate ID: {candidates[0].get('candidateId', 'N/A')}")
        print(f"Sample structure - resumeData keys: {list(candidates[0].get('resumeData', {}).keys())}")
    
    try:
        response = requests.post(
            INGEST_ENDPOINT,
            json=upsert_cand_obj,
            params={"collection_name": "candidates_store"},
            headers={"Content-Type": "application/json"},
            timeout=60
        )
        
        response.raise_for_status()
        result = response.json()
        print(f"\n✓ Success! {len(candidates)} candidates ingested.")

        print(f"Result: {json.dumps(result, indent=2)}")
        
    except requests.exceptions.RequestException as e:
        print(f"\n✗ Error: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Status: {e.response.status_code}")
            try:
                error = e.response.json()
                print(f"Error: {json.dumps(error, indent=2)}")
            except:
                print(f"Response: {e.response.text}")
else:
    print("Sample file not found")


Testing with 1000 candidates
First candidate ID: N/A
Sample structure - resumeData keys: []

✓ Success! 1000 candidates ingested.
Result: {
  "total": 14,
  "successful": 4,
  "failed": 10,
  "results": [
    {
      "candidate_id": "2046354350783",
      "success": false,
      "vector_id": null,
      "error": "Candidate already exists in collection (omit mode)"
    },
    {
      "candidate_id": "12572869158483",
      "success": false,
      "vector_id": null,
      "error": "Candidate already exists in collection (omit mode)"
    },
    {
      "candidate_id": "13777220638865",
      "success": false,
      "vector_id": null,
      "error": "Candidate already exists in collection (omit mode)"
    },
    {
      "candidate_id": "16460284648083",
      "success": false,
      "vector_id": null,
      "error": "Candidate already exists in collection (omit mode)"
    },
    {
      "candidate_id": "17301398855299",
      "success": false,
      "vector_id": null,
      "error": "Candi

In [63]:
jobs_candidates = data_dir / "filtered_candidates.json"
if jobs_candidates.exists():
    with open(jobs_candidates, 'r', encoding='utf-8') as f:
        jobs_candidates = json.load(f)


In [66]:
upsert_cand = set()
for cand in jobs_candidates:
    upsert_cand.add(cand['ats_candidate_id'])

In [70]:
upsert_cand_obj = [can for can in all_candidates if can['candidate_id'] in upsert_cand]

[{'candidate_id': '2046354350783',
  'client_id': None,
  'email': 'tarasavino@yahoo.com',
  'first_name': 'Tamara Dawn',
  'last_name': 'Savino',
  'primary_role_code': None,
  'current_role': None,
  'skills': [],
  'country': 'US',
  'city': 'Atlanta',
  'address1': '3060 Pharr Court North NW, Unit 822',
  'address2': '',
  'workphone': '',
  'homephone': '',
  'cellphone': '4045673706',
  'phone1': '',
  'phone2': '4045673706',
  'phone3': '',
  'phone4': '',
  'alternateemail': '',
  'years_experience_total': 28.5,
  'title_exp': [{'title': 'Artech', 'exp': '07/2024 - 03/2025', 'months': 8},
   {'title': 'IBM Security Services',
    'exp': '06/2015 - 03/2025',
    'months': 117},
   {'title': 'NextEra Energy. Florida Power Gas & Light / IBM Cybersecurity PM',
    'exp': '01/2024 - 06/2024',
    'months': 5},
   {'title': 'COX Manheim - Technology Solutions, Alpharetta GA',
    'exp': '03/2014 - 03/2015',
    'months': 12},
   {'title': 'McKesson Corporation - Technology Solutions,