In [51]:
from pymongo import MongoClient
import pandas as pd
import requests
import json

# Endpoints and names shared by the rest of the notebook; edit to match your setup.
API_URL = "http://127.0.0.1:1234/v1/completions"  # default endpoint call_llm posts to
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "jobs_data"
COLLECTION_NAME = "jobs"

# A single client for the whole notebook; `collection` is the handle the
# extraction functions and the reporting cell read from and write to.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

In [52]:
# # Step 2: Define the fields to extract
# relevant_fields = {
#     "name": 1,                    # Job title
#     "text": 1,                    # Job posting text
#     "position.careerLevel": 1,    # Career level
#     "json": 1,                    # Structured JSON data
#     "html": 1,                    # Raw HTML
#     "salary.value": 1,            # Salary value
#     "position.department": 1,     # Department
#     "orgAddress.city": 1,         # City
#     "orgTags": 1,                 # Tags
#     "_id": 0                     # Exclude the MongoDB _id field (optional)
# }

# # Step 3: Query MongoDB for a random sample of 1000 records
# pipeline = [
#     {"$sample": {"size": 1000}},  # Randomly select 1000 records
#     {"$project": relevant_fields}  # Include only the specified fields
# ]

# # Execute the aggregation query and fetch results as a list
# sample_records = list(collection.aggregate(pipeline))

# # Step 4: Load the sample into a pandas DataFrame
# df = pd.DataFrame(sample_records)

# # Print basic info about the DataFrame
# print(f"Retrieved {len(df)} records")
# print("DataFrame Info:")
# print(df.info())
# print("\nFirst few rows:")
# print(df.head())

# # Close the MongoDB connection
# client.close()

In [53]:
def _text_after_label(text, label):
    """Return the stripped text that follows ``label``; ``text`` unchanged if ``label`` is absent."""
    if label in text:
        return text.split(label)[1].strip()
    return text


def _parse_skill_lines(skills_text):
    """Split an LLM answer into one skill per non-empty line, dropping list markers.

    Only genuine list markers are removed (``-``, ``*``, ``•``, ``1.``, ``2)``),
    so skills that legitimately start with a digit or a dot ("3D modeling",
    ".NET") survive intact.
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    # A bullet run, or a number followed by '.'/')' that is not a decimal
    # like "1.5"; repeated so that "- 1. Python" is fully cleaned.
    list_marker = re.compile(r"^(?:[-*•]+\s*|\d+[.)](?!\d)\s*)+")

    skills = []
    for raw_line in skills_text.split("\n"):
        cleaned = list_marker.sub("", raw_line.strip()).strip()
        if cleaned:
            skills.append(cleaned)
    return skills


def call_llm(prompt_template, message, mode, api_url=API_URL, timeout=120):
    """Send one completion request to the LLM and parse the answer.

    Args:
        prompt_template: ``str.format`` template with a single ``{}`` placeholder.
        message: Value substituted into the template (job title or description).
        mode: ``"ROLE"`` to get a single role string, ``"SKILL"`` to get a
            list of skill strings.
        api_url: Completions endpoint to POST to.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled server cannot block the whole extraction run.

    Returns:
        ``"ROLE"``: the text after ``"Core Job Role:"`` (or the whole answer if
        that label is missing); ``""`` on any request/parse error.
        ``"SKILL"``: list of cleaned skill strings taken from the text after
        ``"Core Skills:"``; ``[]`` on any request/parse error.

    Raises:
        ValueError: if ``mode`` is neither ``"ROLE"`` nor ``"SKILL"``. Raised
            before any request is sent.
    """
    if mode not in ("ROLE", "SKILL"):
        raise ValueError(f"Unsupported mode: {mode!r} (expected 'ROLE' or 'SKILL')")

    prompt = prompt_template.format(message)

    # API payload
    payload = {
        "prompt": prompt,
        "model": "llama-3.2-3b-instruct",
        "max_tokens": 100,
        "temperature": 0.2,
        "stop": ["\n\n"]
    }
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        response = requests.post(
            api_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an error for bad responses

        text = response.json()["choices"][0]["text"].strip()

        if mode == "ROLE":
            return _text_after_label(text, "Core Job Role:")

        # mode == "SKILL"
        return _parse_skill_lines(_text_after_label(text, "Core Skills:"))

    except Exception as e:
        # Best effort: one bad record must not abort a long batch run, so
        # report the problem and hand back the mode's "empty" value.
        print(f"Error processing record: {e}")
        return [] if mode == "SKILL" else ""

In [54]:
# Prompt template to extract core job role
JOB_PROMPT_TEMPLATE = """
Given the following job title, extract the core job role, removing any extraneous details such as company names, locations, salaries, incentives, or schedules. Return only the core position as a concise phrase.

Job Title: "{}"

Core Job Role:
"""

def extract_core_job(batch_size=25):
    """Fill in ``core_position`` for every job document that does not have one yet.

    Walks the whole collection, skips documents whose ``core_position`` is
    already set to a truthy value, asks the LLM (via ``call_llm`` in ``"ROLE"``
    mode) to normalise the ``name`` field, and stores the answer with
    ``$set``. Documents for which the LLM returns nothing are reported and
    left unchanged, so a later run will try them again.

    Args:
        batch_size: Number of documents fetched per round trip. Kept small
            because each document triggers a slow LLM call; with large batches
            the server-side cursor can sit idle long enough to be closed
            before the next fetch.
    """
    # Only the title and the already-computed value are read below, so do not
    # pull the remaining (potentially large) fields over the wire.
    cursor = collection.find(
        {}, {"name": 1, "core_position": 1}
    ).batch_size(batch_size)

    total_records = collection.count_documents({})
    processed = 0

    for record in cursor:
        record_id = record["_id"]
        job_title = record.get("name", "Unknown")

        # Skip if core_position already exists
        if record.get("core_position"):
            print(f"Skipping {job_title} (ID: {record_id}) - core_position already exists")
            processed += 1
            continue

        # Extract core position
        core_position = call_llm(JOB_PROMPT_TEMPLATE, job_title, "ROLE")
        if not core_position:
            print(f"Failed to extract core position for {job_title} (ID: {record_id})")
            continue

        # Update the document in MongoDB
        collection.update_one(
            {"_id": record_id},
            {"$set": {"core_position": core_position}}
        )

        processed += 1
        print(f"Processed {job_title} (ID: {record_id}) - Core Position: {core_position}")
        print(f"Progress: {processed}/{total_records} records processed")

    print(f"Finished processing {processed}/{total_records} records")

In [55]:
# Prompt template to extract core skills
SKILLS_PROMPT_TEMPLATE = """
Given the following job description, extract the core skills required, removing any extraneous details such as company names, locations, salaries, incentives, schedules, or unrelated text. Return only the core skills as a concise list of short skills, avoid soft skills (e.g., "teaching", "project management", "Python"), avoiding full sentences or detailed explanations. Each skill should only be 1-3 words

Job Description: "{}"

Core Skills:
"""

def extract_core_skills(batch_size=25):
    """Fill in ``core_skills`` for every job document that lacks the field.

    Queries only documents without a ``core_skills`` field, sends each
    document's ``text`` to the LLM (via ``call_llm`` in ``"SKILL"`` mode) and
    stores the returned list with ``$set``. Documents for which the LLM
    returns no skills are reported and left unchanged, so they are picked up
    again on the next run.

    Args:
        batch_size: Number of documents fetched per round trip. Kept small
            because each document triggers a slow LLM call; with large batches
            the server-side cursor can sit idle long enough to be closed
            before the next fetch.
    """
    # One filter for both the cursor and the total, so the two cannot drift apart.
    missing_skills = {"core_skills": {"$exists": False}}

    # Only `text` (plus the implicit `_id`) is read below; skip the other fields.
    cursor = collection.find(missing_skills, {"text": 1}).batch_size(batch_size)

    total_records = collection.count_documents(missing_skills)
    processed = 0
    print("")

    for record in cursor:
        record_id = record["_id"]
        job_description = record.get("text", "Unknown")

        # No "already has core_skills" check is needed here: the query filter
        # guarantees every document returned is missing the field.

        # Extract core skills
        core_skills = call_llm(SKILLS_PROMPT_TEMPLATE, job_description, "SKILL")
        if not core_skills:
            print(f"Failed to extract core skills for {record_id}")
            continue

        # Update the document in MongoDB
        collection.update_one(
            {"_id": record_id},
            {"$set": {"core_skills": core_skills}}
        )

        processed += 1
        print(f"\r Progress: {processed}/{total_records} records | Processed {record_id} - Core Skills: {core_skills}", end="")

    print(f"Finished processing {processed}/{total_records} records")

In [56]:
# extract_core_job()

In [57]:
# Backfill `core_skills`. The function only queries documents that still lack
# the field, so re-running this cell resumes where an earlier run stopped.
extract_core_skills()


Finished processing 0/0 records


In [58]:
# Count documents whose `core_skills` is present and neither null nor an empty list.
# `$nin` is used rather than two `$ne` entries: a Python dict literal keeps only
# the last value for a repeated key, so {"$ne": None, "$ne": []} silently
# collapses to {"$ne": []} and the null check would never reach the server.
pipeline = [
    {
        "$match": {
            "core_skills": {"$exists": True, "$nin": [None, []]}
        }
    },
    {
        "$count": "records_with_core_skills"
    }
]
result = list(collection.aggregate(pipeline))

# `$count` emits no document at all when nothing matches, hence the fallback to 0.
count_with_core_skills = result[0]["records_with_core_skills"] if result else 0

# Get total records
total_records = collection.count_documents({})

# Print results
print(f"Records with non-empty 'core_skills': {count_with_core_skills}")
print(f"Total records: {total_records}")
print(f"Records without 'core_skills' or empty: {total_records - count_with_core_skills}")

Records with non-empty 'core_skills': 33055
Total records: 33055
Records without 'core_skills' or empty: 0
