In [2]:
from pymongo import MongoClient
import pandas as pd

# --- MongoDB connection settings ---------------------------------------------
MONGO_URI = "mongodb://localhost:27017/"  # where the MongoDB server listens
DB_NAME = "jobs_data"                     # database to open
COLLECTION_NAME = "jobs"                  # collection read and updated by later cells

# Open the client once and resolve the database/collection handles that the
# rest of the notebook uses through the module-level names below.
client = MongoClient(MONGO_URI)
db = client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [None]:
# Step 2: Define the fields to extract.
# This is a MongoDB projection document: every listed field is included (1)
# and _id is explicitly excluded (0).
relevant_fields = {
    **dict.fromkeys(
        (
            "name",                  # job title
            "text",                  # job posting text
            "position.careerLevel",  # career level
            "json",                  # structured JSON data
            "html",                  # raw HTML
            "salary.value",          # salary value
            "position.department",   # department
            "orgAddress.city",       # city
            "orgTags",               # tags
        ),
        1,
    ),
    "_id": 0,  # leave MongoDB's _id out of the results (optional)
}

# # Step 3: Query MongoDB for a random sample of 1000 records
# pipeline = [
#     {"$sample": {"size": 1000}},  # Randomly select 1000 records
#     {"$project": relevant_fields}  # Include only the specified fields
# ]

# # Execute the aggregation query and fetch results as a list
# sample_records = list(collection.aggregate(pipeline))

# # Step 4: Load the sample into a pandas DataFrame
# df = pd.DataFrame(sample_records)

# # Print basic info about the DataFrame
# print(f"Retrieved {len(df)} records")
# print("DataFrame Info:")
# print(df.info())
# print("\nFirst few rows:")
# print(df.head())

# # Close the MongoDB connection
# client.close()

In [3]:
import pandas as pd
import requests
import json

# LM Studio API endpoint (adjust if your port differs)
API_URL = "http://127.0.0.1:1234/v1/completions"

# Prompt template to extract core job role.
# It lives under a role-specific name because a later cell in this notebook
# rebinds the generic PROMPT_TEMPLATE to a different prompt; get_core_position
# resolves the template at call time and must not pick that one up.
POSITION_PROMPT_TEMPLATE = """
Given the following job title, extract the core job role, removing any extraneous details such as company names, locations, salaries, incentives, or schedules. Return only the core position as a concise phrase.

Job Title: "{}"

Core Job Role:
"""
# Backward-compatible alias for anything that still reads the generic name.
PROMPT_TEMPLATE = POSITION_PROMPT_TEMPLATE


def get_core_position(title, api_url=API_URL, timeout=60):
    """
    Send a job title to LM Studio and extract the core position using LLaMA 3.2.

    Args:
        title: Raw job title. A missing or blank title short-circuits to
            "Unknown" without contacting the API.
        api_url: Completions endpoint to POST to.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The extracted core role as a string, or "Unknown" when the title is
        blank, the request fails, the response is malformed, or the model
        returns nothing usable. Errors are printed, never raised.
    """
    # Nothing to extract from a missing/blank title; skip the round trip.
    if title is None or not str(title).strip():
        return "Unknown"

    # API payload
    payload = {
        "prompt": POSITION_PROMPT_TEMPLATE.format(title),
        "model": "llama-3.2-3b-instruct",  # Adjust if your model name differs in LM Studio
        "max_tokens": 50,      # Limit response length
        "temperature": 0.3,    # Low temperature for precise answers
        "stop": ["\n\n"]       # Stop at double newline to isolate the core role
    }

    try:
        response = requests.post(
            api_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise exception for bad status codes
        text = response.json()["choices"][0]["text"].strip()
    except Exception as e:
        # Deliberately best-effort: network errors and malformed responses are
        # reported and mapped to the "Unknown" sentinel instead of raising.
        print(f"Error processing '{title}': {e}")
        return "Unknown"

    # The model sometimes echoes the label; keep only what follows it.
    if "Core Job Role:" in text:
        core_role = text.split("Core Job Role:")[1].strip()
    else:
        core_role = text  # Fallback to full text if format is off

    return core_role if core_role else "Unknown"

In [None]:
def process_mongo_records():
    """
    Process records one by one from MongoDB, skipping those with core_position.
    Update the database with core_position for others.

    Relies on the module-level `collection` handle and on `get_core_position`.
    Records for which extraction yields the "Unknown" sentinel are NOT written
    back: a stored "Unknown" is truthy, so it would make every later run skip
    the record and a single run against an unreachable model server would mark
    the whole collection as done.

    Note: this notebook defines another function with the same name in a later
    cell; whichever definition was executed last is the one a call resolves to.

    Returns:
        None. Progress and results are reported with print().
    """
    # Fetch only the fields this loop reads (_id is included by default).
    cursor = collection.find({}, {"name": 1, "core_position": 1})

    total_records = collection.count_documents({})
    processed = 0
    failed = 0

    for record in cursor:
        record_id = record["_id"]
        job_title = record.get("name", "Unknown")

        # Skip if core_position already exists (missing or falsy means "to do")
        if record.get("core_position"):
            print(f"Skipping {job_title} (ID: {record_id}) - core_position already exists")
            processed += 1
            continue

        # Extract core position
        core_position = get_core_position(job_title)
        processed += 1

        if core_position == "Unknown":
            # Leave the field unset so the record is retried on the next run.
            failed += 1
            print(f"Could not extract core position for {job_title} (ID: {record_id}) - left unset for retry")
        else:
            # Update the document in MongoDB
            collection.update_one(
                {"_id": record_id},
                {"$set": {"core_position": core_position}}
            )
            print(f"Processed {job_title} (ID: {record_id}) - Core Position: {core_position}")

        # Optional: Progress tracking
        if processed % 100 == 0:
            print(f"Progress: {processed}/{total_records} records processed")

    print(f"Finished processing {processed}/{total_records} records")
    if failed:
        print(f"{failed} records could not be extracted and were left for a later run")

# Run the processing over the whole collection. This calls the extractor once
# for every record that still lacks core_position and writes the extracted
# values back to MongoDB, so it can take a long time on a large collection.
process_mongo_records()

Skipping Account Manager (ID: 6457879fd1187d621cbbba9c) - core_position already exists
Skipping Music Teacher, Band Director (ID: 6457887cd1187d621cbbbae1) - core_position already exists
Skipping Floral Clerk FT Chapel Hill NC (ID: 6457889ad1187d621cbbbb01) - core_position already exists
Skipping Audio and Video Transcription - Flexible Schedules (Lansing) (ID: 645788afd1187d621cbbbb23) - core_position already exists
Skipping Sticker Production Assistant (ID: 645788b4d1187d621cbbbb25) - core_position already exists
Skipping Specifications Writer (ID: 645788b4d1187d621cbbbb26) - core_position already exists
Skipping Social Media Evaluator - Dynamic Work Opportunity (Mc Kees Rocks) (ID: 645788bcd1187d621cbbbb29) - core_position already exists
Skipping Photo Color Correction Specialist (ID: 645788bcd1187d621cbbbb2a) - core_position already exists
Skipping Service Writer (ID: 645788bed1187d621cbbbb2b) - core_position already exists
Skipping UAF CLA Term Instructor of Piano (ID: 645788c1d11

KeyboardInterrupt: 

In [11]:
import json
import re

import requests

# LM Studio API endpoint (adjust if your port differs)
API_URL = "http://localhost:1234/v1/completions"

# Prompt template to extract core skills.
# It lives under a skills-specific name because an earlier cell in this
# notebook binds the generic PROMPT_TEMPLATE to a different prompt;
# get_core_skills resolves the template at call time and must always get this one.
SKILLS_PROMPT_TEMPLATE = """
Given the following job description, extract the core skills required, removing any extraneous details such as company names, locations, salaries, incentives, schedules, or unrelated text. Return only the core skills as a concise list of short phrases (e.g., "good communication", "project management", "Python"), avoiding full sentences or detailed explanations. Each skill should only be 1-3 words

Job Description: "{}"

Core Skills:
"""
# Backward-compatible alias for anything that still reads the generic name.
PROMPT_TEMPLATE = SKILLS_PROMPT_TEMPLATE

# A leading list marker: "1." / "2)" followed by whitespace (or end of text),
# or a run of bullet characters. Anchored so that only a real marker is
# removed; skills such as "3D modeling" or ".NET" keep their first characters.
_LIST_MARKER_RE = re.compile(r"^(?:\d+[.)](?=\s|$)|[-*•]+)\s*")


def _parse_skill_list(skills_text):
    """Split raw model output into clean skill phrases (newline- or comma-separated)."""
    skills = []
    for piece in re.split(r"[\n,]", skills_text):
        piece = _LIST_MARKER_RE.sub("", piece.strip())
        # The prompt's own example uses quoted phrases, so the model may echo quotes.
        piece = piece.strip().strip("\"'").strip()
        if piece:
            skills.append(piece)
    return skills


def get_core_skills(description, api_url=API_URL, timeout=60):
    """
    Send a job description to LM Studio and extract the core skills as a clean list.

    Args:
        description: Job description text. A missing or blank description
            short-circuits to ["Unknown"] without contacting the API.
        api_url: Completions endpoint to POST to.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        A list of skill phrases with list numbering, bullets and surrounding
        quotes removed, or ["Unknown"] when the description is blank, the
        request fails, the response is malformed, or no skill could be parsed.
        Errors are printed, never raised.
    """
    # Nothing to extract from a missing/blank description; skip the round trip.
    if description is None or not str(description).strip():
        return ["Unknown"]

    # API payload
    payload = {
        "prompt": SKILLS_PROMPT_TEMPLATE.format(description),
        "model": "llama-3.2-3b-instruct",  # Adjust if your model name differs
        "max_tokens": 50,      # Limit response length
        "temperature": 0.3,    # Low temperature for precise answers
        "stop": ["\n\n"]       # Stop at double newline
    }

    try:
        response = requests.post(
            api_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()
        text = response.json()["choices"][0]["text"].strip()
    except Exception as e:
        # Deliberately best-effort: network errors and malformed responses are
        # reported and mapped to the ["Unknown"] sentinel instead of raising.
        print(f"Error processing description: {e}")
        return ["Unknown"]

    # The model sometimes echoes the label; keep only what follows it.
    if "Core Skills:" in text:
        skills_text = text.split("Core Skills:")[1].strip()
    else:
        skills_text = text  # Fallback if format is unexpected

    skills = _parse_skill_list(skills_text)
    return skills if skills else ["Unknown"]

In [12]:
def process_mongo_records():
    """
    Process records one by one from MongoDB, skipping those with core_skills.
    Update the database with core_skills for others.

    Relies on the module-level `collection` handle and on `get_core_skills`.
    The skills are extracted from the posting body rather than the title:
    `get_core_skills` prompts the model with a job description, and a bare
    title gives it nothing to extract from. The title is used only as a
    fallback when a record has no body text.

    Records for which extraction yields the ["Unknown"] sentinel are NOT
    written back: a stored ["Unknown"] is truthy, so it would make every later
    run skip the record.

    Note: this notebook defines another function with the same name in an
    earlier cell; whichever definition was executed last is the one a call
    resolves to.

    Returns:
        None. Progress and results are reported with print().
    """
    # Fetch only the fields this loop reads (_id is included by default).
    cursor = collection.find({}, {"name": 1, "text": 1, "core_skills": 1})

    total_records = collection.count_documents({})
    processed = 0
    failed = 0

    for record in cursor:
        record_id = record["_id"]
        job_title = record.get("name", "Unknown")

        # Skip if core_skills already exists (missing or falsy means "to do")
        if record.get("core_skills"):
            print(f"Skipping {job_title} (ID: {record_id}) - core_skills already exists")
            processed += 1
            continue

        # Assumes the posting body is stored under "text", as the field
        # comments on this notebook's projection indicate - TODO confirm.
        description = record.get("text") or job_title

        # Extract core skills
        core_skills = get_core_skills(description)
        processed += 1

        if core_skills == ["Unknown"]:
            # Leave the field unset so the record is retried on the next run.
            failed += 1
            print(f"Could not extract core skills for {job_title} (ID: {record_id}) - left unset for retry")
        else:
            # Update the document in MongoDB
            collection.update_one(
                {"_id": record_id},
                {"$set": {"core_skills": core_skills}}
            )
            print(f"Processed {job_title} (ID: {record_id}) - Core Skills: {core_skills}")

        # Optional: Progress tracking
        if processed % 100 == 0:
            print(f"Progress: {processed}/{total_records} records processed")

    print(f"Finished processing {processed}/{total_records} records")
    if failed:
        print(f"{failed} records could not be extracted and were left for a later run")

# Run the processing over the whole collection. This calls the extractor once
# for every record that still lacks core_skills and writes the extracted
# values back to MongoDB, so it can take a long time on a large collection.
process_mongo_records()

Skipping Account Manager (ID: 6457879fd1187d621cbbba9c) - core_skills already exists
Skipping Music Teacher, Band Director (ID: 6457887cd1187d621cbbbae1) - core_skills already exists
Processed Floral Clerk FT Chapel Hill NC (ID: 6457889ad1187d621cbbbb01) - Core Skills: ['Good communication', 'Basic math', 'Basic computer skills', 'Cash handling', 'Customer service', 'Time management', 'Teamwork', 'Project management', 'Data entry', 'Microsoft Office', 'Python programming', 'Inventory management']


KeyboardInterrupt: 