In [1]:
import json
from pathlib import Path
from collections import Counter
import numpy as np

def analyze_jd_lengths(json_path: str, bin_size: int = 1000):
    lengths = []

    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                jd = json.loads(line)
                combined = " ".join(str(v) for v in jd.values() if isinstance(v, str))
                lengths.append(len(combined))
            except json.JSONDecodeError:
                continue

    max_length = max(lengths)
    avg_length = sum(lengths) / len(lengths)

    # Create buckets like 0-999, 1000-1999, ...
    bins = [((l // bin_size) * bin_size) for l in lengths]
    bin_counts = Counter(bins)

    # Convert to readable format
    sorted_bins = sorted(bin_counts.items())
    bucket_labels = [f"{start}-{start + bin_size - 1}" for start, _ in sorted_bins]
    counts = [count for _, count in sorted_bins]

    return {
        "max_length": max_length,
        "average_length": avg_length,
        "bucket_summary": dict(zip(bucket_labels, counts))
    }

# Run the analysis
result = analyze_jd_lengths("json_outputs_all_data/parsed_jds.json")
result


{'max_length': 3627,
 'average_length': 50.51864770809615,
 'bucket_summary': {'0-999': 123697,
  '1000-1999': 81,
  '2000-2999': 39,
  '3000-3999': 32}}

In [4]:
import json
from pathlib import Path
from collections import Counter

def classify_jd_length(length: int) -> str:
    if length < 2000:
        return "Short"
    elif length <= 8000:
        return "Normal"
    else:
        return "Long"

def analyze_jd_lengths_pure(json_path: str):
    lengths = []
    category_counter = Counter()
    bucket_counter = Counter()

    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                jd = json.loads(line)
                combined = " ".join(str(v) for v in jd.values() if isinstance(v, str))
                print(combined)
                print("Length of JD:", len(combined))
                length = len(combined)
                lengths.append(length)
                
                category = classify_jd_length(length)
                category_counter[category] += 1
                
                bucket = f"{(length // 1000) * 1000}-{((length // 1000) + 1) * 1000 - 1}"
                bucket_counter[bucket] += 1

            except json.JSONDecodeError:
                continue

    stats = {
        "max_length": max(lengths),
        "average_length": round(sum(lengths) / len(lengths), 2),
        "count_by_length_category": dict(category_counter),
        "count_by_bucket": dict(sorted(bucket_counter.items()))
    }

    return stats

# Run analysis
analyze_jd_lengths_pure("json_outputs_run1/parsed_jds.json")


Social Media Marketing Specialist Advertising Services Dental Revenue United States Marketing, Sales full-time Dental Revenue is searching for a talented and energetic Social Media Specialist to manage and administer the social media accounts of our clients and assist our internal marketing department.An ideal candidate will have a strong understanding of social media marketing be extremely organized, and have excellent energy and communication skills. As a Social Media Specialist, we expect you to be updated with the latest technologies and social media trends. While this is primarily an in-office position, there will be extensive interaction with clients both via web meetings and in their dental offices.
Responsibilities includeManage client social media accounts by developing, overseeing, and implementing strategiesCreate, manage, and maintain social media accounts for clients, posting to platforms, organic and paid, including Facebook, Instagram, TikTok, YouTube, etc.Ensuring all c

{'max_length': 9174,
 'average_length': 5110.27,
 'count_by_length_category': {'Normal': 55, 'Long': 5},
 'count_by_bucket': {'2000-2999': 8,
  '3000-3999': 10,
  '4000-4999': 10,
  '5000-5999': 15,
  '6000-6999': 8,
  '7000-7999': 4,
  '8000-8999': 4,
  '9000-9999': 1}}