In [3]:
import pandas as pd
import random
import os

# --- 1. Define Skills (Features) ---
# A comprehensive list of skills across different domains.
ALL_SKILLS = [
    # Technology/IT
    "Python", "Java", "C++", "JavaScript", "AWS", "Azure", "Google Cloud", "Docker", "Kubernetes", "Git", "SQL", "NoSQL", "Machine Learning", "Deep Learning", "Data Analysis", "Cybersecurity", "Network Administration", "UI/UX Design", "React", "Node.js", "TensorFlow", "PyTorch", "Linux", "React Native", "Flutter", "Kotlin", "Swift", "API Integration", "CI/CD", "Terraform", "Ansible", "Jenkins", "Scripting", "Database Management", "Performance Tuning", "Backup and Recovery", "Security", "ETL", "Data Warehousing", "Spark", "Kafka", "Airflow", "Infrastructure as Code",
    # Business & Finance
    "Financial Modeling", "Accounting", "Auditing", "Investment Banking", "Risk Management", "Business Development", "Sales", "CRM", "ERP Systems", "Market Research", "Excel", "QuickBooks", "Financial Reporting", "Tax Preparation", "Negotiation", "Requirements Gathering", "Business Process Modeling",
    # Marketing & Creative
    "SEO", "SEM", "Content Marketing", "Social Media Marketing", "Email Marketing", "Adobe Creative Suite", "Graphic Design", "Video Editing", "Copywriting", "Figma", "Google Analytics", "Campaign Management", "Typography", "Branding", "Visual Communication", "Editorial Planning",
    # Healthcare
    "Electronic Health Records (EHR)", "Medical Billing", "Patient Care", "Clinical Research", "Healthcare Data Analysis", "HIPAA", "Medical Devices", "Biomaterials",
    # Engineering
    "AutoCAD", "SolidWorks", "MATLAB", "Project Management", "Quality Assurance", "Lean Manufacturing", "Civil 3D", "Structural Analysis", "Geotechnical Engineering", "Circuit Design", "PLC Programming", "Power Systems",
    # HR & Customer Service
    "Recruiting", "Employee Relations", "HR Policies", "Onboarding", "Performance Management", "HRIS", "Customer Service", "Patience", "Product Knowledge", "Ticketing Systems",
    # Game Development
    "Unity", "Unreal Engine", "Game Design", "3D Modeling", "C#", "Physics Engine",
    # QA & Writing
    "Test Planning", "Test Cases", "Automation Testing", "Selenium", "JIRA", "Documentation", "API Documentation", "Markdown", "Technical Acumen",
    # Soft Skills (Common across many roles)
    "Communication", "Teamwork", "Leadership", "Problem-Solving", "Time Management", "Adaptability", "Critical Thinking"
]

# --- 2. Define Job Roles and Their Core Skills ---
# This dictionary establishes the logical link between skills and job roles.
# Each role has 'primary' (essential) and 'secondary' (common but not essential) skills.
JOB_ROLE_SKILLS = {
    # Original Roles
    "Software Engineer": {
        "primary": ["Python", "Java", "JavaScript", "Git", "SQL", "Problem-Solving", "Teamwork"],
        "secondary": ["C++", "AWS", "Docker", "Kubernetes", "React", "Node.js", "NoSQL", "Communication"]
    },
    "Data Scientist": {
        "primary": ["Python", "Machine Learning", "Deep Learning", "Data Analysis", "SQL", "Problem-Solving", "Critical Thinking"],
        "secondary": ["TensorFlow", "PyTorch", "AWS", "Spark", "Communication"]
    },
    "Product Manager": {
        "primary": ["Project Management", "Agile", "Scrum", "Market Research", "Leadership", "Communication"],
        "secondary": ["JIRA", "Data Analysis", "Problem-Solving", "Adaptability"]
    },
    "Marketing Manager": {
        "primary": ["SEO", "SEM", "Content Marketing", "Social Media Marketing", "Google Analytics", "Campaign Management"],
        "secondary": ["Email Marketing", "Copywriting", "CRM", "Leadership", "Communication"]
    },
    "Financial Analyst": {
        "primary": ["Financial Modeling", "Accounting", "Excel", "Data Analysis", "Risk Management", "Critical Thinking"],
        "secondary": ["Auditing", "SQL", "Communication"]
    },
    "UX/UI Designer": {
        "primary": ["Figma", "UI/UX Design", "Wireframing", "Prototyping", "User Research", "Adobe Creative Suite"],
        "secondary": ["Graphic Design", "Adaptability", "Teamwork", "Communication"]
    },
    "DevOps Engineer": {
        "primary": ["AWS", "Docker", "Kubernetes", "CI/CD", "Linux", "Scripting", "Problem-Solving"],
        "secondary": ["Azure", "Google Cloud", "Terraform", "Ansible", "Jenkins", "Networking", "Teamwork"]
    },
    "Cybersecurity Analyst": {
        "primary": ["Cybersecurity", "Networking", "Firewalls", "SIEM", "Problem-Solving"],
        "secondary": ["Penetration Testing", "Cryptography", "Python", "Linux", "Risk Management", "Communication"]
    },
    "Mechanical Engineer": {
        "primary": ["AutoCAD", "SolidWorks", "MATLAB", "Project Management", "Quality Assurance", "Problem-Solving"],
        "secondary": ["Lean Manufacturing", "Teamwork", "Time Management"]
    },
    "Registered Nurse": {
        "primary": ["Patient Care", "Electronic Health Records (EHR)", "HIPAA", "Communication", "Teamwork"],
        "secondary": ["Medical Billing", "Clinical Research", "Adaptability", "Problem-Solving"]
    },
    # Expanded Roles
    "Human Resources Manager": {
        "primary": ["Recruiting", "Employee Relations", "HR Policies", "Communication", "Leadership", "Problem-Solving"],
        "secondary": ["Onboarding", "Performance Management", "HRIS", "Time Management"]
    },
    "Customer Service Representative": {
        "primary": ["Customer Service", "Communication", "Problem-Solving", "Patience", "Product Knowledge"],
        "secondary": ["CRM", "Time Management", "Adaptability", "Teamwork"]
    },
    "Business Analyst": {
        "primary": ["Data Analysis", "SQL", "Requirements Gathering", "Communication", "Problem-Solving", "Critical Thinking"],
        "secondary": ["Project Management", "JIRA", "Business Process Modeling"]
    },
    "Database Administrator (DBA)": {
        "primary": ["SQL", "NoSQL", "Database Management", "Performance Tuning", "Backup and Recovery", "Security"],
        "secondary": ["Python", "Linux", "AWS", "Azure", "Scripting"]
    },
    "Network Engineer": {
        "primary": ["Network Administration", "Firewalls", "Routing", "Switching", "Problem-Solving"],
        "secondary": ["Cybersecurity", "Linux", "Scripting", "Cisco", "Juniper"]
    },
    "Systems Administrator": {
        "primary": ["Linux", "Windows Server", "Active Directory", "Virtualization", "Scripting", "Problem-Solving"],
        "secondary": ["Networking", "Security", "AWS", "Azure", "Backup and Recovery"]
    },
    "Mobile App Developer": {
        "primary": ["Kotlin", "Swift", "Java", "React Native", "API Integration", "Git"],
        "secondary": ["Flutter", "UI/UX Design", "Problem-Solving", "Teamwork"]
    },
    "Game Developer": {
        "primary": ["C++", "C#", "Unity", "Unreal Engine", "Game Design", "3D Modeling", "Problem-Solving"],
        "secondary": ["Teamwork", "Physics Engine", "Git"]
    },
    "QA Engineer": {
        "primary": ["Quality Assurance", "Test Planning", "Test Cases", "Automation Testing", "JIRA", "Critical Thinking"],
        "secondary": ["Selenium", "SQL", "Communication", "Problem-Solving"]
    },
    "Technical Writer": {
        "primary": ["Communication", "Copywriting", "Documentation", "API Documentation", "Markdown", "Technical Acumen"],
        "secondary": ["Git", "JIRA", "Critical Thinking"]
    },
    "Accountant": {
        "primary": ["Accounting", "Auditing", "Excel", "QuickBooks", "Financial Reporting", "Critical Thinking"],
        "secondary": ["Tax Preparation", "Risk Management"]
    },
    "Sales Representative": {
        "primary": ["Sales", "CRM", "Communication", "Negotiation", "Business Development", "Product Knowledge"],
        "secondary": ["Time Management", "Adaptability"]
    },
    "Graphic Designer": {
        "primary": ["Adobe Creative Suite", "Graphic Design", "Typography", "Branding", "Visual Communication", "Figma"],
        "secondary": ["Video Editing", "Communication", "Teamwork"]
    },
    "Data Engineer": {
        "primary": ["Python", "SQL", "ETL", "Data Warehousing", "Spark", "AWS", "Problem-Solving"],
        "secondary": ["NoSQL", "Kafka", "Airflow", "Docker", "Data Analysis"]
    },
    "Cloud Engineer": {
        "primary": ["AWS", "Azure", "Google Cloud", "Infrastructure as Code", "Terraform", "Kubernetes", "Docker"],
        "secondary": ["Networking", "Security", "Python", "Linux", "CI/CD"]
    },
    "IT Support Specialist": {
        "primary": ["Problem-Solving", "Communication", "Customer Service", "Hardware Troubleshooting", "Software Installation"],
        "secondary": ["Active Directory", "Networking", "Ticketing Systems", "Security"]
    },
    "Civil Engineer": {
        "primary": ["AutoCAD", "Civil 3D", "Project Management", "Structural Analysis", "Geotechnical Engineering"],
        "secondary": ["MATLAB", "Problem-Solving", "Teamwork"]
    },
    "Electrical Engineer": {
        "primary": ["Circuit Design", "MATLAB", "AutoCAD", "PLC Programming", "Power Systems", "Problem-Solving"],
        "secondary": ["Project Management", "Teamwork"]
    },
    "Biomedical Engineer": {
        "primary": ["SolidWorks", "MATLAB", "Medical Devices", "Biomaterials", "Clinical Research", "Quality Assurance"],
        "secondary": ["Problem-Solving", "Teamwork"]
    },
    "Content Strategist": {
        "primary": ["Content Marketing", "SEO", "Copywriting", "Editorial Planning", "Google Analytics", "Communication"],
        "secondary": ["Social Media Marketing", "Project Management"]
    }
}

def generate_applicant_profile(job_role):
    """
    Generates a single, more accurate row for the dataset with skill variability.
    """
    profile = {skill: 0 for skill in ALL_SKILLS}
    
    # Handle cases where a job role might not be in the dictionary
    if job_role not in JOB_ROLE_SKILLS:
        return None
        
    skills_for_role = JOB_ROLE_SKILLS[job_role]

    # 1. Add ALL primary skills to establish a strong baseline for the role.
    for skill in skills_for_role["primary"]:
        if skill in profile:
            profile[skill] = 1

    # 2. Add a variable number of secondary skills to create realistic variety.
    # This simulates different levels of experience or specialization.
    if skills_for_role["secondary"]:
        num_secondary_to_add = int(len(skills_for_role["secondary"]) * random.uniform(0.25, 0.75))
        secondary_skills_to_add = random.sample(skills_for_role["secondary"], num_secondary_to_add)
        for skill in secondary_skills_to_add:
            if skill in profile:
                profile[skill] = 1
    
    # 3. Add common soft skills to make profiles more realistic.
    # Every role gets a few common soft skills.
    common_soft_skills = ["Communication", "Teamwork", "Problem-Solving", "Adaptability", "Time Management", "Critical Thinking"]
    num_soft_skills_to_add = random.randint(2, 4)
    soft_skills_to_add = random.sample(common_soft_skills, num_soft_skills_to_add)
    for skill in soft_skills_to_add:
        # Ensure the skill exists in the main list before assigning
        if skill in profile:
            profile[skill] = 1

    # 4. Add the target variable
    profile["Job_Role"] = job_role

    return profile

def create_dataset(num_rows=15000):
    """
    Creates the full dataset with the specified number of rows.
    """
    print(f"Generating dataset with {num_rows} rows...")
    dataset = []
    job_roles = list(JOB_ROLE_SKILLS.keys())

    for i in range(num_rows):
        if (i + 1) % 1000 == 0:
            print(f"  ...generated {i+1} rows")
        
        # Choose a job role randomly to ensure a balanced dataset
        random_role = random.choice(job_roles)
        applicant_profile = generate_applicant_profile(random_role)
        if applicant_profile:
            dataset.append(applicant_profile)
    
    print("Dataset generation complete.")
    return pd.DataFrame(dataset)

if __name__ == "__main__":
    # --- Generate and Save the Dataset ---
    # Generate a number of rows between 10,000 and 20,000
    number_of_entries = random.randint(10000, 20000)
    
    df = create_dataset(num_rows=number_of_entries)

    # Reorder columns to have "Job_Role" at the end and ensure all skills are present
    final_cols = [col for col in ALL_SKILLS if col in df.columns] + ['Job_Role']
    df = df[final_cols]
    
    # Save to CSV
    output_filename = 'job_skills_dataset.csv'
    df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully created '{output_filename}' with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Total unique job roles: {df['Job_Role'].nunique()}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())


Generating dataset with 17512 rows...
  ...generated 1000 rows
  ...generated 2000 rows
  ...generated 3000 rows
  ...generated 4000 rows
  ...generated 5000 rows
  ...generated 6000 rows
  ...generated 7000 rows
  ...generated 8000 rows
  ...generated 9000 rows
  ...generated 10000 rows
  ...generated 11000 rows
  ...generated 12000 rows
  ...generated 13000 rows
  ...generated 14000 rows
  ...generated 15000 rows
  ...generated 16000 rows
  ...generated 17000 rows
Dataset generation complete.

Successfully created 'job_skills_dataset.csv' with 17512 rows and 129 columns.
Total unique job roles: 30

First 5 rows of the dataset:
   Python  Java  C++  JavaScript  AWS  Azure  Google Cloud  Docker  \
0       0     0    0           0    0      0             0       0   
1       0     0    0           0    0      0             0       0   
2       0     0    0           0    0      0             0       0   
3       0     0    0           0    0      0             0       0   
4       0    

In [2]:
# Save to CSV
output_filename = 'job_skills_dataset.csv'
df.to_csv(output_filename, index=False)

## Attempt 2 at creating a large and cleaner dataset


In [1]:
import pandas as pd
import random
import os

# --- 1. Expanded Skill List (Features) ---
# A much larger and more granular list of skills to support 100 roles.
ALL_SKILLS = [
    # Technology
    "Python", "Java", "JavaScript", "C#", "C++", "Go", "Ruby", "PHP", "Swift", "Kotlin", "TypeScript", "SQL", "NoSQL", "PostgreSQL", "MongoDB", "Redis", "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring Boot", ".NET", "AWS", "Azure", "Google Cloud Platform (GCP)", "Docker", "Kubernetes", "Terraform", "Ansible", "Jenkins", "Git", "CI/CD", "Linux", "Scripting", "REST APIs",
    # Data Science & ML
    "Machine Learning", "Deep Learning", "Natural Language Processing (NLP)", "Computer Vision", "Data Analysis", "Statistics", "Pandas", "NumPy", "Scikit-learn", "TensorFlow", "PyTorch", "Keras", "Spark", "Hadoop", "Data Warehousing", "ETL", "Tableau", "Power BI", "A/B Testing",
    # Cybersecurity
    "Cybersecurity", "Network Security", "Penetration Testing", "SIEM", "Cryptography", "Firewalls", "Intrusion Detection", "Ethical Hacking", "Malware Analysis", "Security Auditing",
    # Design & Creative
    "UI/UX Design", "User Research", "Wireframing", "Prototyping", "Figma", "Sketch", "Adobe XD", "Adobe Creative Suite", "Graphic Design", "Illustration", "Video Editing", "Motion Graphics", "After Effects", "Premiere Pro", "Photography", "Copywriting", "Branding", "Typography",
    # Business & Finance
    "Financial Modeling", "Accounting", "Auditing", "Investment Banking", "Risk Management", "Business Development", "Sales", "CRM", "Salesforce", "ERP Systems", "SAP", "Market Research", "Excel", "QuickBooks", "Financial Reporting", "Tax Law", "Compliance", "Actuarial Science", "Quantitative Analysis", "Derivatives", "Wealth Management",
    # Marketing
    "Digital Marketing", "SEO", "SEM", "Content Marketing", "Social Media Marketing", "Email Marketing", "Google Analytics", "Google Ads", "HubSpot", "Marketing Automation", "PPC",
    # Engineering (Physical)
    "AutoCAD", "SolidWorks", "MATLAB", "Revit", "Civil Engineering", "Structural Engineering", "Mechanical Engineering", "Electrical Engineering", "Circuit Design", "PLC Programming", "HVAC", "Lean Manufacturing", "Six Sigma", "Quality Assurance",
    # Healthcare
    "Electronic Health Records (EHR)", "HIPAA", "Patient Care", "Medical Billing", "Pharmacology", "Clinical Trials", "Medical Imaging", "Public Health", "Epidemiology", "Genetics", "Nursing", "Surgical Procedures",
    # HR, Legal & Admin
    "Recruiting", "Human Resources (HR)", "Employee Relations", "Onboarding", "Labor Law", "Contract Law", "Litigation", "Legal Research", "Paralegal", "Executive Assistance", "Office Management", "Event Planning", "Scheduling",
    # Science & Academia
    "Scientific Research", "Lab Techniques", "Bioinformatics", "Chemistry", "Physics", "Grant Writing", "Teaching", "Curriculum Development", "Higher Education", "Statistical Analysis Software (SAS)",
    # Other Professional
    "Project Management", "Agile", "Scrum", "JIRA", "Supply Chain Management", "Logistics", "Procurement", "Real Estate", "Architecture", "Construction Management", "Journalism", "Public Relations", "Technical Writing", "Customer Support", "Zendesk",
    # Soft Skills
    "Communication", "Teamwork", "Leadership", "Problem-Solving", "Time Management", "Adaptability", "Critical Thinking", "Creativity", "Negotiation", "Empathy", "Patience", "Attention to Detail"
]

# --- 2. Define 100 Job Roles with Distinct Skill Sets ---
JOB_ROLE_SKILLS = {
    # Tech
    "Software Engineer": {"primary": ["Java", "Spring Boot", "SQL"], "secondary": ["Python", "Git", "Docker", "AWS"], "soft": ["Problem-Solving", "Teamwork"]},
    "Frontend Developer": {"primary": ["JavaScript", "React", "TypeScript", "CSS"], "secondary": ["Node.js", "Git", "Figma"], "soft": ["Attention to Detail", "Creativity"]},
    "Backend Developer": {"primary": ["Node.js", "Go", "PostgreSQL", "REST APIs"], "secondary": ["Python", "Docker", "Kubernetes", "GCP"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Full-Stack Developer": {"primary": ["React", "Node.js", "SQL", "JavaScript"], "secondary": ["AWS", "Docker", "Git", "TypeScript"], "soft": ["Adaptability", "Time Management"]},
    "Mobile App Developer": {"primary": ["Swift", "Kotlin", "React Native"], "secondary": ["Git", "REST APIs", "UI/UX Design"], "soft": ["Patience", "Attention to Detail"]},
    "DevOps Engineer": {"primary": ["AWS", "Docker", "Kubernetes", "CI/CD", "Terraform"], "secondary": ["Linux", "Scripting", "Python", "Ansible"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cloud Engineer": {"primary": ["AWS", "Azure", "GCP", "Infrastructure as Code"], "secondary": ["Terraform", "Kubernetes", "Networking"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Data Scientist": {"primary": ["Machine Learning", "Python", "Statistics", "SQL"], "secondary": ["Pandas", "Scikit-learn", "TensorFlow", "Deep Learning"], "soft": ["Critical Thinking", "Communication"]},
    "Data Analyst": {"primary": ["SQL", "Tableau", "Power BI", "Excel", "Data Analysis"], "secondary": ["Python", "Statistics", "Pandas"], "soft": ["Attention to Detail", "Communication"]},
    "Data Engineer": {"primary": ["ETL", "Data Warehousing", "Spark", "SQL", "Python"], "secondary": ["AWS", "Hadoop", "Kafka"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Machine Learning Engineer": {"primary": ["TensorFlow", "PyTorch", "Deep Learning", "Python"], "secondary": ["Kubernetes", "AWS", "Scikit-learn"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cybersecurity Analyst": {"primary": ["Cybersecurity", "SIEM", "Network Security", "Firewalls"], "secondary": ["Penetration Testing", "Linux", "Scripting"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Penetration Tester": {"primary": ["Ethical Hacking", "Penetration Testing", "Metasploit"], "secondary": ["Scripting", "Linux", "Cybersecurity"], "soft": ["Creativity", "Problem-Solving"]},
    "Database Administrator (DBA)": {"primary": ["PostgreSQL", "MongoDB", "Database Management", "Performance Tuning"], "secondary": ["SQL", "NoSQL", "Backup and Recovery"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Systems Administrator": {"primary": ["Linux", "Windows Server", "Networking", "Scripting"], "secondary": ["Active Directory", "Virtualization", "Security"], "soft": ["Problem-Solving", "Patience"]},
    "Network Engineer": {"primary": ["Cisco", "Juniper", "Routing", "Switching", "Network Security"], "secondary": ["Firewalls", "Linux", "Scripting"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "QA Engineer": {"primary": ["Quality Assurance", "Test Planning", "Automation Testing", "Selenium"], "secondary": ["JIRA", "SQL", "CI/CD"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Game Developer": {"primary": ["C++", "C#", "Unity", "Unreal Engine"], "secondary": ["3D Modeling", "Game Design", "Physics Engine"], "soft": ["Creativity", "Problem-Solving"]},
    "WordPress Developer": {"primary": ["PHP", "WordPress", "MySQL", "JavaScript"], "secondary": ["HTML", "CSS", "SEO"], "soft": ["Problem-Solving", "Time Management"]},
    "Salesforce Developer": {"primary": ["Salesforce", "Apex", "Lightning Web Components"], "secondary": ["CRM", "SQL", "JavaScript"], "soft": ["Problem-Solving", "Communication"]},
    # Design
    "UX/UI Designer": {"primary": ["Figma", "UI/UX Design", "Wireframing", "Prototyping"], "secondary": ["User Research", "Adobe XD", "Sketch"], "soft": ["Empathy", "Creativity"]},
    "Graphic Designer": {"primary": ["Adobe Creative Suite", "Illustration", "Typography", "Branding"], "secondary": ["Graphic Design", "Figma", "Photography"], "soft": ["Creativity", "Attention to Detail"]},
    "Video Editor": {"primary": ["Premiere Pro", "After Effects", "Video Editing", "Motion Graphics"], "secondary": ["Storytelling", "Color Grading"], "soft": ["Creativity", "Patience"]},
    "Animator": {"primary": ["Maya", "Blender", "Animation", "Character Design"], "secondary": ["After Effects", "Storyboarding"], "soft": ["Creativity", "Patience"]},
    "Architect": {"primary": ["AutoCAD", "Revit", "Architecture", "Building Codes"], "secondary": ["SketchUp", "Project Management"], "soft": ["Creativity", "Problem-Solving"]},
    "Interior Designer": {"primary": ["AutoCAD", "SketchUp", "Interior Design", "Space Planning"], "secondary": ["Revit", "Client Management"], "soft": ["Creativity", "Communication"]},
    "Fashion Designer": {"primary": ["Adobe Illustrator", "Pattern Making", "Textiles", "Fashion Design"], "secondary": ["Trend Forecasting", "Sewing"], "soft": ["Creativity", "Attention to Detail"]},
    "Industrial Designer": {"primary": ["SolidWorks", "Keyshot", "Product Design", "Prototyping"], "secondary": ["AutoCAD", "Manufacturing Processes"], "soft": ["Creativity", "Problem-Solving"]},
    # Business
    "Product Manager": {"primary": ["Agile", "Scrum", "Roadmap", "Market Research"], "secondary": ["JIRA", "Data Analysis", "User Stories"], "soft": ["Leadership", "Communication"]},
    "Project Manager": {"primary": ["Project Management", "JIRA", "Scrum", "Budgeting"], "secondary": ["Agile", "Risk Management", "Gantt Charts"], "soft": ["Leadership", "Time Management"]},
    "Business Analyst": {"primary": ["Requirements Gathering", "SQL", "Data Analysis", "Business Process Modeling"], "secondary": ["JIRA", "Tableau", "Communication"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Management Consultant": {"primary": ["Strategy", "Data Analysis", "Client Management", "PowerPoint"], "secondary": ["Financial Modeling", "Market Research"], "soft": ["Communication", "Problem-Solving"]},
    "Financial Analyst": {"primary": ["Financial Modeling", "Excel", "Valuation", "Accounting"], "secondary": ["SQL", "Bloomberg Terminal"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Accountant": {"primary": ["Accounting", "QuickBooks", "Excel", "Auditing"], "secondary": ["Tax Preparation", "Financial Reporting"], "soft": ["Attention to Detail", "Integrity"]},
    "Investment Banker": {"primary": ["Investment Banking", "Financial Modeling", "Mergers & Acquisitions"], "secondary": ["Valuation", "Excel", "Pitch Books"], "soft": ["Negotiation", "Time Management"]},
    "Sales Manager": {"primary": ["Sales", "CRM", "Salesforce", "Leadership"], "secondary": ["Negotiation", "Business Development"], "soft": ["Communication", "Leadership"]},
    "Marketing Manager": {"primary": ["Digital Marketing", "SEO", "Content Marketing", "Campaign Management"], "secondary": ["Google Analytics", "Social Media Marketing"], "soft": ["Leadership", "Creativity"]},
    "Digital Marketer": {"primary": ["SEO", "SEM", "Google Ads", "Social Media Marketing"], "secondary": ["Content Marketing", "Email Marketing"], "soft": ["Adaptability", "Creativity"]},
    "Content Strategist": {"primary": ["Content Marketing", "SEO", "Copywriting", "Editorial Planning"], "secondary": ["Google Analytics", "Social Media Marketing"], "soft": ["Creativity", "Communication"]},
    "HR Manager": {"primary": ["Human Resources (HR)", "Employee Relations", "Recruiting", "Labor Law"], "secondary": ["Onboarding", "Performance Management"], "soft": ["Empathy", "Leadership"]},
    "Recruiter": {"primary": ["Recruiting", "Sourcing", "Interviewing", "LinkedIn Recruiter"], "secondary": ["HR", "Negotiation"], "soft": ["Communication", "Patience"]},
    "Supply Chain Manager": {"primary": ["Supply Chain Management", "Logistics", "Procurement", "Inventory Management"], "secondary": ["SAP", "ERP Systems"], "soft": ["Problem-Solving", "Negotiation"]},
    "Operations Manager": {"primary": ["Operations Management", "Process Improvement", "Budgeting"], "secondary": ["Lean Manufacturing", "Project Management"], "soft": ["Leadership", "Problem-Solving"]},
    "Real Estate Agent": {"primary": ["Real Estate", "Sales", "Negotiation", "Property Law"], "secondary": ["CRM", "Marketing"], "soft": ["Communication", "Patience"]},
    "Executive Assistant": {"primary": ["Scheduling", "Office Management", "Travel Coordination", "Microsoft Office"], "secondary": ["Event Planning", "Communication"], "soft": ["Time Management", "Adaptability"]},
    "Customer Support Specialist": {"primary": ["Customer Support", "Zendesk", "Communication", "Problem-Solving"], "secondary": ["CRM", "Patience"], "soft": ["Empathy", "Patience"]},
    # Engineering
    "Mechanical Engineer": {"primary": ["SolidWorks", "AutoCAD", "Mechanical Engineering", "Thermodynamics"], "secondary": ["MATLAB", "Finite Element Analysis"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Electrical Engineer": {"primary": ["Circuit Design", "MATLAB", "Electrical Engineering", "Power Systems"], "secondary": ["AutoCAD", "PLC Programming"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Civil Engineer": {"primary": ["Civil Engineering", "AutoCAD", "Structural Engineering", "Revit"], "secondary": ["Geotechnical Engineering", "Construction Management"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Chemical Engineer": {"primary": ["Chemical Engineering", "Process Simulation", "Aspen HYSYS"], "secondary": ["Thermodynamics", "Fluid Mechanics"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Aerospace Engineer": {"primary": ["Aerospace Engineering", "Aerodynamics", "MATLAB", "CATIA"], "secondary": ["SolidWorks", "Propulsion Systems"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Biomedical Engineer": {"primary": ["Biomedical Engineering", "Medical Devices", "SolidWorks", "FDA Regulations"], "secondary": ["MATLAB", "Biomaterials"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Environmental Engineer": {"primary": ["Environmental Engineering", "Water Quality", "GIS", "AutoCAD"], "secondary": ["Hydrology", "Waste Management"], "soft": ["Problem-Solving", "Critical Thinking"]},
    # Healthcare
    "Physician": {"primary": ["Medicine", "Diagnosis", "Patient Care", "Pharmacology"], "secondary": ["EHR", "Medical Ethics"], "soft": ["Empathy", "Communication"]},
    "Registered Nurse": {"primary": ["Nursing", "Patient Care", "EHR", "HIPAA"], "secondary": ["Medication Administration", "Wound Care"], "soft": ["Empathy", "Patience"]},
    "Pharmacist": {"primary": ["Pharmacology", "Medication Dispensing", "Patient Counseling"], "secondary": ["Pharmacy Law", "Chemistry"], "soft": ["Attention to Detail", "Communication"]},
    "Dentist": {"primary": ["Dentistry", "Oral Surgery", "X-Rays", "Patient Care"], "secondary": ["Cosmetic Dentistry", "Orthodontics"], "soft": ["Attention to Detail", "Patience"]},
    "Physical Therapist": {"primary": ["Physical Therapy", "Rehabilitation", "Anatomy", "Kinesiology"], "secondary": ["Patient Care", "EHR"], "soft": ["Empathy", "Patience"]},
    "Veterinarian": {"primary": ["Veterinary Medicine", "Animal Surgery", "Animal Husbandry"], "secondary": ["Pharmacology", "Radiology"], "soft": ["Empathy", "Communication"]},
    "Medical Lab Scientist": {"primary": ["Laboratory Techniques", "Microbiology", "Hematology", "Chemistry"], "secondary": ["Quality Control", "LIMS"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Radiologic Technologist": {"primary": ["Medical Imaging", "X-Rays", "MRI", "CT Scan"], "secondary": ["Patient Care", "Anatomy"], "soft": ["Attention to Detail", "Patience"]},
    "Public Health Official": {"primary": ["Public Health", "Epidemiology", "Health Policy", "Statistics"], "secondary": ["Grant Writing", "Community Outreach"], "soft": ["Communication", "Critical Thinking"]},
    # Legal
    "Lawyer": {"primary": ["Litigation", "Contract Law", "Legal Research", "Negotiation"], "secondary": ["Corporate Law", "Torts"], "soft": ["Critical Thinking", "Communication"]},
    "Paralegal": {"primary": ["Paralegal", "Legal Research", "Document Drafting", "Westlaw"], "secondary": ["Litigation Support", "Case Management"], "soft": ["Attention to Detail", "Time Management"]},
    "Compliance Officer": {"primary": ["Compliance", "Regulatory Affairs", "Risk Management", "Auditing"], "secondary": ["Legal Research", "Corporate Law"], "soft": ["Attention to Detail", "Integrity"]},
    # Science & Academia
    "Research Scientist": {"primary": ["Scientific Research", "Data Analysis", "Lab Techniques", "Grant Writing"], "secondary": ["Python", "Statistics", "Publishing"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Biologist": {"primary": ["Biology", "Genetics", "Lab Techniques", "Microscopy"], "secondary": ["Bioinformatics", "Ecology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Chemist": {"primary": ["Chemistry", "Spectroscopy", "Chromatography", "Lab Techniques"], "secondary": ["Organic Chemistry", "Analytical Chemistry"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Physicist": {"primary": ["Physics", "Quantum Mechanics", "MATLAB", "Data Analysis"], "secondary": ["Optics", "Thermodynamics"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Geologist": {"primary": ["Geology", "Field Mapping", "GIS", "Petrology"], "secondary": ["Seismology", "Hydrology"], "soft": ["Problem-Solving", "Adaptability"]},
    "Professor": {"primary": ["Teaching", "Higher Education", "Curriculum Development", "Scientific Research"], "secondary": ["Grant Writing", "Public Speaking"], "soft": ["Communication", "Leadership"]},
    "Librarian": {"primary": ["Library Science", "Cataloging", "Research Assistance", "Database Management"], "secondary": ["Information Literacy", "Archiving"], "soft": ["Patience", "Attention to Detail"]},
    # Creative & Media
    "Journalist": {"primary": ["Journalism", "Interviewing", "Copywriting", "Fact-Checking"], "secondary": ["SEO", "Social Media"], "soft": ["Communication", "Critical Thinking"]},
    "Public Relations Specialist": {"primary": ["Public Relations", "Media Outreach", "Press Releases", "Crisis Management"], "secondary": ["Social Media Marketing", "Event Planning"], "soft": ["Communication", "Adaptability"]},
    "Technical Writer": {"primary": ["Technical Writing", "Documentation", "API Documentation", "Markdown"], "secondary": ["Git", "JIRA"], "soft": ["Attention to Detail", "Communication"]},
    "Photographer": {"primary": ["Photography", "Adobe Lightroom", "Adobe Photoshop", "Lighting"], "secondary": ["Client Management", "Photo Retouching"], "soft": ["Creativity", "Patience"]},
    "Musician": {"primary": ["Music Theory", "Instrument Performance", "Composition"], "secondary": ["DAW", "Audio Engineering"], "soft": ["Creativity", "Discipline"]},
    "Actor": {"primary": ["Acting", "Improvisation", "Script Analysis", "Voice and Diction"], "secondary": ["Auditioning", "Stage Combat"], "soft": ["Empathy", "Communication"]},
    # Skilled Trades & Others
    "Chef": {"primary": ["Culinary Arts", "Menu Development", "Food Safety", "Kitchen Management"], "secondary": ["Inventory Management", "Cost Control"], "soft": ["Creativity", "Time Management"]},
    "Electrician": {"primary": ["Electrical Wiring", "National Electrical Code (NEC)", "Blueprint Reading"], "secondary": ["Troubleshooting", "Safety Procedures"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Plumber": {"primary": ["Plumbing", "Pipefitting", "Blueprint Reading", "Drain Cleaning"], "secondary": ["HVAC", "Gas Fitting"], "soft": ["Problem-Solving", "Patience"]},
    "Carpenter": {"primary": ["Carpentry", "Woodworking", "Blueprint Reading", "Framing"], "secondary": ["Cabinet Making", "Finish Work"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Welder": {"primary": ["Welding", "MIG", "TIG", "Blueprint Reading"], "secondary": ["Fabrication", "Grinding"], "soft": ["Attention to Detail", "Patience"]},
    "Construction Manager": {"primary": ["Construction Management", "Project Management", "Budgeting", "OSHA"], "secondary": ["Blueprint Reading", "Contract Negotiation"], "soft": ["Leadership", "Problem-Solving"]},
    "Pilot": {"primary": ["Piloting", "Flight Planning", "Navigation", "FAA Regulations"], "secondary": ["Aerodynamics", "Meteorology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Air Traffic Controller": {"primary": ["Air Traffic Control", "Radar Operation", "Communication", "FAA Regulations"], "secondary": ["Meteorology", "Emergency Procedures"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Translator": {"primary": ["Bilingual", "Translation", "Proofreading", "Localization"], "secondary": ["Cultural Awareness", "CAT Tools"], "soft": ["Attention to Detail", "Communication"]},
    "Event Planner": {"primary": ["Event Planning", "Budgeting", "Vendor Management", "Negotiation"], "secondary": ["Marketing", "Logistics"], "soft": ["Time Management", "Adaptability"]},
    "Fitness Trainer": {"primary": ["Personal Training", "Anatomy", "Kinesiology", "Nutrition"], "secondary": ["Client Management", "CPR/AED"], "soft": ["Communication", "Empathy"]},
    "Social Worker": {"primary": ["Social Work", "Case Management", "Counseling", "Crisis Intervention"], "secondary": ["Psychology", "Community Resources"], "soft": ["Empathy", "Patience"]},
    "Urban Planner": {"primary": ["Urban Planning", "GIS", "Zoning Laws", "Community Development"], "secondary": ["Data Analysis", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},
}


def generate_applicant_profile(job_role):
    """
    Generates a single, cleaner row for the dataset with less skill overlap.
    """
    profile = {skill: 0 for skill in ALL_SKILLS}
    
    if job_role not in JOB_ROLE_SKILLS:
        return None
        
    skills_for_role = JOB_ROLE_SKILLS[job_role]

    # 1. Add ALL primary skills to create a strong, clear signal.
    for skill in skills_for_role["primary"]:
        if skill in profile:
            profile[skill] = 1

    # 2. Add a controlled, smaller number of secondary skills for realistic variance.
    if skills_for_role.get("secondary"):
        num_secondary_to_add = int(len(skills_for_role["secondary"]) * random.uniform(0.3, 0.6))
        secondary_skills_to_add = random.sample(skills_for_role["secondary"], num_secondary_to_add)
        for skill in secondary_skills_to_add:
            if skill in profile:
                profile[skill] = 1
    
    # 3. Add the specifically defined soft skills for the role.
    if skills_for_role.get("soft"):
        for skill in skills_for_role["soft"]:
            if skill in profile:
                profile[skill] = 1

    # 4. Add the target variable
    profile["Job_Role"] = job_role

    return profile

def create_dataset(num_rows=50000):
    """
    Creates the full dataset with the specified number of rows.
    """
    print(f"Generating dataset with {num_rows} rows and {len(JOB_ROLE_SKILLS)} unique job roles...")
    dataset = []
    job_roles = list(JOB_ROLE_SKILLS.keys())

    for i in range(num_rows):
        if (i + 1) % 5000 == 0:
            print(f"  ...generated {i+1} rows")
        
        random_role = random.choice(job_roles)
        applicant_profile = generate_applicant_profile(random_role)
        if applicant_profile:
            dataset.append(applicant_profile)
    
    print("Dataset generation complete.")
    return pd.DataFrame(dataset)

if __name__ == "__main__":
    # --- Generate and Save the Dataset ---
    df = create_dataset(num_rows=50000)

    # Reorder columns to have "Job_Role" at the end
    final_cols = [col for col in ALL_SKILLS if col in df.columns] + ['Job_Role']
    df = df[final_cols]
    
    # Save to CSV
    output_filename = 'job_skills_dataset_large.csv'
    df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully created '{output_filename}' with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Total unique job roles: {df['Job_Role'].nunique()}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())


Generating dataset with 50000 rows and 91 unique job roles...
  ...generated 5000 rows
  ...generated 10000 rows
  ...generated 15000 rows
  ...generated 20000 rows
  ...generated 25000 rows
  ...generated 30000 rows
  ...generated 35000 rows
  ...generated 40000 rows
  ...generated 45000 rows
  ...generated 50000 rows
Dataset generation complete.

Successfully created 'job_skills_dataset_large.csv' with 50000 rows and 193 columns.
Total unique job roles: 91

First 5 rows of the dataset:
   Python  Java  JavaScript  C#  C++  Go  Ruby  PHP  Swift  Kotlin  ...  \
0       0     0           0   0    0   0     0    0      0       0  ...   
1       0     0           0   0    0   0     0    0      0       0  ...   
2       0     0           0   0    0   0     0    0      0       0  ...   
3       0     0           0   0    0   0     0    0      0       0  ...   
4       0     0           0   0    0   0     0    0      0       0  ...   

   Problem-Solving  Time Management  Adaptability  Criti

## Attempt 3 weighted dataset

In [2]:
import pandas as pd
import random
import os

# --- 1. Use the same expanded skill list ---
ALL_SKILLS = [
    # Technology
    "Python", "Java", "JavaScript", "C#", "C++", "Go", "Ruby", "PHP", "Swift", "Kotlin", "TypeScript", "SQL", "NoSQL", "PostgreSQL", "MongoDB", "Redis", "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring Boot", ".NET", "AWS", "Azure", "Google Cloud Platform (GCP)", "Docker", "Kubernetes", "Terraform", "Ansible", "Jenkins", "Git", "CI/CD", "Linux", "Scripting", "REST APIs",
    # Data Science & ML
    "Machine Learning", "Deep Learning", "Natural Language Processing (NLP)", "Computer Vision", "Data Analysis", "Statistics", "Pandas", "NumPy", "Scikit-learn", "TensorFlow", "PyTorch", "Keras", "Spark", "Hadoop", "Data Warehousing", "ETL", "Tableau", "Power BI", "A/B Testing",
    # Cybersecurity
    "Cybersecurity", "Network Security", "Penetration Testing", "SIEM", "Cryptography", "Firewalls", "Intrusion Detection", "Ethical Hacking", "Malware Analysis", "Security Auditing",
    # Design & Creative
    "UI/UX Design", "User Research", "Wireframing", "Prototyping", "Figma", "Sketch", "Adobe XD", "Adobe Creative Suite", "Graphic Design", "Illustration", "Video Editing", "Motion Graphics", "After Effects", "Premiere Pro", "Photography", "Copywriting", "Branding", "Typography",
    # Business & Finance
    "Financial Modeling", "Accounting", "Auditing", "Investment Banking", "Risk Management", "Business Development", "Sales", "CRM", "Salesforce", "ERP Systems", "SAP", "Market Research", "Excel", "QuickBooks", "Financial Reporting", "Tax Law", "Compliance", "Actuarial Science", "Quantitative Analysis", "Derivatives", "Wealth Management",
    # Marketing
    "Digital Marketing", "SEO", "SEM", "Content Marketing", "Social Media Marketing", "Email Marketing", "Google Analytics", "Google Ads", "HubSpot", "Marketing Automation", "PPC",
    # Engineering (Physical)
    "AutoCAD", "SolidWorks", "MATLAB", "Revit", "Civil Engineering", "Structural Engineering", "Mechanical Engineering", "Electrical Engineering", "Circuit Design", "PLC Programming", "HVAC", "Lean Manufacturing", "Six Sigma", "Quality Assurance",
    # Healthcare
    "Electronic Health Records (EHR)", "HIPAA", "Patient Care", "Medical Billing", "Pharmacology", "Clinical Trials", "Medical Imaging", "Public Health", "Epidemiology", "Genetics", "Nursing", "Surgical Procedures",
    # HR, Legal & Admin
    "Recruiting", "Human Resources (HR)", "Employee Relations", "Onboarding", "Labor Law", "Contract Law", "Litigation", "Legal Research", "Paralegal", "Executive Assistance", "Office Management", "Event Planning", "Scheduling",
    # Science & Academia
    "Scientific Research", "Lab Techniques", "Bioinformatics", "Chemistry", "Physics", "Grant Writing", "Teaching", "Curriculum Development", "Higher Education", "Statistical Analysis Software (SAS)",
    # Other Professional
    "Project Management", "Agile", "Scrum", "JIRA", "Supply Chain Management", "Logistics", "Procurement", "Real Estate", "Architecture", "Construction Management", "Journalism", "Public Relations", "Technical Writing", "Customer Support", "Zendesk",
    # Soft Skills
    "Communication", "Teamwork", "Leadership", "Problem-Solving", "Time Management", "Adaptability", "Critical Thinking", "Creativity", "Negotiation", "Empathy", "Patience", "Attention to Detail"
]

# --- 2. Use the same 100 Job Roles ---
JOB_ROLE_SKILLS = {
    # Tech
    "Software Engineer": {"primary": ["Java", "Spring Boot", "SQL"], "secondary": ["Python", "Git", "Docker", "AWS"], "soft": ["Problem-Solving", "Teamwork"]},
    "Frontend Developer": {"primary": ["JavaScript", "React", "TypeScript", "CSS"], "secondary": ["Node.js", "Git", "Figma"], "soft": ["Attention to Detail", "Creativity"]},
    "Backend Developer": {"primary": ["Node.js", "Go", "PostgreSQL", "REST APIs"], "secondary": ["Python", "Docker", "Kubernetes", "GCP"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Full-Stack Developer": {"primary": ["React", "Node.js", "SQL", "JavaScript"], "secondary": ["AWS", "Docker", "Git", "TypeScript"], "soft": ["Adaptability", "Time Management"]},
    "Mobile App Developer": {"primary": ["Swift", "Kotlin", "React Native"], "secondary": ["Git", "REST APIs", "UI/UX Design"], "soft": ["Patience", "Attention to Detail"]},
    "DevOps Engineer": {"primary": ["AWS", "Docker", "Kubernetes", "CI/CD", "Terraform"], "secondary": ["Linux", "Scripting", "Python", "Ansible"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cloud Engineer": {"primary": ["AWS", "Azure", "GCP", "Infrastructure as Code"], "secondary": ["Terraform", "Kubernetes", "Networking"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Data Scientist": {"primary": ["Machine Learning", "Python", "Statistics", "SQL"], "secondary": ["Pandas", "Scikit-learn", "TensorFlow", "Deep Learning"], "soft": ["Critical Thinking", "Communication"]},
    "Data Analyst": {"primary": ["SQL", "Tableau", "Power BI", "Excel", "Data Analysis"], "secondary": ["Python", "Statistics", "Pandas"], "soft": ["Attention to Detail", "Communication"]},
    "Data Engineer": {"primary": ["ETL", "Data Warehousing", "Spark", "SQL", "Python"], "secondary": ["AWS", "Hadoop", "Kafka"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Machine Learning Engineer": {"primary": ["TensorFlow", "PyTorch", "Deep Learning", "Python"], "secondary": ["Kubernetes", "AWS", "Scikit-learn"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cybersecurity Analyst": {"primary": ["Cybersecurity", "SIEM", "Network Security", "Firewalls"], "secondary": ["Penetration Testing", "Linux", "Scripting"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Penetration Tester": {"primary": ["Ethical Hacking", "Penetration Testing", "Metasploit"], "secondary": ["Scripting", "Linux", "Cybersecurity"], "soft": ["Creativity", "Problem-Solving"]},
    "Database Administrator (DBA)": {"primary": ["PostgreSQL", "MongoDB", "Database Management", "Performance Tuning"], "secondary": ["SQL", "NoSQL", "Backup and Recovery"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Systems Administrator": {"primary": ["Linux", "Windows Server", "Networking", "Scripting"], "secondary": ["Active Directory", "Virtualization", "Security"], "soft": ["Problem-Solving", "Patience"]},
    "Network Engineer": {"primary": ["Cisco", "Juniper", "Routing", "Switching", "Network Security"], "secondary": ["Firewalls", "Linux", "Scripting"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "QA Engineer": {"primary": ["Quality Assurance", "Test Planning", "Automation Testing", "Selenium"], "secondary": ["JIRA", "SQL", "CI/CD"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Game Developer": {"primary": ["C++", "C#", "Unity", "Unreal Engine"], "secondary": ["3D Modeling", "Game Design", "Physics Engine"], "soft": ["Creativity", "Problem-Solving"]},
    "WordPress Developer": {"primary": ["PHP", "WordPress", "MySQL", "JavaScript"], "secondary": ["HTML", "CSS", "SEO"], "soft": ["Problem-Solving", "Time Management"]},
    "Salesforce Developer": {"primary": ["Salesforce", "Apex", "Lightning Web Components"], "secondary": ["CRM", "SQL", "JavaScript"], "soft": ["Problem-Solving", "Communication"]},
    # Design
    "UX/UI Designer": {"primary": ["Figma", "UI/UX Design", "Wireframing", "Prototyping"], "secondary": ["User Research", "Adobe XD", "Sketch"], "soft": ["Empathy", "Creativity"]},
    "Graphic Designer": {"primary": ["Adobe Creative Suite", "Illustration", "Typography", "Branding"], "secondary": ["Graphic Design", "Figma", "Photography"], "soft": ["Creativity", "Attention to Detail"]},
    "Video Editor": {"primary": ["Premiere Pro", "After Effects", "Video Editing", "Motion Graphics"], "secondary": ["Storytelling", "Color Grading"], "soft": ["Creativity", "Patience"]},
    "Animator": {"primary": ["Maya", "Blender", "Animation", "Character Design"], "secondary": ["After Effects", "Storyboarding"], "soft": ["Creativity", "Patience"]},
    "Architect": {"primary": ["AutoCAD", "Revit", "Architecture", "Building Codes"], "secondary": ["SketchUp", "Project Management"], "soft": ["Creativity", "Problem-Solving"]},
    "Interior Designer": {"primary": ["AutoCAD", "SketchUp", "Interior Design", "Space Planning"], "secondary": ["Revit", "Client Management"], "soft": ["Creativity", "Communication"]},
    "Fashion Designer": {"primary": ["Adobe Illustrator", "Pattern Making", "Textiles", "Fashion Design"], "secondary": ["Trend Forecasting", "Sewing"], "soft": ["Creativity", "Attention to Detail"]},
    "Industrial Designer": {"primary": ["SolidWorks", "Keyshot", "Product Design", "Prototyping"], "secondary": ["AutoCAD", "Manufacturing Processes"], "soft": ["Creativity", "Problem-Solving"]},
    # Business
    "Product Manager": {"primary": ["Agile", "Scrum", "Roadmap", "Market Research"], "secondary": ["JIRA", "Data Analysis", "User Stories"], "soft": ["Leadership", "Communication"]},
    "Project Manager": {"primary": ["Project Management", "JIRA", "Scrum", "Budgeting"], "secondary": ["Agile", "Risk Management", "Gantt Charts"], "soft": ["Leadership", "Time Management"]},
    "Business Analyst": {"primary": ["Requirements Gathering", "SQL", "Data Analysis", "Business Process Modeling"], "secondary": ["JIRA", "Tableau", "Communication"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Management Consultant": {"primary": ["Strategy", "Data Analysis", "Client Management", "PowerPoint"], "secondary": ["Financial Modeling", "Market Research"], "soft": ["Communication", "Problem-Solving"]},
    "Financial Analyst": {"primary": ["Financial Modeling", "Excel", "Valuation", "Accounting"], "secondary": ["SQL", "Bloomberg Terminal"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Accountant": {"primary": ["Accounting", "QuickBooks", "Excel", "Auditing"], "secondary": ["Tax Preparation", "Financial Reporting"], "soft": ["Attention to Detail", "Integrity"]},
    "Investment Banker": {"primary": ["Investment Banking", "Financial Modeling", "Mergers & Acquisitions"], "secondary": ["Valuation", "Excel", "Pitch Books"], "soft": ["Negotiation", "Time Management"]},
    "Sales Manager": {"primary": ["Sales", "CRM", "Salesforce", "Leadership"], "secondary": ["Negotiation", "Business Development"], "soft": ["Communication", "Leadership"]},
    "Marketing Manager": {"primary": ["Digital Marketing", "SEO", "Content Marketing", "Campaign Management"], "secondary": ["Google Analytics", "Social Media Marketing"], "soft": ["Leadership", "Creativity"]},
    "Digital Marketer": {"primary": ["SEO", "SEM", "Google Ads", "Social Media Marketing"], "secondary": ["Content Marketing", "Email Marketing"], "soft": ["Adaptability", "Creativity"]},
    "Content Strategist": {"primary": ["Content Marketing", "SEO", "Copywriting", "Editorial Planning"], "secondary": ["Google Analytics", "Social Media Marketing"], "soft": ["Creativity", "Communication"]},
    "HR Manager": {"primary": ["Human Resources (HR)", "Employee Relations", "Recruiting", "Labor Law"], "secondary": ["Onboarding", "Performance Management"], "soft": ["Empathy", "Leadership"]},
    "Recruiter": {"primary": ["Recruiting", "Sourcing", "Interviewing", "LinkedIn Recruiter"], "secondary": ["HR", "Negotiation"], "soft": ["Communication", "Patience"]},
    "Supply Chain Manager": {"primary": ["Supply Chain Management", "Logistics", "Procurement", "Inventory Management"], "secondary": ["SAP", "ERP Systems"], "soft": ["Problem-Solving", "Negotiation"]},
    "Operations Manager": {"primary": ["Operations Management", "Process Improvement", "Budgeting"], "secondary": ["Lean Manufacturing", "Project Management"], "soft": ["Leadership", "Problem-Solving"]},
    "Real Estate Agent": {"primary": ["Real Estate", "Sales", "Negotiation", "Property Law"], "secondary": ["CRM", "Marketing"], "soft": ["Communication", "Patience"]},
    "Executive Assistant": {"primary": ["Scheduling", "Office Management", "Travel Coordination", "Microsoft Office"], "secondary": ["Event Planning", "Communication"], "soft": ["Time Management", "Adaptability"]},
    "Customer Support Specialist": {"primary": ["Customer Support", "Zendesk", "Communication", "Problem-Solving"], "secondary": ["CRM", "Patience"], "soft": ["Empathy", "Patience"]},
    # Engineering
    "Mechanical Engineer": {"primary": ["SolidWorks", "AutoCAD", "Mechanical Engineering", "Thermodynamics"], "secondary": ["MATLAB", "Finite Element Analysis"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Electrical Engineer": {"primary": ["Circuit Design", "MATLAB", "Electrical Engineering", "Power Systems"], "secondary": ["AutoCAD", "PLC Programming"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Civil Engineer": {"primary": ["Civil Engineering", "AutoCAD", "Structural Engineering", "Revit"], "secondary": ["Geotechnical Engineering", "Construction Management"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Chemical Engineer": {"primary": ["Chemical Engineering", "Process Simulation", "Aspen HYSYS"], "secondary": ["Thermodynamics", "Fluid Mechanics"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Aerospace Engineer": {"primary": ["Aerospace Engineering", "Aerodynamics", "MATLAB", "CATIA"], "secondary": ["SolidWorks", "Propulsion Systems"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Biomedical Engineer": {"primary": ["Biomedical Engineering", "Medical Devices", "SolidWorks", "FDA Regulations"], "secondary": ["MATLAB", "Biomaterials"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Environmental Engineer": {"primary": ["Environmental Engineering", "Water Quality", "GIS", "AutoCAD"], "secondary": ["Hydrology", "Waste Management"], "soft": ["Problem-Solving", "Critical Thinking"]},
    # Healthcare
    "Physician": {"primary": ["Medicine", "Diagnosis", "Patient Care", "Pharmacology"], "secondary": ["EHR", "Medical Ethics"], "soft": ["Empathy", "Communication"]},
    "Registered Nurse": {"primary": ["Nursing", "Patient Care", "EHR", "HIPAA"], "secondary": ["Medication Administration", "Wound Care"], "soft": ["Empathy", "Patience"]},
    "Pharmacist": {"primary": ["Pharmacology", "Medication Dispensing", "Patient Counseling"], "secondary": ["Pharmacy Law", "Chemistry"], "soft": ["Attention to Detail", "Communication"]},
    "Dentist": {"primary": ["Dentistry", "Oral Surgery", "X-Rays", "Patient Care"], "secondary": ["Cosmetic Dentistry", "Orthodontics"], "soft": ["Attention to Detail", "Patience"]},
    "Physical Therapist": {"primary": ["Physical Therapy", "Rehabilitation", "Anatomy", "Kinesiology"], "secondary": ["Patient Care", "EHR"], "soft": ["Empathy", "Patience"]},
    "Veterinarian": {"primary": ["Veterinary Medicine", "Animal Surgery", "Animal Husbandry"], "secondary": ["Pharmacology", "Radiology"], "soft": ["Empathy", "Communication"]},
    "Medical Lab Scientist": {"primary": ["Laboratory Techniques", "Microbiology", "Hematology", "Chemistry"], "secondary": ["Quality Control", "LIMS"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Radiologic Technologist": {"primary": ["Medical Imaging", "X-Rays", "MRI", "CT Scan"], "secondary": ["Patient Care", "Anatomy"], "soft": ["Attention to Detail", "Patience"]},
    "Public Health Official": {"primary": ["Public Health", "Epidemiology", "Health Policy", "Statistics"], "secondary": ["Grant Writing", "Community Outreach"], "soft": ["Communication", "Critical Thinking"]},
    # Legal
    "Lawyer": {"primary": ["Litigation", "Contract Law", "Legal Research", "Negotiation"], "secondary": ["Corporate Law", "Torts"], "soft": ["Critical Thinking", "Communication"]},
    "Paralegal": {"primary": ["Paralegal", "Legal Research", "Document Drafting", "Westlaw"], "secondary": ["Litigation Support", "Case Management"], "soft": ["Attention to Detail", "Time Management"]},
    "Compliance Officer": {"primary": ["Compliance", "Regulatory Affairs", "Risk Management", "Auditing"], "secondary": ["Legal Research", "Corporate Law"], "soft": ["Attention to Detail", "Integrity"]},
    # Science & Academia
    "Research Scientist": {"primary": ["Scientific Research", "Data Analysis", "Lab Techniques", "Grant Writing"], "secondary": ["Python", "Statistics", "Publishing"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Biologist": {"primary": ["Biology", "Genetics", "Lab Techniques", "Microscopy"], "secondary": ["Bioinformatics", "Ecology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Chemist": {"primary": ["Chemistry", "Spectroscopy", "Chromatography", "Lab Techniques"], "secondary": ["Organic Chemistry", "Analytical Chemistry"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Physicist": {"primary": ["Physics", "Quantum Mechanics", "MATLAB", "Data Analysis"], "secondary": ["Optics", "Thermodynamics"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Geologist": {"primary": ["Geology", "Field Mapping", "GIS", "Petrology"], "secondary": ["Seismology", "Hydrology"], "soft": ["Problem-Solving", "Adaptability"]},
    "Professor": {"primary": ["Teaching", "Higher Education", "Curriculum Development", "Scientific Research"], "secondary": ["Grant Writing", "Public Speaking"], "soft": ["Communication", "Leadership"]},
    "Librarian": {"primary": ["Library Science", "Cataloging", "Research Assistance", "Database Management"], "secondary": ["Information Literacy", "Archiving"], "soft": ["Patience", "Attention to Detail"]},
    # Creative & Media
    "Journalist": {"primary": ["Journalism", "Interviewing", "Copywriting", "Fact-Checking"], "secondary": ["SEO", "Social Media"], "soft": ["Communication", "Critical Thinking"]},
    "Public Relations Specialist": {"primary": ["Public Relations", "Media Outreach", "Press Releases", "Crisis Management"], "secondary": ["Social Media Marketing", "Event Planning"], "soft": ["Communication", "Adaptability"]},
    "Technical Writer": {"primary": ["Technical Writing", "Documentation", "API Documentation", "Markdown"], "secondary": ["Git", "JIRA"], "soft": ["Attention to Detail", "Communication"]},
    "Photographer": {"primary": ["Photography", "Adobe Lightroom", "Adobe Photoshop", "Lighting"], "secondary": ["Client Management", "Photo Retouching"], "soft": ["Creativity", "Patience"]},
    "Musician": {"primary": ["Music Theory", "Instrument Performance", "Composition"], "secondary": ["DAW", "Audio Engineering"], "soft": ["Creativity", "Discipline"]},
    "Actor": {"primary": ["Acting", "Improvisation", "Script Analysis", "Voice and Diction"], "secondary": ["Auditioning", "Stage Combat"], "soft": ["Empathy", "Communication"]},
    # Skilled Trades & Others
    "Chef": {"primary": ["Culinary Arts", "Menu Development", "Food Safety", "Kitchen Management"], "secondary": ["Inventory Management", "Cost Control"], "soft": ["Creativity", "Time Management"]},
    "Electrician": {"primary": ["Electrical Wiring", "National Electrical Code (NEC)", "Blueprint Reading"], "secondary": ["Troubleshooting", "Safety Procedures"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Plumber": {"primary": ["Plumbing", "Pipefitting", "Blueprint Reading", "Drain Cleaning"], "secondary": ["HVAC", "Gas Fitting"], "soft": ["Problem-Solving", "Patience"]},
    "Carpenter": {"primary": ["Carpentry", "Woodworking", "Blueprint Reading", "Framing"], "secondary": ["Cabinet Making", "Finish Work"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Welder": {"primary": ["Welding", "MIG", "TIG", "Blueprint Reading"], "secondary": ["Fabrication", "Grinding"], "soft": ["Attention to Detail", "Patience"]},
    "Construction Manager": {"primary": ["Construction Management", "Project Management", "Budgeting", "OSHA"], "secondary": ["Blueprint Reading", "Contract Negotiation"], "soft": ["Leadership", "Problem-Solving"]},
    "Pilot": {"primary": ["Piloting", "Flight Planning", "Navigation", "FAA Regulations"], "secondary": ["Aerodynamics", "Meteorology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Air Traffic Controller": {"primary": ["Air Traffic Control", "Radar Operation", "Communication", "FAA Regulations"], "secondary": ["Meteorology", "Emergency Procedures"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Translator": {"primary": ["Bilingual", "Translation", "Proofreading", "Localization"], "secondary": ["Cultural Awareness", "CAT Tools"], "soft": ["Attention to Detail", "Communication"]},
    "Event Planner": {"primary": ["Event Planning", "Budgeting", "Vendor Management", "Negotiation"], "secondary": ["Marketing", "Logistics"], "soft": ["Time Management", "Adaptability"]},
    "Fitness Trainer": {"primary": ["Personal Training", "Anatomy", "Kinesiology", "Nutrition"], "secondary": ["Client Management", "CPR/AED"], "soft": ["Communication", "Empathy"]},
    "Social Worker": {"primary": ["Social Work", "Case Management", "Counseling", "Crisis Intervention"], "secondary": ["Psychology", "Community Resources"], "soft": ["Empathy", "Patience"]},
    "Urban Planner": {"primary": ["Urban Planning", "GIS", "Zoning Laws", "Community Development"], "secondary": ["Data Analysis", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},
}

def generate_applicant_profile(job_role):
    """
    Generates a single profile using weighted skills for a much cleaner signal.
    """
    profile = {skill: 0.0 for skill in ALL_SKILLS} # Use floats instead of ints
    
    if job_role not in JOB_ROLE_SKILLS:
        return None
        
    skills_for_role = JOB_ROLE_SKILLS[job_role]

    # 1. Add primary skills with a high weight + some random variation
    for skill in skills_for_role["primary"]:
        if skill in profile:
            profile[skill] = round(random.uniform(0.8, 1.0), 2)

    # 2. Add secondary skills with a medium weight
    if skills_for_role.get("secondary"):
        for skill in skills_for_role["secondary"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.4, 0.7), 2)
    
    # 3. Add soft skills with a low, consistent weight
    if skills_for_role.get("soft"):
        for skill in skills_for_role["soft"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.1, 0.3), 2)

    # 4. Add the target variable
    profile["Job_Role"] = job_role

    return profile

def create_dataset(num_rows=50000):
    """
    Creates the full dataset with the specified number of rows.
    """
    print(f"Generating dataset with {num_rows} rows and {len(JOB_ROLE_SKILLS)} unique job roles...")
    dataset = []
    job_roles = list(JOB_ROLE_SKILLS.keys())

    for i in range(num_rows):
        if (i + 1) % 5000 == 0:
            print(f"  ...generated {i+1} rows")
        
        random_role = random.choice(job_roles)
        applicant_profile = generate_applicant_profile(random_role)
        if applicant_profile:
            dataset.append(applicant_profile)
    
    print("Dataset generation complete.")
    return pd.DataFrame(dataset)

if __name__ == "__main__":
    # --- Generate and Save the Dataset ---
    df = create_dataset(num_rows=50000)

    # Reorder columns to have "Job_Role" at the end
    final_cols = [col for col in ALL_SKILLS if col in df.columns] + ['Job_Role']
    df = df[final_cols]
    
    # Save to CSV
    output_filename = 'job_skills_weighted_dataset.csv'
    df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully created '{output_filename}' with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Total unique job roles: {df['Job_Role'].nunique()}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())


Generating dataset with 50000 rows and 91 unique job roles...
  ...generated 5000 rows
  ...generated 10000 rows
  ...generated 15000 rows
  ...generated 20000 rows
  ...generated 25000 rows
  ...generated 30000 rows
  ...generated 35000 rows
  ...generated 40000 rows
  ...generated 45000 rows
  ...generated 50000 rows
Dataset generation complete.

Successfully created 'job_skills_weighted_dataset.csv' with 50000 rows and 193 columns.
Total unique job roles: 91

First 5 rows of the dataset:
   Python  Java  JavaScript   C#  C++   Go  Ruby  PHP  Swift  Kotlin  ...  \
0    0.00   0.0         0.0  0.0  0.0  0.0   0.0  0.0    0.0     0.0  ...   
1    0.00   0.0         0.0  0.0  0.0  0.0   0.0  0.0    0.0     0.0  ...   
2    0.00   0.0         0.0  0.0  0.0  0.0   0.0  0.0    0.0     0.0  ...   
3    0.48   0.0         0.0  0.0  0.0  0.0   0.0  0.0    0.0     0.0  ...   
4    0.00   0.0         0.0  0.0  0.0  0.0   0.0  0.0    0.0     0.0  ...   

   Problem-Solving  Time Management  Adap

## Attempt 4 , 80 job roles

In [3]:
import pandas as pd
import random
import os

# --- 1. Comprehensive Skill List (Features) ---
ALL_SKILLS = [
    # Tech (30 Roles)
    "Python", "Java", "JavaScript", "C#", "Go", "TypeScript", "SQL", "NoSQL", "PostgreSQL", "MongoDB", "React", "Node.js", "Django", "Flask", "Spring Boot", ".NET", "AWS", "Azure", "Google Cloud Platform (GCP)", "Docker", "Kubernetes", "Terraform", "Ansible", "Jenkins", "Git", "CI/CD", "Linux", "Scripting", "REST APIs", "Microservices",
    "Machine Learning", "Deep Learning", "Natural Language Processing (NLP)", "Data Analysis", "Statistics", "Pandas", "Scikit-learn", "TensorFlow", "PyTorch", "Spark", "Data Warehousing", "ETL", "Tableau", "Power BI",
    "Cybersecurity", "Network Security", "Penetration Testing", "SIEM", "Cryptography", "Firewalls", "Ethical Hacking",
    "UI/UX Design", "User Research", "Wireframing", "Prototyping", "Figma", "Sketch", "Adobe XD",
    "Salesforce", "Apex", "SAP", "ERP Systems",
    "Game Design", "Unity", "Unreal Engine",
    "Quantum Computing", "Qiskit",
    # Other Top Professions (50 Roles)
    "Financial Modeling", "Accounting", "Auditing", "Investment Banking", "Risk Management", "Sales", "CRM", "Market Research", "Excel", "Financial Reporting", "Wealth Management",
    "Digital Marketing", "SEO", "Content Marketing", "Social Media Marketing", "Google Analytics", "Copywriting", "Branding",
    "Project Management", "Agile", "Scrum", "JIRA", "Budgeting",
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering", "AutoCAD", "SolidWorks", "MATLAB", "Revit", "Structural Engineering",
    "Patient Care", "Electronic Health Records (EHR)", "HIPAA", "Pharmacology", "Clinical Trials", "Medical Imaging", "Nursing", "Surgical Procedures", "Diagnosis", "Medicine",
    "Human Resources (HR)", "Recruiting", "Employee Relations", "Onboarding", "Labor Law",
    "Contract Law", "Litigation", "Legal Research",
    "Scientific Research", "Lab Techniques", "Grant Writing", "Teaching", "Curriculum Development",
    "Supply Chain Management", "Logistics", "Procurement",
    "Architecture", "Construction Management", "Journalism", "Public Relations",
    # Soft Skills
    "Communication", "Teamwork", "Leadership", "Problem-Solving", "Time Management", "Adaptability", "Critical Thinking", "Creativity", "Negotiation", "Empathy", "Patience", "Attention to Detail"
]

# --- 2. Define 80 Job Roles with Distinct Skill Sets ---
JOB_ROLE_SKILLS = {
    # --- 30 Tech Roles ---
    "Software Engineer": {"primary": ["Java", "Spring Boot", "SQL", "Git"], "secondary": ["Python", "Docker", "Microservices"], "soft": ["Problem-Solving", "Teamwork"]},
    "Frontend Developer": {"primary": ["JavaScript", "React", "TypeScript", "CSS"], "secondary": ["Node.js", "Git", "Figma"], "soft": ["Attention to Detail", "Creativity"]},
    "Backend Developer": {"primary": ["Node.js", "Go", "PostgreSQL", "REST APIs"], "secondary": ["Python", "Docker", "Kubernetes"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Full-Stack Developer": {"primary": ["React", "Node.js", "SQL", "JavaScript"], "secondary": ["AWS", "Docker", "Git"], "soft": ["Adaptability", "Time Management"]},
    "DevOps Engineer": {"primary": ["AWS", "Docker", "Kubernetes", "CI/CD", "Terraform"], "secondary": ["Linux", "Scripting", "Ansible"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cloud Engineer": {"primary": ["AWS", "Azure", "GCP", "Infrastructure as Code"], "secondary": ["Terraform", "Kubernetes", "Networking"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Data Scientist": {"primary": ["Machine Learning", "Python", "Statistics", "SQL"], "secondary": ["Pandas", "Scikit-learn", "TensorFlow"], "soft": ["Critical Thinking", "Communication"]},
    "Data Analyst": {"primary": ["SQL", "Tableau", "Power BI", "Excel", "Data Analysis"], "secondary": ["Python", "Statistics"], "soft": ["Attention to Detail", "Communication"]},
    "Data Engineer": {"primary": ["ETL", "Data Warehousing", "Spark", "SQL", "Python"], "secondary": ["AWS", "Hadoop"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Machine Learning Engineer": {"primary": ["TensorFlow", "PyTorch", "Deep Learning", "Python", "MLOps"], "secondary": ["Kubernetes", "AWS"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cybersecurity Analyst": {"primary": ["Cybersecurity", "SIEM", "Network Security", "Firewalls"], "secondary": ["Penetration Testing", "Linux"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Penetration Tester": {"primary": ["Ethical Hacking", "Penetration Testing", "Metasploit"], "secondary": ["Scripting", "Cybersecurity"], "soft": ["Creativity", "Problem-Solving"]},
    "Database Administrator (DBA)": {"primary": ["PostgreSQL", "MongoDB", "Database Management"], "secondary": ["SQL", "NoSQL", "Performance Tuning"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Systems Administrator": {"primary": ["Linux", "Windows Server", "Networking", "Scripting"], "secondary": ["Active Directory", "Virtualization"], "soft": ["Problem-Solving", "Patience"]},
    "Network Engineer": {"primary": ["Cisco", "Juniper", "Routing", "Switching"], "secondary": ["Firewalls", "Network Security"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "QA Engineer": {"primary": ["Quality Assurance", "Test Planning", "Automation Testing"], "secondary": ["JIRA", "SQL", "Selenium"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Game Developer": {"primary": ["C++", "C#", "Unity", "Unreal Engine"], "secondary": ["Game Design", "3D Modeling"], "soft": ["Creativity", "Problem-Solving"]},
    "Salesforce Developer": {"primary": ["Salesforce", "Apex", "Lightning Web Components"], "secondary": ["CRM", "SQL"], "soft": ["Problem-Solving", "Communication"]},
    "UX/UI Designer": {"primary": ["Figma", "UI/UX Design", "Wireframing", "Prototyping"], "secondary": ["User Research", "Sketch"], "soft": ["Empathy", "Creativity"]},
    "Product Manager (Tech)": {"primary": ["Agile", "Scrum", "Roadmap", "JIRA"], "secondary": ["Market Research", "Data Analysis"], "soft": ["Leadership", "Communication"]},
    "IT Support Specialist": {"primary": ["Hardware Troubleshooting", "Software Installation", "Customer Support"], "secondary": ["Active Directory", "Networking"], "soft": ["Patience", "Problem-Solving"]},
    "Mobile App Developer": {"primary": ["Swift", "Kotlin", "React Native"], "secondary": ["Git", "REST APIs"], "soft": ["Patience", "Attention to Detail"]},
    "Solutions Architect": {"primary": ["AWS", "Azure", "System Design", "Microservices"], "secondary": ["GCP", "Terraform"], "soft": ["Communication", "Leadership"]},
    "Blockchain Developer": {"primary": ["Solidity", "Ethereum", "Smart Contracts"], "secondary": ["JavaScript", "Cryptography"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Robotics Engineer": {"primary": ["Robotics", "C++", "Python", "ROS"], "secondary": ["MATLAB", "Computer Vision"], "soft": ["Problem-Solving", "Creativity"]},
    "Firmware Engineer": {"primary": ["C", "C++", "Embedded Systems", "Microcontrollers"], "secondary": ["RTOS", "Linux"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Site Reliability Engineer (SRE)": {"primary": ["Kubernetes", "Prometheus", "Go", "Python"], "secondary": ["AWS", "CI/CD"], "soft": ["Problem-Solving", "Adaptability"]},
    "Security Engineer": {"primary": ["Cryptography", "DevSecOps", "Cloud Security"], "secondary": ["Python", "Penetration Testing"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Quantum Computing Scientist": {"primary": ["Quantum Computing", "Qiskit", "Python", "Linear Algebra"], "secondary": ["Physics", "Machine Learning"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "AI Ethics Specialist": {"primary": ["AI Ethics", "Fairness", "Accountability", "Transparency"], "secondary": ["Machine Learning", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},

    # --- 50 Other Top Professions ---
    "Physician": {"primary": ["Medicine", "Diagnosis", "Patient Care", "Pharmacology"], "secondary": ["EHR", "Medical Ethics"], "soft": ["Empathy", "Communication"]},
    "Lawyer": {"primary": ["Litigation", "Contract Law", "Legal Research", "Negotiation"], "secondary": ["Corporate Law", "Torts"], "soft": ["Critical Thinking", "Communication"]},
    "Management Consultant": {"primary": ["Strategy", "Data Analysis", "Client Management"], "secondary": ["Financial Modeling", "Market Research"], "soft": ["Communication", "Problem-Solving"]},
    "Investment Banker": {"primary": ["Investment Banking", "Financial Modeling", "Mergers & Acquisitions"], "secondary": ["Valuation", "Excel"], "soft": ["Negotiation", "Time Management"]},
    "Marketing Manager": {"primary": ["Digital Marketing", "SEO", "Campaign Management"], "secondary": ["Google Analytics", "Content Marketing"], "soft": ["Leadership", "Creativity"]},
    "Architect": {"primary": ["AutoCAD", "Revit", "Architecture", "Building Codes"], "secondary": ["SketchUp", "Project Management"], "soft": ["Creativity", "Problem-Solving"]},
    "Mechanical Engineer": {"primary": ["SolidWorks", "AutoCAD", "Mechanical Engineering"], "secondary": ["MATLAB", "Thermodynamics"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Registered Nurse": {"primary": ["Nursing", "Patient Care", "EHR", "HIPAA"], "secondary": ["Medication Administration", "Wound Care"], "soft": ["Empathy", "Patience"]},
    "Accountant": {"primary": ["Accounting", "QuickBooks", "Excel", "Auditing"], "secondary": ["Tax Preparation", "Financial Reporting"], "soft": ["Attention to Detail", "Integrity"]},
    "Financial Analyst": {"primary": ["Financial Modeling", "Excel", "Valuation"], "secondary": ["SQL", "Accounting"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "HR Manager": {"primary": ["Human Resources (HR)", "Employee Relations", "Recruiting"], "secondary": ["Labor Law", "Onboarding"], "soft": ["Empathy", "Leadership"]},
    "Project Manager (Non-Tech)": {"primary": ["Project Management", "Budgeting", "Risk Management"], "secondary": ["Scrum", "Gantt Charts"], "soft": ["Leadership", "Time Management"]},
    "Civil Engineer": {"primary": ["Civil Engineering", "AutoCAD", "Structural Engineering"], "secondary": ["Revit", "Construction Management"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Electrical Engineer": {"primary": ["Circuit Design", "MATLAB", "Electrical Engineering"], "secondary": ["AutoCAD", "Power Systems"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Pharmacist": {"primary": ["Pharmacology", "Medication Dispensing", "Patient Counseling"], "secondary": ["Pharmacy Law", "Chemistry"], "soft": ["Attention to Detail", "Communication"]},
    "Dentist": {"primary": ["Dentistry", "Oral Surgery", "X-Rays"], "secondary": ["Patient Care", "Orthodontics"], "soft": ["Attention to Detail", "Patience"]},
    "Veterinarian": {"primary": ["Veterinary Medicine", "Animal Surgery", "Animal Husbandry"], "secondary": ["Pharmacology", "Radiology"], "soft": ["Empathy", "Communication"]},
    "Physical Therapist": {"primary": ["Physical Therapy", "Rehabilitation", "Anatomy"], "secondary": ["Patient Care", "EHR"], "soft": ["Empathy", "Patience"]},
    "Sales Manager": {"primary": ["Sales", "CRM", "Salesforce", "Leadership"], "secondary": ["Negotiation", "Business Development"], "soft": ["Communication", "Leadership"]},
    "Supply Chain Manager": {"primary": ["Supply Chain Management", "Logistics", "Procurement"], "secondary": ["SAP", "Inventory Management"], "soft": ["Problem-Solving", "Negotiation"]},
    "Operations Manager": {"primary": ["Operations Management", "Process Improvement", "Budgeting"], "secondary": ["Lean Manufacturing", "Project Management"], "soft": ["Leadership", "Problem-Solving"]},
    "Chef": {"primary": ["Culinary Arts", "Menu Development", "Food Safety"], "secondary": ["Kitchen Management", "Cost Control"], "soft": ["Creativity", "Time Management"]},
    "Pilot": {"primary": ["Piloting", "Flight Planning", "Navigation"], "secondary": ["FAA Regulations", "Meteorology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Graphic Designer": {"primary": ["Adobe Creative Suite", "Illustration", "Typography"], "secondary": ["Branding", "Figma"], "soft": ["Creativity", "Attention to Detail"]},
    "Journalist": {"primary": ["Journalism", "Interviewing", "Copywriting"], "secondary": ["Fact-Checking", "SEO"], "soft": ["Communication", "Critical Thinking"]},
    "Public Relations Specialist": {"primary": ["Public Relations", "Media Outreach", "Press Releases"], "secondary": ["Crisis Management", "Social Media Marketing"], "soft": ["Communication", "Adaptability"]},
    "Construction Manager": {"primary": ["Construction Management", "Project Management", "OSHA"], "secondary": ["Budgeting", "Blueprint Reading"], "soft": ["Leadership", "Problem-Solving"]},
    "Real Estate Agent": {"primary": ["Real Estate", "Sales", "Negotiation"], "secondary": ["Property Law", "CRM"], "soft": ["Communication", "Patience"]},
    "Teacher": {"primary": ["Teaching", "Curriculum Development", "Classroom Management"], "secondary": ["Subject Matter Expertise", "Pedagogy"], "soft": ["Patience", "Communication"]},
    "Professor": {"primary": ["Higher Education", "Scientific Research", "Grant Writing"], "secondary": ["Teaching", "Publishing"], "soft": ["Communication", "Critical Thinking"]},
    "Librarian": {"primary": ["Library Science", "Cataloging", "Research Assistance"], "secondary": ["Database Management", "Archiving"], "soft": ["Patience", "Attention to Detail"]},
    "Social Worker": {"primary": ["Social Work", "Case Management", "Counseling"], "secondary": ["Crisis Intervention", "Community Resources"], "soft": ["Empathy", "Patience"]},
    "Psychologist": {"primary": ["Psychology", "Therapy", "Diagnosis"], "secondary": ["Cognitive Behavioral Therapy (CBT)", "Patient Care"], "soft": ["Empathy", "Communication"]},
    "Actuary": {"primary": ["Actuarial Science", "Statistics", "Risk Management"], "secondary": ["Financial Modeling", "SQL"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Statistician": {"primary": ["Statistics", "Data Analysis", "R", "SAS"], "secondary": ["Python", "Machine Learning"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Economist": {"primary": ["Economics", "Econometrics", "Data Analysis"], "secondary": ["Market Research", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},
    "Urban Planner": {"primary": ["Urban Planning", "GIS", "Zoning Laws"], "secondary": ["Community Development", "Data Analysis"], "soft": ["Critical Thinking", "Communication"]},
    "Geologist": {"primary": ["Geology", "Field Mapping", "GIS"], "secondary": ["Petrology", "Seismology"], "soft": ["Problem-Solving", "Adaptability"]},
    "Biologist": {"primary": ["Biology", "Genetics", "Lab Techniques"], "secondary": ["Microscopy", "Bioinformatics"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Chemist": {"primary": ["Chemistry", "Spectroscopy", "Chromatography"], "secondary": ["Lab Techniques", "Organic Chemistry"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Event Planner": {"primary": ["Event Planning", "Budgeting", "Vendor Management"], "secondary": ["Negotiation", "Marketing"], "soft": ["Time Management", "Adaptability"]},
    "Paralegal": {"primary": ["Paralegal", "Legal Research", "Document Drafting"], "secondary": ["Westlaw", "Litigation Support"], "soft": ["Attention to Detail", "Time Management"]},
    "Translator": {"primary": ["Bilingual", "Translation", "Proofreading"], "secondary": ["Localization", "Cultural Awareness"], "soft": ["Attention to Detail", "Communication"]},
    "Fitness Trainer": {"primary": ["Personal Training", "Anatomy", "Kinesiology"], "secondary": ["Nutrition", "Client Management"], "soft": ["Communication", "Empathy"]},
    "Firefighter": {"primary": ["Firefighting", "Emergency Medical Services (EMS)", "Hazardous Materials"], "secondary": ["Rescue Operations", "CPR"], "soft": ["Teamwork", "Problem-Solving"]},
    "Police Officer": {"primary": ["Law Enforcement", "Criminal Law", "Patrol Procedures"], "secondary": ["Investigation", "Self-Defense"], "soft": ["Communication", "Critical Thinking"]},
    "Electrician": {"primary": ["Electrical Wiring", "National Electrical Code (NEC)", "Blueprint Reading"], "secondary": ["Troubleshooting", "Safety Procedures"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Plumber": {"primary": ["Plumbing", "Pipefitting", "Blueprint Reading"], "secondary": ["Drain Cleaning", "HVAC"], "soft": ["Problem-Solving", "Patience"]},
    "Welder": {"primary": ["Welding", "MIG", "TIG", "Blueprint Reading"], "secondary": ["Fabrication", "Grinding"], "soft": ["Attention to Detail", "Patience"]},
    "Artist": {"primary": ["Painting", "Drawing", "Sculpture"], "secondary": ["Art History", "Color Theory"], "soft": ["Creativity", "Patience"]},
}

def generate_applicant_profile(job_role):
    """
    Generates a single profile using weighted skills for a much cleaner signal.
    """
    profile = {skill: 0.0 for skill in ALL_SKILLS}
    
    if job_role not in JOB_ROLE_SKILLS:
        return None
        
    skills_for_role = JOB_ROLE_SKILLS[job_role]

    # 1. Add primary skills with a high weight + some random variation
    for skill in skills_for_role["primary"]:
        if skill in profile:
            profile[skill] = round(random.uniform(0.8, 1.0), 2)

    # 2. Add secondary skills with a medium weight
    if skills_for_role.get("secondary"):
        for skill in skills_for_role["secondary"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.4, 0.7), 2)
    
    # 3. Add soft skills with a low, consistent weight
    if skills_for_role.get("soft"):
        for skill in skills_for_role["soft"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.1, 0.3), 2)

    profile["Job_Role"] = job_role
    return profile

def create_dataset(num_rows=50000):
    """
    Creates the full dataset with the specified number of rows.
    """
    print(f"Generating dataset with {num_rows} rows and {len(JOB_ROLE_SKILLS)} unique job roles...")
    dataset = []
    job_roles = list(JOB_ROLE_SKILLS.keys())

    for i in range(num_rows):
        if (i + 1) % 5000 == 0:
            print(f"  ...generated {i+1} rows")
        
        random_role = random.choice(job_roles)
        applicant_profile = generate_applicant_profile(random_role)
        if applicant_profile:
            dataset.append(applicant_profile)
    
    print("Dataset generation complete.")
    return pd.DataFrame(dataset)

if __name__ == "__main__":
    df = create_dataset(num_rows=50000)

    final_cols = [col for col in ALL_SKILLS if col in df.columns] + ['Job_Role']
    df = df[final_cols]
    
    output_filename = 'job_skills_80_roles_weighted.csv'
    df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully created '{output_filename}' with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Total unique job roles: {df['Job_Role'].nunique()}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())


Generating dataset with 50000 rows and 80 unique job roles...
  ...generated 5000 rows
  ...generated 10000 rows
  ...generated 15000 rows
  ...generated 20000 rows
  ...generated 25000 rows
  ...generated 30000 rows
  ...generated 35000 rows
  ...generated 40000 rows
  ...generated 45000 rows
  ...generated 50000 rows
Dataset generation complete.

Successfully created 'job_skills_80_roles_weighted.csv' with 50000 rows and 141 columns.
Total unique job roles: 80

First 5 rows of the dataset:
   Python  Java  JavaScript   C#   Go  TypeScript   SQL  NoSQL  PostgreSQL  \
0     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
1     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
2     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
3     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
4     0.0   0.0         1.0  0.0  0.0         0.0  0.88    0.0         0.0   

   MongoDB  ...  Problem-Solving  