## Attempt 4, 80 job roles

In [3]:
import pandas as pd
import random
import os

# --- 1. Comprehensive Skill List (Features) ---
ALL_SKILLS = [
    # Tech (30 Roles)
    "Python", "Java", "JavaScript", "C#", "Go", "TypeScript", "SQL", "NoSQL", "PostgreSQL", "MongoDB", "React", "Node.js", "Django", "Flask", "Spring Boot", ".NET", "AWS", "Azure", "Google Cloud Platform (GCP)", "Docker", "Kubernetes", "Terraform", "Ansible", "Jenkins", "Git", "CI/CD", "Linux", "Scripting", "REST APIs", "Microservices",
    "Machine Learning", "Deep Learning", "Natural Language Processing (NLP)", "Data Analysis", "Statistics", "Pandas", "Scikit-learn", "TensorFlow", "PyTorch", "Spark", "Data Warehousing", "ETL", "Tableau", "Power BI",
    "Cybersecurity", "Network Security", "Penetration Testing", "SIEM", "Cryptography", "Firewalls", "Ethical Hacking",
    "UI/UX Design", "User Research", "Wireframing", "Prototyping", "Figma", "Sketch", "Adobe XD",
    "Salesforce", "Apex", "SAP", "ERP Systems",
    "Game Design", "Unity", "Unreal Engine",
    "Quantum Computing", "Qiskit",
    # Other Top Professions (50 Roles)
    "Financial Modeling", "Accounting", "Auditing", "Investment Banking", "Risk Management", "Sales", "CRM", "Market Research", "Excel", "Financial Reporting", "Wealth Management",
    "Digital Marketing", "SEO", "Content Marketing", "Social Media Marketing", "Google Analytics", "Copywriting", "Branding",
    "Project Management", "Agile", "Scrum", "JIRA", "Budgeting",
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering", "AutoCAD", "SolidWorks", "MATLAB", "Revit", "Structural Engineering",
    "Patient Care", "Electronic Health Records (EHR)", "HIPAA", "Pharmacology", "Clinical Trials", "Medical Imaging", "Nursing", "Surgical Procedures", "Diagnosis", "Medicine",
    "Human Resources (HR)", "Recruiting", "Employee Relations", "Onboarding", "Labor Law",
    "Contract Law", "Litigation", "Legal Research",
    "Scientific Research", "Lab Techniques", "Grant Writing", "Teaching", "Curriculum Development",
    "Supply Chain Management", "Logistics", "Procurement",
    "Architecture", "Construction Management", "Journalism", "Public Relations",
    # Soft Skills
    "Communication", "Teamwork", "Leadership", "Problem-Solving", "Time Management", "Adaptability", "Critical Thinking", "Creativity", "Negotiation", "Empathy", "Patience", "Attention to Detail"
]

# --- 2. Define 80 Job Roles with Distinct Skill Sets ---
JOB_ROLE_SKILLS = {
    # --- 30 Tech Roles ---
    "Software Engineer": {"primary": ["Java", "Spring Boot", "SQL", "Git"], "secondary": ["Python", "Docker", "Microservices"], "soft": ["Problem-Solving", "Teamwork"]},
    "Frontend Developer": {"primary": ["JavaScript", "React", "TypeScript", "CSS"], "secondary": ["Node.js", "Git", "Figma"], "soft": ["Attention to Detail", "Creativity"]},
    "Backend Developer": {"primary": ["Node.js", "Go", "PostgreSQL", "REST APIs"], "secondary": ["Python", "Docker", "Kubernetes"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Full-Stack Developer": {"primary": ["React", "Node.js", "SQL", "JavaScript"], "secondary": ["AWS", "Docker", "Git"], "soft": ["Adaptability", "Time Management"]},
    "DevOps Engineer": {"primary": ["AWS", "Docker", "Kubernetes", "CI/CD", "Terraform"], "secondary": ["Linux", "Scripting", "Ansible"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cloud Engineer": {"primary": ["AWS", "Azure", "GCP", "Infrastructure as Code"], "secondary": ["Terraform", "Kubernetes", "Networking"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Data Scientist": {"primary": ["Machine Learning", "Python", "Statistics", "SQL"], "secondary": ["Pandas", "Scikit-learn", "TensorFlow"], "soft": ["Critical Thinking", "Communication"]},
    "Data Analyst": {"primary": ["SQL", "Tableau", "Power BI", "Excel", "Data Analysis"], "secondary": ["Python", "Statistics"], "soft": ["Attention to Detail", "Communication"]},
    "Data Engineer": {"primary": ["ETL", "Data Warehousing", "Spark", "SQL", "Python"], "secondary": ["AWS", "Hadoop"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Machine Learning Engineer": {"primary": ["TensorFlow", "PyTorch", "Deep Learning", "Python", "MLOps"], "secondary": ["Kubernetes", "AWS"], "soft": ["Problem-Solving", "Adaptability"]},
    "Cybersecurity Analyst": {"primary": ["Cybersecurity", "SIEM", "Network Security", "Firewalls"], "secondary": ["Penetration Testing", "Linux"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Penetration Tester": {"primary": ["Ethical Hacking", "Penetration Testing", "Metasploit"], "secondary": ["Scripting", "Cybersecurity"], "soft": ["Creativity", "Problem-Solving"]},
    "Database Administrator (DBA)": {"primary": ["PostgreSQL", "MongoDB", "Database Management"], "secondary": ["SQL", "NoSQL", "Performance Tuning"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Systems Administrator": {"primary": ["Linux", "Windows Server", "Networking", "Scripting"], "secondary": ["Active Directory", "Virtualization"], "soft": ["Problem-Solving", "Patience"]},
    "Network Engineer": {"primary": ["Cisco", "Juniper", "Routing", "Switching"], "secondary": ["Firewalls", "Network Security"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "QA Engineer": {"primary": ["Quality Assurance", "Test Planning", "Automation Testing"], "secondary": ["JIRA", "SQL", "Selenium"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "Game Developer": {"primary": ["C++", "C#", "Unity", "Unreal Engine"], "secondary": ["Game Design", "3D Modeling"], "soft": ["Creativity", "Problem-Solving"]},
    "Salesforce Developer": {"primary": ["Salesforce", "Apex", "Lightning Web Components"], "secondary": ["CRM", "SQL"], "soft": ["Problem-Solving", "Communication"]},
    "UX/UI Designer": {"primary": ["Figma", "UI/UX Design", "Wireframing", "Prototyping"], "secondary": ["User Research", "Sketch"], "soft": ["Empathy", "Creativity"]},
    "Product Manager (Tech)": {"primary": ["Agile", "Scrum", "Roadmap", "JIRA"], "secondary": ["Market Research", "Data Analysis"], "soft": ["Leadership", "Communication"]},
    "IT Support Specialist": {"primary": ["Hardware Troubleshooting", "Software Installation", "Customer Support"], "secondary": ["Active Directory", "Networking"], "soft": ["Patience", "Problem-Solving"]},
    "Mobile App Developer": {"primary": ["Swift", "Kotlin", "React Native"], "secondary": ["Git", "REST APIs"], "soft": ["Patience", "Attention to Detail"]},
    "Solutions Architect": {"primary": ["AWS", "Azure", "System Design", "Microservices"], "secondary": ["GCP", "Terraform"], "soft": ["Communication", "Leadership"]},
    "Blockchain Developer": {"primary": ["Solidity", "Ethereum", "Smart Contracts"], "secondary": ["JavaScript", "Cryptography"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Robotics Engineer": {"primary": ["Robotics", "C++", "Python", "ROS"], "secondary": ["MATLAB", "Computer Vision"], "soft": ["Problem-Solving", "Creativity"]},
    "Firmware Engineer": {"primary": ["C", "C++", "Embedded Systems", "Microcontrollers"], "secondary": ["RTOS", "Linux"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Site Reliability Engineer (SRE)": {"primary": ["Kubernetes", "Prometheus", "Go", "Python"], "secondary": ["AWS", "CI/CD"], "soft": ["Problem-Solving", "Adaptability"]},
    "Security Engineer": {"primary": ["Cryptography", "DevSecOps", "Cloud Security"], "secondary": ["Python", "Penetration Testing"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Quantum Computing Scientist": {"primary": ["Quantum Computing", "Qiskit", "Python", "Linear Algebra"], "secondary": ["Physics", "Machine Learning"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "AI Ethics Specialist": {"primary": ["AI Ethics", "Fairness", "Accountability", "Transparency"], "secondary": ["Machine Learning", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},

    # --- 50 Other Top Professions ---
    "Physician": {"primary": ["Medicine", "Diagnosis", "Patient Care", "Pharmacology"], "secondary": ["EHR", "Medical Ethics"], "soft": ["Empathy", "Communication"]},
    "Lawyer": {"primary": ["Litigation", "Contract Law", "Legal Research", "Negotiation"], "secondary": ["Corporate Law", "Torts"], "soft": ["Critical Thinking", "Communication"]},
    "Management Consultant": {"primary": ["Strategy", "Data Analysis", "Client Management"], "secondary": ["Financial Modeling", "Market Research"], "soft": ["Communication", "Problem-Solving"]},
    "Investment Banker": {"primary": ["Investment Banking", "Financial Modeling", "Mergers & Acquisitions"], "secondary": ["Valuation", "Excel"], "soft": ["Negotiation", "Time Management"]},
    "Marketing Manager": {"primary": ["Digital Marketing", "SEO", "Campaign Management"], "secondary": ["Google Analytics", "Content Marketing"], "soft": ["Leadership", "Creativity"]},
    "Architect": {"primary": ["AutoCAD", "Revit", "Architecture", "Building Codes"], "secondary": ["SketchUp", "Project Management"], "soft": ["Creativity", "Problem-Solving"]},
    "Mechanical Engineer": {"primary": ["SolidWorks", "AutoCAD", "Mechanical Engineering"], "secondary": ["MATLAB", "Thermodynamics"], "soft": ["Problem-Solving", "Critical Thinking"]},
    "Registered Nurse": {"primary": ["Nursing", "Patient Care", "EHR", "HIPAA"], "secondary": ["Medication Administration", "Wound Care"], "soft": ["Empathy", "Patience"]},
    "Accountant": {"primary": ["Accounting", "QuickBooks", "Excel", "Auditing"], "secondary": ["Tax Preparation", "Financial Reporting"], "soft": ["Attention to Detail", "Integrity"]},
    "Financial Analyst": {"primary": ["Financial Modeling", "Excel", "Valuation"], "secondary": ["SQL", "Accounting"], "soft": ["Attention to Detail", "Critical Thinking"]},
    "HR Manager": {"primary": ["Human Resources (HR)", "Employee Relations", "Recruiting"], "secondary": ["Labor Law", "Onboarding"], "soft": ["Empathy", "Leadership"]},
    "Project Manager (Non-Tech)": {"primary": ["Project Management", "Budgeting", "Risk Management"], "secondary": ["Scrum", "Gantt Charts"], "soft": ["Leadership", "Time Management"]},
    "Civil Engineer": {"primary": ["Civil Engineering", "AutoCAD", "Structural Engineering"], "secondary": ["Revit", "Construction Management"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Electrical Engineer": {"primary": ["Circuit Design", "MATLAB", "Electrical Engineering"], "secondary": ["AutoCAD", "Power Systems"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Pharmacist": {"primary": ["Pharmacology", "Medication Dispensing", "Patient Counseling"], "secondary": ["Pharmacy Law", "Chemistry"], "soft": ["Attention to Detail", "Communication"]},
    "Dentist": {"primary": ["Dentistry", "Oral Surgery", "X-Rays"], "secondary": ["Patient Care", "Orthodontics"], "soft": ["Attention to Detail", "Patience"]},
    "Veterinarian": {"primary": ["Veterinary Medicine", "Animal Surgery", "Animal Husbandry"], "secondary": ["Pharmacology", "Radiology"], "soft": ["Empathy", "Communication"]},
    "Physical Therapist": {"primary": ["Physical Therapy", "Rehabilitation", "Anatomy"], "secondary": ["Patient Care", "EHR"], "soft": ["Empathy", "Patience"]},
    "Sales Manager": {"primary": ["Sales", "CRM", "Salesforce", "Leadership"], "secondary": ["Negotiation", "Business Development"], "soft": ["Communication", "Leadership"]},
    "Supply Chain Manager": {"primary": ["Supply Chain Management", "Logistics", "Procurement"], "secondary": ["SAP", "Inventory Management"], "soft": ["Problem-Solving", "Negotiation"]},
    "Operations Manager": {"primary": ["Operations Management", "Process Improvement", "Budgeting"], "secondary": ["Lean Manufacturing", "Project Management"], "soft": ["Leadership", "Problem-Solving"]},
    "Chef": {"primary": ["Culinary Arts", "Menu Development", "Food Safety"], "secondary": ["Kitchen Management", "Cost Control"], "soft": ["Creativity", "Time Management"]},
    "Pilot": {"primary": ["Piloting", "Flight Planning", "Navigation"], "secondary": ["FAA Regulations", "Meteorology"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Graphic Designer": {"primary": ["Adobe Creative Suite", "Illustration", "Typography"], "secondary": ["Branding", "Figma"], "soft": ["Creativity", "Attention to Detail"]},
    "Journalist": {"primary": ["Journalism", "Interviewing", "Copywriting"], "secondary": ["Fact-Checking", "SEO"], "soft": ["Communication", "Critical Thinking"]},
    "Public Relations Specialist": {"primary": ["Public Relations", "Media Outreach", "Press Releases"], "secondary": ["Crisis Management", "Social Media Marketing"], "soft": ["Communication", "Adaptability"]},
    "Construction Manager": {"primary": ["Construction Management", "Project Management", "OSHA"], "secondary": ["Budgeting", "Blueprint Reading"], "soft": ["Leadership", "Problem-Solving"]},
    "Real Estate Agent": {"primary": ["Real Estate", "Sales", "Negotiation"], "secondary": ["Property Law", "CRM"], "soft": ["Communication", "Patience"]},
    "Teacher": {"primary": ["Teaching", "Curriculum Development", "Classroom Management"], "secondary": ["Subject Matter Expertise", "Pedagogy"], "soft": ["Patience", "Communication"]},
    "Professor": {"primary": ["Higher Education", "Scientific Research", "Grant Writing"], "secondary": ["Teaching", "Publishing"], "soft": ["Communication", "Critical Thinking"]},
    "Librarian": {"primary": ["Library Science", "Cataloging", "Research Assistance"], "secondary": ["Database Management", "Archiving"], "soft": ["Patience", "Attention to Detail"]},
    "Social Worker": {"primary": ["Social Work", "Case Management", "Counseling"], "secondary": ["Crisis Intervention", "Community Resources"], "soft": ["Empathy", "Patience"]},
    "Psychologist": {"primary": ["Psychology", "Therapy", "Diagnosis"], "secondary": ["Cognitive Behavioral Therapy (CBT)", "Patient Care"], "soft": ["Empathy", "Communication"]},
    "Actuary": {"primary": ["Actuarial Science", "Statistics", "Risk Management"], "secondary": ["Financial Modeling", "SQL"], "soft": ["Critical Thinking", "Problem-Solving"]},
    "Statistician": {"primary": ["Statistics", "Data Analysis", "R", "SAS"], "secondary": ["Python", "Machine Learning"], "soft": ["Critical Thinking", "Attention to Detail"]},
    "Economist": {"primary": ["Economics", "Econometrics", "Data Analysis"], "secondary": ["Market Research", "Public Policy"], "soft": ["Critical Thinking", "Communication"]},
    "Urban Planner": {"primary": ["Urban Planning", "GIS", "Zoning Laws"], "secondary": ["Community Development", "Data Analysis"], "soft": ["Critical Thinking", "Communication"]},
    "Geologist": {"primary": ["Geology", "Field Mapping", "GIS"], "secondary": ["Petrology", "Seismology"], "soft": ["Problem-Solving", "Adaptability"]},
    "Biologist": {"primary": ["Biology", "Genetics", "Lab Techniques"], "secondary": ["Microscopy", "Bioinformatics"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Chemist": {"primary": ["Chemistry", "Spectroscopy", "Chromatography"], "secondary": ["Lab Techniques", "Organic Chemistry"], "soft": ["Attention to Detail", "Problem-Solving"]},
    "Event Planner": {"primary": ["Event Planning", "Budgeting", "Vendor Management"], "secondary": ["Negotiation", "Marketing"], "soft": ["Time Management", "Adaptability"]},
    "Paralegal": {"primary": ["Paralegal", "Legal Research", "Document Drafting"], "secondary": ["Westlaw", "Litigation Support"], "soft": ["Attention to Detail", "Time Management"]},
    "Translator": {"primary": ["Bilingual", "Translation", "Proofreading"], "secondary": ["Localization", "Cultural Awareness"], "soft": ["Attention to Detail", "Communication"]},
    "Fitness Trainer": {"primary": ["Personal Training", "Anatomy", "Kinesiology"], "secondary": ["Nutrition", "Client Management"], "soft": ["Communication", "Empathy"]},
    "Firefighter": {"primary": ["Firefighting", "Emergency Medical Services (EMS)", "Hazardous Materials"], "secondary": ["Rescue Operations", "CPR"], "soft": ["Teamwork", "Problem-Solving"]},
    "Police Officer": {"primary": ["Law Enforcement", "Criminal Law", "Patrol Procedures"], "secondary": ["Investigation", "Self-Defense"], "soft": ["Communication", "Critical Thinking"]},
    "Electrician": {"primary": ["Electrical Wiring", "National Electrical Code (NEC)", "Blueprint Reading"], "secondary": ["Troubleshooting", "Safety Procedures"], "soft": ["Problem-Solving", "Attention to Detail"]},
    "Plumber": {"primary": ["Plumbing", "Pipefitting", "Blueprint Reading"], "secondary": ["Drain Cleaning", "HVAC"], "soft": ["Problem-Solving", "Patience"]},
    "Welder": {"primary": ["Welding", "MIG", "TIG", "Blueprint Reading"], "secondary": ["Fabrication", "Grinding"], "soft": ["Attention to Detail", "Patience"]},
    "Artist": {"primary": ["Painting", "Drawing", "Sculpture"], "secondary": ["Art History", "Color Theory"], "soft": ["Creativity", "Patience"]},
}

def generate_applicant_profile(job_role):
    """
    Generates a single profile using weighted skills for a much cleaner signal.
    """
    profile = {skill: 0.0 for skill in ALL_SKILLS}
    
    if job_role not in JOB_ROLE_SKILLS:
        return None
        
    skills_for_role = JOB_ROLE_SKILLS[job_role]

    # 1. Add primary skills with a high weight + some random variation
    for skill in skills_for_role["primary"]:
        if skill in profile:
            profile[skill] = round(random.uniform(0.8, 1.0), 2)

    # 2. Add secondary skills with a medium weight
    if skills_for_role.get("secondary"):
        for skill in skills_for_role["secondary"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.4, 0.7), 2)
    
    # 3. Add soft skills with a low, consistent weight
    if skills_for_role.get("soft"):
        for skill in skills_for_role["soft"]:
            if skill in profile:
                profile[skill] = round(random.uniform(0.1, 0.3), 2)

    profile["Job_Role"] = job_role
    return profile

def create_dataset(num_rows=50000):
    """
    Creates the full dataset with the specified number of rows.
    """
    print(f"Generating dataset with {num_rows} rows and {len(JOB_ROLE_SKILLS)} unique job roles...")
    dataset = []
    job_roles = list(JOB_ROLE_SKILLS.keys())

    for i in range(num_rows):
        if (i + 1) % 5000 == 0:
            print(f"  ...generated {i+1} rows")
        
        random_role = random.choice(job_roles)
        applicant_profile = generate_applicant_profile(random_role)
        if applicant_profile:
            dataset.append(applicant_profile)
    
    print("Dataset generation complete.")
    return pd.DataFrame(dataset)

if __name__ == "__main__":
    df = create_dataset(num_rows=50000)

    final_cols = [col for col in ALL_SKILLS if col in df.columns] + ['Job_Role']
    df = df[final_cols]
    
    output_filename = 'job_skills_80_roles_weighted.csv'
    df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully created '{output_filename}' with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Total unique job roles: {df['Job_Role'].nunique()}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())


Generating dataset with 50000 rows and 80 unique job roles...
  ...generated 5000 rows
  ...generated 10000 rows
  ...generated 15000 rows
  ...generated 20000 rows
  ...generated 25000 rows
  ...generated 30000 rows
  ...generated 35000 rows
  ...generated 40000 rows
  ...generated 45000 rows
  ...generated 50000 rows
Dataset generation complete.

Successfully created 'job_skills_80_roles_weighted.csv' with 50000 rows and 141 columns.
Total unique job roles: 80

First 5 rows of the dataset:
   Python  Java  JavaScript   C#   Go  TypeScript   SQL  NoSQL  PostgreSQL  \
0     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
1     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
2     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
3     0.0   0.0         0.0  0.0  0.0         0.0  0.00    0.0         0.0   
4     0.0   0.0         1.0  0.0  0.0         0.0  0.88    0.0         0.0   

   MongoDB  ...  Problem-Solving  