# In [1]:
import pandas as pd
from faker import Faker
import random

# Shared Faker instance used by generate_employees() and
# generate_employee_certificates() below, including its `unique` proxy.
fake = Faker()

# Number of synthetic employee records that main() asks for.
NUM_EMPLOYEES = 10000

# Values sampled with random.choice() for each generated employee.
DEPARTMENTS = ['Engineering']
ROLES = ['Developer', 'Manager', 'Analyst', 'Designer']

# Skill catalogue. generate_skills() numbers entries by 1-based position,
# so reordering this list changes the skill_id values.
SKILLS = [
    'Python',
    'Communication',
    'Project Management',
    'Data Analysis',
    'Leadership',
    'ReactJS',
    'NodeJS',
    'MongoDB',
    'UI/UX Design',
    'DevOps',
    'Machine Learning',
    'Artificial Intelligence',
    'Cloud Computing',
    'Cybersecurity',
    'Agile Methodology',
    'Software Testing',
    'SQL',
    'Networking',
    'Mobile App Development',
    'Technical Writing',
]

# Certification catalogue. generate_certifications() numbers entries by
# 1-based position, so reordering changes the certification_id values.
CERTIFICATIONS = [
    'AWS Certified',
    'PMP',
    'Google Analytics',
    'Scrum Master',
    'Cisco Certified',
    'Microsoft Azure Fundamentals',
    'Oracle Certified Professional',
    'Certified Ethical Hacker (CEH)',
    'Google Cloud Certified',
    'CompTIA Security+',
    'Certified Kubernetes Administrator (CKA)',
    'Certified Business Analysis Professional (CBAP)',
    'Certified Information Systems Security Professional (CISSP)',
    'Red Hat Certified Engineer (RHCE)',
    'Certified Scrum Product Owner (CSPO)',
    'ITIL Foundation Certification',
    'Six Sigma Green Belt',
    'Salesforce Certified Administrator',
    'Adobe Certified Expert',
    'Tableau Desktop Specialist',
]

# Course catalogue keyed by department, then by role. generate_courses()
# flattens it in insertion order to assign sequential course_id values.
COURSES = {
    'Engineering': {
        'Developer': [
            'Advanced Python Programming',
            'ReactJS Mastery',
            'Cloud Computing Essentials',
        ],
        'Manager': [
            'Agile Project Management',
            'Leadership in Tech',
            'Cloud Strategy for Managers',
        ],
        'Analyst': [
            'Data Science Fundamentals',
            'SQL for Data Analysis',
            'Python for Data Analysis',
        ],
        'Designer': [
            'UI/UX Design Principles',
            'Advanced UI/UX Techniques',
            'Adobe XD Mastery',
        ],
    },
}


def generate_employees(num):
    """Build a DataFrame of ``num`` synthetic employees.

    Args:
        num: Number of employee rows to generate. ``0`` yields an empty
            frame that still carries the expected columns.

    Returns:
        A DataFrame with columns ``employee_id``, ``name``, ``department``,
        ``role`` and ``email``.

    Note:
        ``employee_id`` is drawn through ``fake.unique`` from the range
        1000..99999, so the number of distinct IDs available is bounded by
        that range; requesting more employees than the range can supply
        cannot succeed.
    """
    columns = ['employee_id', 'name', 'department', 'role', 'email']
    # The dict literal is evaluated top to bottom, so the sequence of
    # random draws per employee is fixed: id, name, department, role, email.
    employees = [
        {
            'employee_id': fake.unique.random_int(min=1000, max=99999),
            'name': fake.name(),
            'department': random.choice(DEPARTMENTS),
            'role': random.choice(ROLES),
            'email': fake.unique.email(),
        }
        for _ in range(num)
    ]
    # Explicit columns keep the schema (and the CSV header) when num == 0.
    return pd.DataFrame(employees, columns=columns)


def generate_skills():
    """Return the skill lookup table.

    Each entry of ``SKILLS`` becomes one row whose ``skill_id`` is its
    1-based position in the list and whose ``skill_name`` is the entry.
    """
    rows = [
        {'skill_id': position, 'skill_name': name}
        for position, name in enumerate(SKILLS, start=1)
    ]
    return pd.DataFrame(rows)


def generate_certifications():
    """Return the certification lookup table.

    Each entry of ``CERTIFICATIONS`` becomes one row whose
    ``certification_id`` is its 1-based position in the list and whose
    ``certification_name`` is the entry.
    """
    return pd.DataFrame([
        {'certification_id': number, 'certification_name': title}
        for number, title in enumerate(CERTIFICATIONS, start=1)
    ])


def generate_courses():
    """Flatten the nested ``COURSES`` mapping into a course table.

    Returns a DataFrame with one row per course and the columns
    ``course_id``, ``department``, ``role`` and ``course_name``. IDs start
    at 1 and follow the iteration order of ``COURSES`` (department, then
    role, then course).
    """
    # Walk department -> role -> course lazily, in dictionary order.
    flattened = (
        (dept_name, role_name, title)
        for dept_name, role_map in COURSES.items()
        for role_name, titles in role_map.items()
        for title in titles
    )
    records = [
        {
            'course_id': number,
            'department': dept_name,
            'role': role_name,
            'course_name': title,
        }
        for number, (dept_name, role_name, title) in enumerate(flattened, start=1)
    ]
    return pd.DataFrame(records)


def generate_employee_skills(employees_df):
    """Assign 1 to 3 random skills to every employee.

    Args:
        employees_df: DataFrame with an ``employee_id`` column. An empty
            frame (with or without columns) is accepted.

    Returns:
        A DataFrame with columns ``employee_id``, ``skill_id``,
        ``skill_level`` and ``skill_progress``. ``skill_id`` values are
        1-based positions in ``SKILLS``; ``skill_level`` is one of
        'Beginner', 'Intermediate', 'Advanced'; ``skill_progress`` is an
        integer from 0 to 100. The columns are present even when no rows
        are produced.
    """
    columns = ['employee_id', 'skill_id', 'skill_level', 'skill_progress']
    if employees_df.empty:
        # A frame built from an empty list has no 'employee_id' column to
        # index, so return the empty schema directly.
        return pd.DataFrame(columns=columns)

    skill_ids = range(1, len(SKILLS) + 1)
    levels = ['Beginner', 'Intermediate', 'Advanced']

    employee_skills = []
    # Iterate the single column that is needed instead of iterrows(), which
    # builds a Series per row and may upcast integer IDs.
    for employee_id in employees_df['employee_id']:
        for skill_id in random.sample(skill_ids, k=random.randint(1, 3)):
            employee_skills.append({
                'employee_id': employee_id,
                'skill_id': skill_id,
                'skill_level': random.choice(levels),
                'skill_progress': random.randint(0, 100),
            })
    return pd.DataFrame(employee_skills, columns=columns)


def generate_employee_certificates(employees_df):
    """Assign 0 to 5 random certifications to every employee.

    Args:
        employees_df: DataFrame with an ``employee_id`` column. An empty
            frame (with or without columns) is accepted.

    Returns:
        A DataFrame with columns ``employee_certificate_id``,
        ``employee_id`` and ``certification_id``. ``certification_id``
        values are 1-based positions in ``CERTIFICATIONS``. The columns are
        present even when no rows are produced, which can happen because an
        employee may receive zero certificates.
    """
    columns = ['employee_certificate_id', 'employee_id', 'certification_id']
    if employees_df.empty:
        # A frame built from an empty list has no 'employee_id' column to
        # index, so return the empty schema directly.
        return pd.DataFrame(columns=columns)

    certification_ids = range(1, len(CERTIFICATIONS) + 1)

    employee_certificates = []
    # Iterate the single column that is needed instead of iterrows(), which
    # builds a Series per row and may upcast integer IDs.
    for employee_id in employees_df['employee_id']:
        for cert_id in random.sample(certification_ids, k=random.randint(0, 5)):
            employee_certificates.append({
                # NOTE(review): generate_employees() calls the same
                # fake.unique.random_int(min=1000, max=99999); Faker appears
                # to track uniqueness per method and arguments, so both ID
                # kinds would draw from one shared pool - confirm the pool
                # is large enough before raising NUM_EMPLOYEES.
                'employee_certificate_id': fake.unique.random_int(min=1000, max=99999),
                'employee_id': employee_id,
                'certification_id': cert_id,
            })
    return pd.DataFrame(employee_certificates, columns=columns)


def generate_employee_courses(employees_df, courses_df):
    """Enrol every employee in up to 3 courses matching their department and role.

    Args:
        employees_df: DataFrame with ``employee_id``, ``department`` and
            ``role`` columns. An empty frame (with or without columns) is
            accepted.
        courses_df: Course catalogue with ``course_id``, ``department`` and
            ``role`` columns, e.g. the output of ``generate_courses()``.

    Returns:
        A DataFrame with columns ``employee_id`` and ``course_id``. Each
        employee whose (department, role) pair has courses in
        ``courses_df`` gets between 1 and 3 distinct courses, limited to
        the number actually available for that pair. Employees without a
        matching course are skipped. The columns are present even when no
        rows are produced.
    """
    columns = ['employee_id', 'course_id']
    if employees_df.empty:
        # A frame built from an empty list has no columns to index.
        return pd.DataFrame(columns=columns)

    # Build the (department, role) -> [course_id, ...] lookup once instead
    # of re-filtering courses_df with a boolean mask for every employee.
    # Catalogue order is preserved within each pair.
    eligible = {}
    if not courses_df.empty:
        catalogue = zip(
            courses_df['department'], courses_df['role'], courses_df['course_id']
        )
        for department, role, course_id in catalogue:
            eligible.setdefault((department, role), []).append(course_id)

    employee_courses = []
    staff = zip(
        employees_df['employee_id'], employees_df['department'], employees_df['role']
    )
    for employee_id, department, role in staff:
        available = eligible.get((department, role))
        if not available:
            continue
        # Clamp so random.sample() never asks for more courses than exist.
        wanted = min(random.randint(1, 3), len(available))
        for course_id in random.sample(available, k=wanted):
            employee_courses.append({
                'employee_id': employee_id,
                'course_id': course_id,
            })
    return pd.DataFrame(employee_courses, columns=columns)


def main():
    """Generate every synthetic table and write each one to ``staging/<name>.csv``.

    The ``staging`` directory is created if it does not exist yet. Files
    are written without the DataFrame index.
    """
    import os

    employees_df = generate_employees(NUM_EMPLOYEES)
    courses_df = generate_courses()

    # Keyed by output file stem; the relationship tables are derived from
    # the employee and course frames generated above.
    tables = {
        'employees': employees_df,
        'skills': generate_skills(),
        'certifications': generate_certifications(),
        'courses': courses_df,
        'employee_skills': generate_employee_skills(employees_df),
        'employee_certificates': generate_employee_certificates(employees_df),
        'employee_courses': generate_employee_courses(employees_df, courses_df),
    }

    output_dir = 'staging'
    # to_csv() does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    for name, frame in tables.items():
        frame.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)


# Generate and write the CSV files only when this file is run as a script.
if __name__ == "__main__":
    main()
