# In [1]:
import pandas as pd
from faker import Faker
import random

# Shared Faker instance used below to produce the synthetic names, dates,
# e-mail addresses, phone numbers and postal addresses. No seed is set in
# this cell, so output presumably differs from run to run.
fake = Faker()

# Role family -> the concrete job titles (one per seniority level) that belong
# to it. Each title is the seniority level followed by the lower-cased family.
roles_mapping = {
    family: [f'{level} {family.lower()}' for level in ('junior', 'senior')]
    for family in (
        'Data Scientist',
        'Data Engineer',
        'Software Engineer',
        'Data Analyst',
    )
}

# Job title -> skills that may be assigned to an employee holding that title.
# Keys are the titles listed in roles_mapping.
skills_mapping = {
    # Data science
    'junior data scientist': [
        'Python',
        'SQL',
        'Data Visualization',
    ],
    'senior data scientist': [
        'Machine Learning',
        'TensorFlow',
        'Natural Language Processing',
    ],
    # Data engineering
    'junior data engineer': [
        'Python',
        'ETL',
        'Data Warehousing',
    ],
    'senior data engineer': [
        'Hadoop',
        'Spark',
        'Data Lakes',
    ],
    # Software engineering
    'junior software engineer': [
        'JavaScript',
        'HTML',
        'CSS',
    ],
    'senior software engineer': [
        'Microservices',
        'DevOps',
        'Cloud Computing',
    ],
    # Data analysis
    'junior data analyst': [
        'Excel',
        'Tableau',
        'SQL',
    ],
    'senior data analyst': [
        'Statistics',
        'Predictive Analytics',
        'Data Modeling',
    ],
}

# Pick a concrete job title at random
def get_random_role():
    """Return a randomly chosen job title from ``roles_mapping``.

    A role family is drawn first, then one of the titles listed for that
    family, so the result is a value such as ``'senior data engineer'``.
    """
    families = list(roles_mapping)
    titles = roles_mapping[random.choice(families)]
    return random.choice(titles)

# Pick one skill that matches a job title
def get_random_skill(role):
    """Return a single random skill listed for ``role`` in ``skills_mapping``.

    Args:
        role: A job title that is a key of ``skills_mapping``.

    Raises:
        KeyError: If ``role`` is not a key of ``skills_mapping``.
    """
    return random.choice(skills_mapping[role])

# Build the table of candidate employee IDs
def create_sample_employee_df(num_records):
    """Return a one-column DataFrame of sequential employee IDs.

    Args:
        num_records: Number of IDs to generate.

    Returns:
        A DataFrame whose ``employee_id`` column holds 1..num_records.
    """
    employee_ids = range(1, num_records + 1)
    frame = pd.DataFrame({'employee_id': employee_ids})
    return frame

# Module-level pool of candidate employee IDs (1..500).
# generate_employee_data reads its employee_id values from this frame.
employee_df = create_sample_employee_df(500)

# Earliest joining date that is consistent with a given date of birth
def _eighteenth_birthday(date_of_birth):
    """Return the date on which someone born on ``date_of_birth`` turns 18."""
    try:
        return date_of_birth.replace(year=date_of_birth.year + 18)
    except ValueError:
        # Born on Feb 29: the target year is never a leap year, so use Feb 28.
        return date_of_birth.replace(year=date_of_birth.year + 18, day=28)


# Function to generate employee data
def generate_employee_data(num_records):
    """Generate synthetic employee records.

    Each record is a dict with the keys ``employee_id``, ``employee_name``,
    ``current_role``, ``skills``, ``dateofbirth``, ``dateofjoining``,
    ``email``, ``mobile_number`` and ``Address``.

    Employee IDs are taken from the module-level ``employee_df``. As long as
    ``num_records`` does not exceed the number of available IDs, every record
    gets a distinct ID; otherwise IDs are drawn with replacement, because
    uniqueness is impossible in that case.

    The joining date is never earlier than the employee's 18th birthday, so
    it stays consistent with the ``minimum_age=18`` used for the birth date.

    Args:
        num_records: Number of records to generate. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``num_records`` employee dicts.
    """
    id_pool = employee_df['employee_id'].tolist()
    if num_records <= len(id_pool):
        # Sample without replacement so employee_id is unique per record.
        employee_ids = random.sample(id_pool, max(num_records, 0))
    else:
        # Not enough IDs for uniqueness: fall back to drawing with replacement.
        employee_ids = random.choices(id_pool, k=num_records)

    employee_data = []

    for employee_id in employee_ids:
        role = get_random_role()
        skill = get_random_skill(role)

        date_of_birth = fake.date_of_birth(minimum_age=18, maximum_age=70)
        date_of_joining = fake.date_between(start_date='-20y', end_date='today')
        # A 20-year window can predate adulthood (or even birth) for younger
        # employees; redraw within the valid range when that happens.
        earliest_joining = _eighteenth_birthday(date_of_birth)
        if date_of_joining < earliest_joining:
            date_of_joining = fake.date_between(
                start_date=earliest_joining, end_date='today'
            )

        employee_data.append({
            'employee_id': employee_id,
            'employee_name': fake.name(),
            'current_role': role,
            'skills': skill,  # Assign only one skill
            'dateofbirth': date_of_birth,
            'dateofjoining': date_of_joining,
            'email': fake.email(),
            'mobile_number': fake.phone_number(),
            # Faker addresses are multi-line; flatten them to one CSV-friendly line.
            'Address': fake.address().replace('\n', ', '),
        })

    return employee_data

# Generate 500 records, tabulate them and persist the table as CSV
# (the DataFrame index is left out of the file).
employee_data = generate_employee_data(500)
df = pd.DataFrame(data=employee_data)
df.to_csv(path_or_buf='employee_data.csv', index=False)

print('Employee dataset generated and saved to employee_data.csv')


# Output: Employee dataset generated and saved to employee_data.csv
