# In [24]:
import pandas as pd
import numpy as np
import random


# Seed both random sources so every run yields the same dataset: the stdlib
# `random` module drives the categorical columns, NumPy's global RNG drives
# the numeric ones.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)


# Number of synthetic employee records (rows) to generate.
num_records = 2000

# Allowed values for each categorical column.
age_groups = ['Young', 'Middle-aged', 'Senior']
attrition_status = ['Yes', 'No']
business_travel_options = ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel']
# NOTE: the 'Department' column is currently disabled (see the matching
# commented-out entry in `data` below).
#departments = ['Sales', 'Research & Development', 'Human Resources']
education_fields = ['Life Sciences', 'Other', 'Medical', 'Technical Degree']
genders = ['Male', 'Female']
job_roles = [
    'Clinical Trial Assistant',
    'Clinical Research Associate I',
    'Clinical Research Associate II',
    'Senior CRA',
    'Clinical Trial Manager',
    'Clinical Project Manager',
]
marital_statuses = ['Single', 'Married', 'Divorced']
salary_slabs = ['Low', 'Medium', 'High']
over_time_options = ['Yes', 'No']


def _pick(options):
    """Return `num_records` independent uniform draws from `options`."""
    return [random.choice(options) for _ in range(num_records)]


def _ints(low, high):
    """Return `num_records` uniform integers from the half-open range [low, high)."""
    return np.random.randint(low, high, num_records)


def _unique_ints(low, high):
    """Return `num_records` distinct integers from the half-open range [low, high).

    Raises ValueError (from NumPy) if `num_records` exceeds `high - low`,
    because that many distinct values do not exist in the range.
    """
    return np.random.choice(np.arange(low, high), size=num_records, replace=False)


# Generate synthetic data, one entry per output column. Dict order is the
# column order of the CSV.
# NOTE(review): 'AgeGroup' and 'SalarySlab' are sampled independently of 'Age'
# and 'MonthlyIncome', so a row's label need not match its numeric value.
data = {
    'EmpID': [f"E{1000+i}" for i in range(num_records)],
    'Age': _ints(18, 60),
    'AgeGroup': _pick(age_groups),
    'Attrition': _pick(attrition_status),
    'BusinessTravel': _pick(business_travel_options),
    'DailyRate': _ints(100, 1500),
    #'Department': _pick(departments),
    'DistanceFromHome': _ints(1, 30),
    'Education': _ints(1, 5),
    'EducationField': _pick(education_fields),
    'EmployeeCount': [1] * num_records,
    # Sampled WITHOUT replacement: drawing 2000 values with replacement from
    # ~9000 candidates practically guarantees duplicate employee numbers.
    'EmployeeNumber': _unique_ints(1000, 9999),
    'EnvironmentSatisfaction': _ints(1, 5),
    'Gender': _pick(genders),
    'HourlyRate': _ints(20, 100),
    'JobInvolvement': _ints(1, 5),
    'JobLevel': _ints(1, 6),
    'JobRole': _pick(job_roles),
    'JobSatisfaction': _ints(1, 5),
    'MaritalStatus': _pick(marital_statuses),
    'MonthlyIncome': _ints(2000, 20000),
    'SalarySlab': _pick(salary_slabs),
    'MonthlyRate': _ints(1000, 20000),
    'NumCompaniesWorked': _ints(0, 10),
    'Over18': ['Y'] * num_records,
    'OverTime': _pick(over_time_options),
    'PercentSalaryHike': _ints(10, 25),
    'PerformanceRating': _ints(1, 5),
    'RelationshipSatisfaction': _ints(1, 5),
    'StandardHours': [8] * num_records,
    'StockOptionLevel': _ints(0, 4),
    'TotalWorkingYears': _ints(0, 40),
    'TrainingTimesLastYear': _ints(0, 10),
    'WorkLifeBalance': _ints(1, 5),
    'YearsAtCompany': _ints(0, 30),
    'YearsInCurrentRole': _ints(0, 15),
    'YearsSinceLastPromotion': _ints(0, 10),
    'YearsWithCurrManager': _ints(0, 15),
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV (no index column; written relative to the working directory).
df.to_csv("synthetic_employee_database.csv", index=False)
