In [2]:
import pandas as pd
from faker import Faker
import random

# Generate a synthetic employee table and write it to 'synthetic_employees.csv'.
#
# Each row has a fake name, gender, department, salary, 0-2 spoken languages,
# country and a fake SSN. All randomness comes from the stdlib `random` module
# and from Faker, and both are seeded below so that Restart & Run All
# regenerates the same dataset.

# Seed for every random source used in this cell
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker
fake = Faker()

# Define the number of employees
num_employees = 10000

# Probability that an employee's country is drawn from `h1b_countries`
# instead of being set to 'USA'
h1b_share = 0.40

# Define the proportions of H1B petitions from different countries (based on 2019 data)
h1b_countries = {
    'India': 0.72,
    'China': 0.13,
    'Canada': 0.05,
    'South Korea': 0.04,
    'Philippines': 0.02,
    'Taiwan': 0.02,
    'Mexico': 0.02
}

# Define departments and their proportions
departments = {
    'Legal': 0.05,
    'Marketing': 0.10,
    'Administrative': 0.10,
    'Operations': 0.20,
    'Sales': 0.10,
    'Finance': 0.05,
    'I/T': 0.10,
    'Product': 0.20,
    'Human Resource': 0.10
}

# Define salaries for each department as inclusive (min, max) bounds
# (use realistic ranges from salary.com)
salary_ranges = {
    'Legal': (60000, 120000),
    'Marketing': (50000, 100000),
    'Administrative': (30000, 70000),
    'Operations': (40000, 90000),
    'Sales': (45000, 95000),
    'Finance': (50000, 110000),
    'I/T': (60000, 130000),
    'Product': (70000, 150000),
    'Human Resource': (40000, 80000)
}

# Gender balancing: both values are drawn with equal probability
genders = ['Male', 'Female']

# Languages an employee may additionally speak
language_pool = ['Spanish', 'Chinese', 'Hindi', 'Korean', 'Tagalog', 'French']

# Sanity checks on the configuration tables: the proportions are meant to be
# probabilities, and every department must have a salary range to draw from.
assert abs(sum(h1b_countries.values()) - 1.0) < 1e-9, "country proportions must sum to 1"
assert abs(sum(departments.values()) - 1.0) < 1e-9, "department proportions must sum to 1"
assert set(departments) == set(salary_ranges), "every department needs a salary range"

# Populations and weights for random.choices, built once instead of on every iteration
department_names = list(departments)
department_weights = list(departments.values())
country_names = list(h1b_countries)
country_weights = list(h1b_countries.values())

# Generate synthetic employee data
employees = []

for _ in range(num_employees):
    gender = random.choice(genders)
    department = random.choices(department_names, weights=department_weights)[0]
    # randint is inclusive on both ends of the department's range
    salary = random.randint(*salary_ranges[department])
    # Zero, one or two distinct languages; zero yields an empty string in the output column
    languages_spoken = random.sample(language_pool, k=random.randint(0, 2))
    # With probability h1b_share pick a weighted H1B country, otherwise 'USA'
    if random.random() < h1b_share:
        country = random.choices(country_names, weights=country_weights)[0]
    else:
        country = 'USA'

    employee = {
        'Name': fake.name(),
        'Gender': gender,
        'Department': department,
        'Salary': salary,
        'Languages_Spoken': ', '.join(languages_spoken),
        'Country': country,
        'Social_Security_Number': fake.ssn()
    }

    employees.append(employee)

# Convert to DataFrame
df = pd.DataFrame(employees)

# Save the DataFrame to a CSV file (relative to the kernel's working directory)
df.to_csv('synthetic_employees.csv', index=False)

# Display the first rows of the DataFrame
print(df.head())


                   Name  Gender  Department  Salary Languages_Spoken Country  \
0  Christopher Villegas  Female       Legal   72074   Tagalog, Hindi     USA   
1        Gail Hernandez  Female       Sales   50994          Tagalog   India   
2            Hannah Cox    Male  Operations   71465                    India   
3            Jacob Diaz  Female         I/T   63085   Hindi, Tagalog  Taiwan   
4     Travis Stephenson  Female  Operations   56602          Tagalog     USA   

  Social_Security_Number  
0            555-29-5793  
1            077-64-7466  
2            702-63-0636  
3            783-86-3601  
4            827-60-3532  


In [1]:
pip install faker

Collecting faker
  Downloading Faker-25.8.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-25.8.0
