# HR Data Generator Library Demo

This notebook demonstrates the basic functionality of the `hr_data_generator` library.

## 1. Installation & Import

First, ensure the library is installed (run this from the project's root directory):
```bash
pip install -e .
```

In [None]:
from hr_data_generator import generate_hr_data, HRDataGenerator
import pandas as pd

# Display settings: show every column of wide tables and let pandas
# choose the display width instead of using a fixed value.
pd.options.display.max_columns = None
pd.options.display.width = None

## 2. Basic Usage

Generate a complete HR dataset with 100 employees using a seed for reproducibility.

In [None]:
# Build the HR dataset for 100 employees; the fixed seed keeps the
# result reproducible between runs.
data = generate_hr_data(n_employees=100, seed=42)

# List each generated table together with its dimensions.
print("Generated tables:")
for table_name, table in data.items():
    n_rows, n_cols = table.shape
    print(f"  {table_name}: {n_rows} rows, {n_cols} columns")

## 3. Explore the Employee Table (Hub)

The `employee` table is the central hub with one row per person.

In [None]:
# Pull the hub table out of the generated dataset, report its size and
# column names, then preview the first ten rows.
employees = data['employee']
employee_columns = employees.columns.tolist()
print(f"Employee table: {employees.shape[0]} rows\n")
print("Columns:", employee_columns)
employees.head(10)

In [None]:
# Locate the CEO: the employee row(s) whose manager_id is missing.
has_no_manager = employees['manager_id'].isna()
ceo = employees.loc[has_no_manager]
print("CEO (no manager_id):")
ceo

In [None]:
# Demographics summary: head-counts per gender and per employment type.
# Each heading and its counts are printed on separate lines via sep="\n".
print("Gender distribution:", employees['gender'].value_counts(), sep="\n")
print("\nEmployment type distribution:", employees['employment_type'].value_counts(), sep="\n")

## 4. Job Assignments (Time-Variant)

The `employee_job_assignment` table tracks job history including promotions.

In [None]:
# Load the job-assignment history and compare its size with the head-count.
job_assignments = data['employee_job_assignment']
assignment_count = len(job_assignments)
employee_count = len(employees)
print(f"Job assignments: {assignment_count} records for {employee_count} employees")
print(f"Average assignments per employee: {assignment_count / employee_count:.2f}\n")
job_assignments.head(10)

In [None]:
# Show an employee with multiple job assignments (promotion history).
# Count assignment rows per employee, then keep the IDs that occur more than once.
emp_job_counts = job_assignments.groupby('employee_id').size()
multi_assignment_ids = emp_job_counts[emp_job_counts > 1].index
promoted_emp = multi_assignment_ids[0] if len(multi_assignment_ids) > 0 else None

# Compare against None explicitly: a valid but falsy ID (e.g. 0 or "")
# must not be mistaken for "no such employee".
if promoted_emp is not None:
    print(f"Career history for {promoted_emp}:")
    display(job_assignments[job_assignments['employee_id'] == promoted_emp].sort_values('start_date'))
else:
    print("No employees with promotions in this dataset")

In [None]:
# Job level distribution, restricted to assignments whose end_date is
# missing (these rows are treated as the current job).
is_open_assignment = job_assignments['end_date'].isna()
current_jobs = job_assignments.loc[is_open_assignment]
print("Current job level distribution:", current_jobs['job_level'].value_counts(), sep="\n")
print("\nSeniority level distribution:", current_jobs['seniority_level'].value_counts().sort_index(), sep="\n")

## 5. Organization Assignments

In [None]:
# Load the organization-assignment table, report its size and preview it.
org_assignments = data['employee_org_assignment']
print(f"Org assignments: {org_assignments.shape[0]} records\n")
org_assignments.head(10)

In [None]:
# Business unit distribution, counting only org assignments whose
# end_date is missing.
has_no_end_date = org_assignments['end_date'].isna()
current_orgs = org_assignments.loc[has_no_end_date]
print("Current business unit distribution:", current_orgs['business_unit'].value_counts(), sep="\n")

## 6. Compensation Records

In [None]:
# Load the compensation table, report its size and preview it.
compensation = data['employee_compensation']
print(f"Compensation records: {compensation.shape[0]} rows\n")
compensation.head(10)

In [None]:
# Current salary statistics: describe base_salary over the records whose
# end_date is missing. Change reasons are counted over the full table.
is_open_record = compensation['end_date'].isna()
current_comp = compensation.loc[is_open_record]
print("Current salary statistics:", current_comp['base_salary'].describe(), sep="\n")
print("\nChange reasons:", compensation['change_reason'].value_counts(), sep="\n")

In [None]:
# Salary history for one employee: prefer someone with more than two
# compensation records, otherwise fall back to the first employee listed.
emp_with_raises = compensation.groupby('employee_id').size()
multi_record_ids = emp_with_raises[emp_with_raises > 2].index
if len(multi_record_ids) > 0:
    emp_id = multi_record_ids[0]
else:
    emp_id = compensation['employee_id'].iloc[0]

print(f"Salary history for {emp_id}:")
compensation[compensation['employee_id'] == emp_id].sort_values('start_date')

## 7. Performance Reviews

In [None]:
# Load the performance-review table, report its size and preview it.
performance = data['employee_performance']
print(f"Performance reviews: {performance.shape[0]} records\n")
performance.head(10)

In [None]:
# Rating distribution: count reviews per (rating, rating_label) pair and
# add each pair's share of all reviews as a percentage rounded to 0.1.
print("Performance rating distribution:")
rating_counts = performance.groupby(['rating', 'rating_label']).size().reset_index(name='count')
rating_dist = rating_counts.assign(
    percentage=(rating_counts['count'] / rating_counts['count'].sum() * 100).round(1)
)
rating_dist

## 8. Reference Tables

In [None]:
# Organization hierarchy reference table: report its size and preview it.
org_units = data['organization_unit']
print(f"Organization units: {org_units.shape[0]} departments\n")
org_units.head(10)

In [None]:
# Job roles reference table: report its size, then cross-tabulate the
# number of roles per job family (rows) and job level (columns).
job_roles = data['job_role']
print(f"Job roles: {job_roles.shape[0]} roles\n")
print("Jobs by family and level:")
roles_per_family_level = job_roles.groupby(['job_family', 'job_level']).size()
roles_per_family_level.unstack(fill_value=0)

In [None]:
# Locations reference table: report its size and the number of offices per country.
locations = data['location']
print(f"Locations: {locations.shape[0]} offices\n")
print("Locations by country:", locations['country'].value_counts(), sep="\n")

## 9. Referential Integrity Check

Verify that all foreign keys reference valid primary keys.

In [None]:
# Verify referential integrity: every foreign-key value must appear among
# the primary keys of the table it points to.
emp_ids = set(employees['employee_id'])
job_ids = set(job_roles['job_id'])
org_ids = set(org_units['org_id'])

# One entry per relationship: (label, foreign-key values, valid primary keys).
# manager_id is allowed to be missing, so missing values are dropped first.
fk_specs = [
    ("manager_id -> employee_id", employees['manager_id'].dropna(), emp_ids),
    ("job_assignment.employee_id -> employee_id", job_assignments['employee_id'], emp_ids),
    ("job_assignment.job_id -> job_role.job_id", job_assignments['job_id'], job_ids),
    ("org_assignment.employee_id -> employee_id", org_assignments['employee_id'], emp_ids),
    ("org_assignment.org_id -> organization_unit.org_id", org_assignments['org_id'], org_ids),
    ("compensation.employee_id -> employee_id", compensation['employee_id'], emp_ids),
    ("performance.employee_id -> employee_id", performance['employee_id'], emp_ids),
]

# Evaluate each relationship into a (label, passed) pair.
checks = [
    (label, set(fk_values).issubset(valid_keys))
    for label, fk_values, valid_keys in fk_specs
]

print("Referential Integrity Checks:")
for label, is_valid in checks:
    outcome = "PASS" if is_valid else "FAIL"
    print(f"  [{outcome}] {label}")

## 10. Advanced Usage: Custom Date Ranges

In [None]:
# Generate data for a specific time period through the class-based API.
generator = HRDataGenerator(seed=123)
generation_options = dict(
    n_employees=50,
    start_date="2020-01-01",
    end_date="2024-12-31",
    include_performance=True,
    include_compensation=True,
)
custom_data = generator.generate(**generation_options)

# Summarize the result, including the distinct review years present.
custom_reviews = custom_data['employee_performance']
review_years = sorted(custom_reviews['review_period_year'].unique())
print("Custom date range generation:")
print(f"  Employees: {len(custom_data['employee'])}")
print(f"  Performance reviews: {len(custom_reviews)}")
print(f"  Review years: {review_years}")

## 11. Reproducibility Test

In [None]:
# Generate twice with the same seed and compare the resulting tables.
data1 = generate_hr_data(n_employees=20, seed=999)
data2 = generate_hr_data(n_employees=20, seed=999)

same_employees = data1['employee'].equals(data2['employee'])
same_job_assignments = data1['employee_job_assignment'].equals(data2['employee_job_assignment'])
print("Reproducibility test (same seed=999):")
print(f"  Employee tables identical: {same_employees}")
print(f"  Job assignments identical: {same_job_assignments}")

# Generate with two different seeds and compare the employee tables.
data3 = generate_hr_data(n_employees=20, seed=1)
data4 = generate_hr_data(n_employees=20, seed=2)

different_seed_match = data3['employee'].equals(data4['employee'])
print("\nDifferent seeds (1 vs 2):")
print(f"  Employee tables identical: {different_seed_match}")

## 12. Export to CSV (Optional)

In [None]:
# Optional export, disabled by default. Uncomment the code lines below to
# write every table in `data` to output/<table name>.csv (without the
# DataFrame index); the output directory is created if it does not exist.
# import os
# os.makedirs('output', exist_ok=True)
# for name, df in data.items():
#     df.to_csv(f'output/{name}.csv', index=False)
#     print(f"Exported {name}.csv")

---

## Summary

The `hr_data_generator` library provides:

- **8 interconnected tables** (5 employee tables and 3 reference tables, as shown above) with full referential integrity
- **Time-variant records** for job, org, and compensation history
- **Realistic manager hierarchy** with seniority constraints
- **Reproducible results** via seed parameter
- **Configurable** employee counts and date ranges