### Setting up the environment

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# --- Plot appearance ---
# Apply the matplotlib whitegrid style first, then layer the seaborn
# palette and the "notebook" context (fonts scaled by 1.2) on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")
sns.set_context("notebook", font_scale=1.2)

# --- DataFrame display ---
# None removes the cap on displayed columns; width is the console line
# width (in characters) pandas may use before wrapping a frame.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# --- Reproducibility ---
# Seed NumPy's legacy global random generator.
np.random.seed(42)

### Loading the data

In [3]:
# Folder holding the raw CSV exports (kept as a string with a trailing
# slash so file names can be appended to it directly).
data_dir = '../data/raw/'


def _load_raw(filename, **read_csv_kwargs):
    """Read one CSV from ``data_dir`` and return it as a DataFrame.

    ``filename`` is the bare file name (with extension); any keyword
    arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(f'{data_dir}{filename}', **read_csv_kwargs)


# Employee master records (hire and exit dates parsed as datetimes)
employees_df = _load_raw('employees.csv', parse_dates=['HireDate', 'ExitDate'])

# Survey responses
surveys_df = _load_raw('surveys.csv', parse_dates=['SurveyDate'])

# Performance data (no date columns are parsed)
performance_df = _load_raw('performance.csv')

# Promotion history
promotions_df = _load_raw('promotions.csv', parse_dates=['PromotionDate'])

# Training records
training_df = _load_raw('training.csv', parse_dates=['CompletionDate'])

# Exit interviews
exit_interviews_df = _load_raw('exit_interviews.csv', parse_dates=['ExitDate', 'InterviewDate'])

# Recruitment cost data (no date columns are parsed)
recruitment_costs_df = _load_raw('recruitment_costs.csv')

### Initial data overview

In [4]:
# Report the employee frame's (rows, columns) shape, then leave its first
# five rows as the cell's last expression so they render as rich output.
employee_shape = employees_df.shape
print(f"Employee data shape: {employee_shape}")
employees_df.head(5)

Employee data shape: (3900, 25)


Unnamed: 0,EmployeeID,FirstName,LastName,Age,Gender,OfficeLocation,DistanceFromHome,Department,JobRole,JobLevel,Education,FieldOfStudy,HireDate,YearsAtCompany,YearsSinceLastPromotion,JobSatisfaction,EnvironmentSatisfaction,WorkLifeBalance,PerformanceRating,Overtime,MonthlyIncome,AnnualIncome,TrainingTimesLastYear,Attrition,ExitDate
0,10001,Tuti,Hidayat,25,Male,Jakarta,12.41,Sales,Sales Representative,Entry Level,Diploma,Economics,2025-01-14 04:30:59.134004,0.15,1,3.8,4,5,3,False,3.91,50.85,71,True,2025-01-26 04:30:59.134004
1,10002,Yanto,Purnama,46,Male,Jakarta,8.59,Sales,Branch Sales Supervisor,Senior,Bachelor's Degree,Other,2023-01-24 04:30:59.134004,2.13,2,1.5,3,4,3,False,43.28,562.7,17,True,2024-04-18 04:30:59.134004
2,10003,Fitri,Setiawan,29,Female,Semarang,1.73,HR,HR Assistant,Junior,Bachelor's Degree,Business Administration,2023-10-25 04:30:59.134004,1.38,0,3.0,3,4,3,False,11.29,146.77,22,False,NaT
3,10004,Dedi,Purnama,54,Male,Depok,15.93,Operations,Operations Supervisor,Senior,High School,High School,2022-01-15 04:30:59.134004,3.15,3,3.2,4,3,4,False,55.51,721.63,47,True,2024-12-22 04:30:59.134004
4,10005,Hadi,Santoso,49,Female,Medan,1.77,Operations,Loan Processor,Entry Level,Bachelor's Degree,Business Administration,2022-03-06 04:30:59.134004,3.01,2,3.5,4,4,2,False,8.97,116.56,76,True,2023-09-15 04:30:59.134004


In [5]:
# Headcount summary, split on the Attrition flag (used here as a boolean
# mask: True rows are counted as former employees, False as current).
# NOTE(review): the recorded output shows an attrition rate of about 84% —
# confirm the dataset is intentionally weighted towards leavers.
print("Basic statistics:")
total_employees = len(employees_df)
print(f"Total employees: {total_employees}")

left_company = employees_df['Attrition']
current_count = len(employees_df[~left_company])
former_count = len(employees_df[left_company])
print(f"Current employees: {current_count}")
print(f"Former employees: {former_count}")
print(f"Attrition rate: {former_count / total_employees:.2%}")

# Headcount per department, largest first (value_counts sorts descending),
# each shown with its share of all employees.
dept_counts = employees_df['Department'].value_counts()
print("\nEmployee count by department:")
for dept, count in dept_counts.items():
    dept_share = count / total_employees
    print(f"  - {dept}: {count} ({dept_share:.1%})")

Basic statistics:
Total employees: 3900
Current employees: 642
Former employees: 3258
Attrition rate: 83.54%

Employee count by department:
  - Sales: 941 (24.1%)
  - Collections: 795 (20.4%)
  - Operations: 593 (15.2%)
  - Customer Service: 389 (10.0%)
  - Finance: 328 (8.4%)
  - Risk Management: 319 (8.2%)
  - IT: 215 (5.5%)
  - HR: 162 (4.2%)
  - Legal: 85 (2.2%)
  - Executive: 73 (1.9%)
