# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import scipy.stats as stats

# 1. Read the payroll and timekeeping extracts from the working directory
payroll_df = pd.read_csv('sample_payroll.csv')
timekeeping_df = pd.read_csv('sample_timekeeping.csv')

# 2. Parse every timestamp-like column in one pass: the six punch columns
#    plus the record date. errors='coerce' turns unparseable values into NaT
#    instead of raising, so bad cells surface later as missing durations.
datetime_columns = ['punchin1', 'punchout1', 'punchin2', 'punchout2', 'punchin3', 'punchout3']
for col in datetime_columns + ['date']:
    timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')

# 3-4. Derive break and work durations, in minutes, from consecutive punches.
# Each entry maps the new column to its (start punch, end punch) pair; the
# duration is end - start. A missing (NaT) punch on either side yields NaN.
duration_spans = {
    'break1_duration': ('punchout1', 'punchin2'),  # Break 1
    'break2_duration': ('punchout2', 'punchin3'),  # Break 2
    'work_period1': ('punchin1', 'punchout1'),     # Work period 1
    'work_period2': ('punchin2', 'punchout2'),     # Work period 2
    'work_period3': ('punchin3', 'punchout3'),     # Work period 3 (if applicable)
}
for col, (start_col, end_col) in duration_spans.items():
    timekeeping_df[col] = (timekeeping_df[end_col] - timekeeping_df[start_col]).dt.total_seconds() / 60

# 5-6. Filter out unreasonable values (negative or extreme), then total.
# The individual segments are cleaned BEFORE they are summed. Summing first
# lets an implausible segment leak into the total whenever the sum still
# happens to fall inside the allowed range (e.g. -30 + 300 = 270 minutes).
max_reasonable_work = 720  # 12 hours in minutes
min_reasonable_work = 0
max_reasonable_break = 240  # 4 hours in minutes
min_reasonable_break = 0

# (segment columns, total column, lower bound, upper bound); bounds inclusive
cleaning_plan = [
    (['work_period1', 'work_period2', 'work_period3'], 'total_work_time',
     min_reasonable_work, max_reasonable_work),
    (['break1_duration', 'break2_duration'], 'total_break_time',
     min_reasonable_break, max_reasonable_break),
]
for segment_cols, total_col, lower, upper in cleaning_plan:
    raw_segments = timekeeping_df[segment_cols]
    # Comparisons against NaN are False, so missing segments are never "in range"
    in_range = (raw_segments >= lower) & (raw_segments <= upper)
    # Rows where a segment was recorded but is implausible: their total cannot
    # be trusted, so it is reported as NaN rather than as a partial sum.
    has_rejected_segment = (raw_segments.notna() & ~in_range).any(axis=1)

    clean_segments = raw_segments.where(in_range)
    timekeeping_df[segment_cols] = clean_segments

    # Missing segments count as 0 (skipna), so a row with no recorded segments
    # totals 0. The total itself must also respect the same bounds.
    totals = clean_segments.sum(axis=1, skipna=True)
    timekeeping_df[total_col] = totals.where(totals.between(lower, upper) & ~has_rejected_segment)

# 7. Attach payroll attributes to each timekeeping row; the left join keeps
#    every timekeeping record even when no payroll row matches
merged_df = timekeeping_df.merge(payroll_df, how='left', on='employee_id')

# 8. Name of the weekday each record falls on
merged_df['day_of_week'] = merged_df['date'].dt.day_name()

# 9. Work "efficiency" proxy: each work period expressed relative to the
#    overall mean length of that period. Period 1 is taken as the pre-break
#    stretch and period 2 as the stretch following the first break.
efficiency_sources = (
    ('pre_break_efficiency', 'work_period1'),
    ('post_break_efficiency', 'work_period2'),
)
for efficiency_col, period_col in efficiency_sources:
    merged_df[efficiency_col] = merged_df[period_col].div(merged_df[period_col].mean())
merged_df['efficiency_change'] = merged_df['post_break_efficiency'].sub(merged_df['pre_break_efficiency'])

# Per-employee averages of the efficiency change and the break measures
employee_efficiency = (
    merged_df
    .groupby('employee_id')[['efficiency_change', 'break1_duration', 'total_break_time']]
    .mean()
    .reset_index()
)

print("======= BREAK IMPACT ANALYSIS =======")

# 10. Analyze relationship between break duration and efficiency change
# Pearson correlation between the first break's length and the efficiency change
corr_coef = merged_df['break1_duration'].corr(merged_df['efficiency_change'])
print(f"\nCorrelation between break duration and efficiency change: {corr_coef:.4f}")

# Create break duration bins for analysis.
# include_lowest=True closes the first interval on the left so that a break of
# exactly 0 minutes lands in '0-15' instead of being silently dropped as NaN.
merged_df['break_duration_bin'] = pd.cut(
    merged_df['break1_duration'],
    bins=[0, 15, 30, 45, 60, 120, 240],
    labels=['0-15', '15-30', '30-45', '45-60', '60-120', '120-240'],
    include_lowest=True,
)

# Calculate average efficiency change by break duration bin.
# observed=False is spelled out to keep empty bins in the table (as NaN) and
# to avoid depending on the changing default for categorical groupers.
efficiency_by_break_duration = (
    merged_df
    .groupby('break_duration_bin', observed=False)['efficiency_change']
    .mean()
    .reset_index()
)
print("\nEfficiency Change by Break Duration:")
print(efficiency_by_break_duration)

# 11. Find optimal break duration based on efficiency change.
# idxmax cannot pick a row when every bin mean is NaN (no usable data), so
# that case is reported explicitly instead of failing on the lookup.
bin_means = efficiency_by_break_duration['efficiency_change']
if bin_means.notna().any():
    optimal_duration_bin = efficiency_by_break_duration.loc[bin_means.idxmax(), 'break_duration_bin']
    print(f"\nOptimal break duration range for maximum efficiency: {optimal_duration_bin} minutes")
else:
    optimal_duration_bin = None
    print("\nOptimal break duration range for maximum efficiency: not available (no valid efficiency data)")

# 12. Bar chart: mean efficiency change for each break-duration bin,
#     written to break_impact_efficiency.png
# NOTE(review): `ci` is deprecated in newer seaborn releases in favour of
# `errorbar` - confirm the installed version before switching.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=merged_df, x='break_duration_bin', y='efficiency_change',
            ci=None, palette='viridis', ax=ax)
ax.set_title('Impact of Break Duration on Efficiency Change', fontsize=16)
ax.set_xlabel('Break Duration (minutes)', fontsize=14)
ax.set_ylabel('Average Efficiency Change', fontsize=14)
ax.axhline(y=0, color='red', linestyle='-', alpha=0.3)  # zero line: no change
ax.grid(axis='y', alpha=0.3)
fig.savefig('break_impact_efficiency.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 13. Relationship between break duration and pay rate
# Split records into five equal-sized pay-rate groups (quintiles)
quintile_labels = ['Lowest 20%', 'Low 20%', 'Middle 20%', 'High 20%', 'Highest 20%']
merged_df['pay_rate_bin'] = pd.qcut(merged_df['pay_rate'], q=5, labels=quintile_labels)

# Mean length of the first break within each pay-rate quintile
mean_break_by_quintile = merged_df.groupby('pay_rate_bin')['break1_duration'].mean()
break_duration_by_pay = mean_break_by_quintile.reset_index()
print("\nAverage Break Duration by Pay Rate Quintile:")
print(break_duration_by_pay)

# 14. Bar chart of the same averages, written to break_duration_by_pay.png
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=merged_df, x='pay_rate_bin', y='break1_duration',
            ci=None, palette='viridis', ax=ax)
ax.set_title('Average Break Duration by Pay Rate Quintile', fontsize=16)
ax.set_xlabel('Pay Rate Quintile', fontsize=14)
ax.set_ylabel('Average Break Duration (minutes)', fontsize=14)
ax.grid(axis='y', alpha=0.3)
fig.savefig('break_duration_by_pay.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 15. Mean efficiency change within each pay-rate quintile
mean_change_by_quintile = merged_df.groupby('pay_rate_bin')['efficiency_change'].mean()
efficiency_by_pay = mean_change_by_quintile.reset_index()
print("\nEfficiency Change by Pay Rate Quintile:")
print(efficiency_by_pay)

# 16. Bar chart of the same averages, written to efficiency_by_pay.png
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=merged_df, x='pay_rate_bin', y='efficiency_change',
            ci=None, palette='viridis', ax=ax)
ax.set_title('Efficiency Change by Pay Rate Quintile', fontsize=16)
ax.set_xlabel('Pay Rate Quintile', fontsize=14)
ax.set_ylabel('Average Efficiency Change', fontsize=14)
ax.axhline(y=0, color='red', linestyle='-', alpha=0.3)  # zero line: no change
ax.grid(axis='y', alpha=0.3)
fig.savefig('efficiency_by_pay.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Output: FileNotFoundError: [Errno 2] No such file or directory: 'sample_timekeeping.csv'