In [1]:
print("\n======= BREAK EQUITY ANALYSIS =======")

# 17. Summarise first-break durations per department, location and supervisor.
# Each table holds the mean / median / std / count of 'break1_duration' for one
# grouping column and is ordered from the highest mean to the lowest.
_break_summary_stats = ['mean', 'median', 'std', 'count']

# Department-level summary
dept_break_analysis = (
    merged_df.groupby('department')['break1_duration']
    .agg(_break_summary_stats)
    .reset_index()
    .sort_values('mean', ascending=False)
)
print("\nBreak Duration by Department:")
print(dept_break_analysis)

# Location-level summary
loc_break_analysis = (
    merged_df.groupby('location')['break1_duration']
    .agg(_break_summary_stats)
    .reset_index()
    .sort_values('mean', ascending=False)
)
print("\nBreak Duration by Location:")
print(loc_break_analysis)

# Supervisor-level summary (only the ten highest means are printed)
sup_break_analysis = (
    merged_df.groupby('supervisor')['break1_duration']
    .agg(_break_summary_stats)
    .reset_index()
    .sort_values('mean', ascending=False)
)
print("\nBreak Duration by Supervisor:")
print(sup_break_analysis.head(10))  # Show only top 10 for brevity

# 18. Analyze break equity across pay rate groups
# One-way ANOVA: do mean break durations differ between the pay rate bins?
import math  # local to this step; only used to detect an undefined (NaN) p-value

try:
    # Build one sample of break durations per pay rate bin.  Missing bins are
    # dropped first: comparing a column against NaN never matches and would
    # only contribute an empty sample.  Bins without any recorded duration are
    # discarded for the same reason.
    pay_bins = merged_df['pay_rate_bin'].dropna().unique()
    groups = [
        merged_df.loc[merged_df['pay_rate_bin'] == pay_bin, 'break1_duration'].dropna()
        for pay_bin in pay_bins
    ]
    groups = [sample for sample in groups if len(sample) > 0]

    if len(groups) < 2:
        raise ValueError("need at least two pay rate groups with break data")

    f_stat, p_value = stats.f_oneway(*groups)

    # A NaN p-value means the test is undefined for this data; without this
    # check it would fall through to the "not significant" branch below.
    if math.isnan(p_value):
        raise ValueError("ANOVA statistic is undefined for these groups")

    print("\nANOVA Test for Break Duration Differences Among Pay Rate Groups:")
    print(f"F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")

    if p_value < 0.05:
        print("The difference in break durations across pay rate groups is statistically significant.")
    else:
        print("No statistically significant difference in break durations across pay rate groups.")
except Exception as exc:
    # Best-effort step: report the failure and carry on with the rest of the
    # analysis.  Unlike a bare ``except:``, this lets KeyboardInterrupt and
    # SystemExit propagate and surfaces the actual cause.
    print("Couldn't perform ANOVA test - may be due to insufficient data in some groups.")
    print(f"Reason: {exc}")

# 19. Visualize break equity across departments
# One box per department showing the spread of 'break1_duration'; the figure
# is written to break_equity_by_department.png and then closed.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=merged_df, x='department', y='break1_duration', palette='viridis', ax=ax)
ax.set_title('Break Duration Distribution by Department', fontsize=16)
ax.set_xlabel('Department', fontsize=14)
ax.set_ylabel('Break Duration (minutes)', fontsize=14)
# Slant the department names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('break_equity_by_department.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 20. Analyze supervisor influence on break durations
# Per supervisor: mean break duration and the number of non-null break records.
supervisor_breaks = (
    merged_df.groupby('supervisor')['break1_duration']
    .agg(['mean', 'count'])
    .reset_index()
)

# Drop supervisors with too few observations to give a meaningful mean.
# NOTE(review): 'count' is the number of break records in the group, which
# equals the number of employees only if merged_df has one row per employee.
min_employees = 10
has_enough_data = supervisor_breaks['count'] >= min_employees
supervisor_breaks = supervisor_breaks[has_enough_data]

# Express each supervisor's mean relative to the overall mean, both in
# minutes and as a percentage of the overall mean.
global_mean = merged_df['break1_duration'].mean()
supervisor_breaks['deviation_from_mean'] = supervisor_breaks['mean'] - global_mean
supervisor_breaks['percent_deviation'] = (
    supervisor_breaks['deviation_from_mean'] / global_mean
) * 100

# Highest positive deviation first, most negative last
supervisor_breaks = supervisor_breaks.sort_values(by='percent_deviation', ascending=False)
print("\nSupervisors with Most Deviation in Break Durations:")
print(supervisor_breaks.head(10))

# 21. Visualize supervisor influence on breaks
# Bar chart of the first ten rows of supervisor_breaks (the largest
# percent deviations), saved to supervisor_break_influence.png.
fig, ax = plt.subplots(figsize=(12, 6))
leading_supervisors = supervisor_breaks.head(10)
sns.barplot(data=leading_supervisors, x='supervisor', y='percent_deviation', palette='viridis', ax=ax)
ax.set_title('Supervisor Influence on Break Durations (% Deviation from Mean)', fontsize=16)
ax.set_xlabel('Supervisor', fontsize=14)
ax.set_ylabel('% Deviation from Mean Break Duration', fontsize=14)
# Zero line marks "no deviation from the overall mean"
ax.axhline(y=0, color='red', linestyle='-', alpha=0.3)
ax.grid(axis='y', alpha=0.3)
fig.savefig('supervisor_break_influence.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 22. Analyze correlation between pay rate and break duration
# Series.corr uses the Pearson coefficient by default.
corr_pay_break = merged_df['pay_rate'].corr(merged_df['break1_duration'])
print(f"\nCorrelation between pay rate and break duration: {corr_pay_break:.4f}")

# 23. Visualize relationship between pay rate and break duration
# Scatter of every record plus a fitted regression line, saved to
# pay_rate_vs_break_duration.png.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='pay_rate', y='break1_duration', data=merged_df, alpha=0.5)
# Add trend line.  This is drawn BEFORE the title and axis labels are set:
# regplot labels the axes with the raw column names, so calling it afterwards
# would replace 'Pay Rate' / 'Break Duration (minutes)' with
# 'pay_rate' / 'break1_duration'.
sns.regplot(x='pay_rate', y='break1_duration', data=merged_df, scatter=False, color='red')
plt.title('Relationship Between Pay Rate and Break Duration', fontsize=16)
plt.xlabel('Pay Rate', fontsize=14)
plt.ylabel('Break Duration (minutes)', fontsize=14)
plt.grid(alpha=0.3)
plt.savefig('pay_rate_vs_break_duration.png', dpi=300, bbox_inches='tight')
plt.close()

# 24. Create composite visualization for break equity
# A 2x2 figure saved to break_equity_composite.png:
#   (1) mean break duration per location
#   (2) mean break duration for the first and last five rows of supervisor_breaks
#   (3) pay rate vs break duration, coloured by department
#   (4) break duration distribution per pay rate bin
plt.figure(figsize=(15, 10))

# Location comparison
plt.subplot(2, 2, 1)
sns.barplot(x='location', y='break1_duration', data=merged_df, palette='viridis', ci=None)
plt.title('Break Duration by Location', fontsize=14)
plt.xlabel('Location', fontsize=12)
plt.ylabel('Average Break Duration (min)', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Supervisor top/bottom comparison
# supervisor_breaks is expected to be sorted by percent deviation, highest
# first, so head/tail give the two extremes.
top_5_supervisors = supervisor_breaks.head(5)['supervisor'].tolist()
bottom_5_supervisors = supervisor_breaks.tail(5)['supervisor'].tolist()
comparison_supervisors = top_5_supervisors + bottom_5_supervisors
supervisor_subset = merged_df[merged_df['supervisor'].isin(comparison_supervisors)]
# Explicit bar order: top five first, then bottom five.  Without it the bars
# follow the order in which supervisors happen to appear in merged_df, so the
# two groups cannot be told apart.  dict.fromkeys de-duplicates while keeping
# order (head and tail overlap when fewer than ten supervisors qualify).
supervisor_order = list(dict.fromkeys(comparison_supervisors))

plt.subplot(2, 2, 2)
sns.barplot(
    x='supervisor',
    y='break1_duration',
    data=supervisor_subset,
    order=supervisor_order or None,  # fall back to seaborn's default when empty
    palette='viridis',
    ci=None,
)
plt.title('Break Duration: Top 5 vs Bottom 5 Supervisors', fontsize=14)
plt.xlabel('Supervisor', fontsize=12)
plt.ylabel('Average Break Duration (min)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.axhline(y=global_mean, color='red', linestyle='--', label='Global Mean')
plt.legend()

# Pay rate scatter plot with department coloring
plt.subplot(2, 2, 3)
sns.scatterplot(x='pay_rate', y='break1_duration', hue='department', data=merged_df, alpha=0.7, palette='tab10')
plt.title('Pay Rate vs Break Duration by Department', fontsize=14)
plt.xlabel('Pay Rate', fontsize=12)
plt.ylabel('Break Duration (min)', fontsize=12)
# Legend is anchored outside the axes so it does not cover the points
plt.legend(title='Department', bbox_to_anchor=(1.05, 1), loc='upper left')

# Pay bins box plot
plt.subplot(2, 2, 4)
sns.boxplot(x='pay_rate_bin', y='break1_duration', data=merged_df, palette='viridis')
plt.title('Break Duration Distribution by Pay Rate Quintile', fontsize=14)
plt.xlabel('Pay Rate Quintile', fontsize=12)
plt.ylabel('Break Duration (min)', fontsize=12)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('break_equity_composite.png', dpi=300, bbox_inches='tight')
plt.close()

# 25. Generate efficiency impact insights by department
# Mean 'efficiency_change' per department, largest first.
dept_efficiency = (
    merged_df.groupby('department')['efficiency_change']
    .mean()
    .reset_index()
    .sort_values('efficiency_change', ascending=False)
)
print("\nEfficiency Impact by Department:")
print(dept_efficiency)

# 26. Identify departments with optimal break patterns
# Mean break duration alongside mean efficiency change for each department,
# ordered by efficiency change (largest first).
_dept_mean_columns = dict.fromkeys(['break1_duration', 'efficiency_change'], 'mean')
dept_breaks_efficiency = (
    merged_df.groupby('department')
    .agg(_dept_mean_columns)
    .reset_index()
    .sort_values('efficiency_change', ascending=False)
)
print("\nDepartments with Most Effective Break Patterns:")
print(dept_breaks_efficiency)

# 27. Visualize department break effectiveness
# Bubble chart: x = mean break duration, y = mean efficiency change, one
# bubble per department.  Saved to department_break_effectiveness.png.
fig, ax = plt.subplots(figsize=(14, 8))

mean_break = dept_breaks_efficiency['break1_duration']
mean_efficiency = dept_breaks_efficiency['efficiency_change']
ax.scatter(
    mean_break,
    mean_efficiency,
    s=mean_break * 5,  # bubble area grows with the break duration
    alpha=0.7,
    c=range(len(dept_breaks_efficiency)),  # one colour step per row
    cmap='viridis'
)

# Label every bubble with its department name, nudged 5pt up and right
for dept_name, x_pos, y_pos in zip(dept_breaks_efficiency['department'], mean_break, mean_efficiency):
    ax.annotate(
        dept_name,
        (x_pos, y_pos),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10
    )

ax.set_title('Department Break Effectiveness', fontsize=16)
ax.set_xlabel('Average Break Duration (minutes)', fontsize=14)
ax.set_ylabel('Efficiency Change After Break', fontsize=14)
# Zero line separates efficiency gains from losses
ax.axhline(y=0, color='red', linestyle='--', alpha=0.3)
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig('department_break_effectiveness.png', dpi=300, bbox_inches='tight')
plt.close(fig)

print("\nBreak Impact and Equity Analysis Complete. Results saved to CSV and PNG files.")




NameError: name 'merged_df' is not defined