In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import calendar

# 1. Read the payroll and timekeeping extracts (paths are relative to the
#    current working directory)
payroll_df = pd.read_csv('sample_payroll.csv')
timekeeping_df = pd.read_csv('sample_timekeeping.csv')

# 2. Parse the punch timestamps, one in/out pair per line.
#    errors='coerce' turns any value that cannot be parsed into NaT instead
#    of raising, so bad cells surface later as missing data.
datetime_columns = [
    'punchin1', 'punchout1',
    'punchin2', 'punchout2',
    'punchin3', 'punchout3',
]
for col in datetime_columns:
    raw_values = timekeeping_df[col]
    timekeeping_df[col] = pd.to_datetime(raw_values, errors='coerce')

# The 'date' column gets the same lenient parsing as the punch columns
raw_dates = timekeeping_df['date']
timekeeping_df['date'] = pd.to_datetime(raw_dates, errors='coerce')

# 3. Time of day at which each break starts
# The first break is taken to start at punchout1. Hour and minute are kept as
# separate columns, plus a decimal-hour column for plotting (e.g., 9:30 -> 9.5).
# Rows whose punch is NaT end up with NaN in all three columns.
first_break = timekeeping_df['punchout1'].dt
timekeeping_df['break1_start_hour'] = first_break.hour
timekeeping_df['break1_start_minute'] = first_break.minute
timekeeping_df['break1_start_time'] = (
    timekeeping_df['break1_start_hour']
    + timekeeping_df['break1_start_minute'].div(60)
)

# The second break is derived the same way from punchout2
second_break = timekeeping_df['punchout2'].dt
timekeeping_df['break2_start_hour'] = second_break.hour
timekeeping_df['break2_start_minute'] = second_break.minute
timekeeping_df['break2_start_time'] = (
    timekeeping_df['break2_start_hour']
    + timekeeping_df['break2_start_minute'].div(60)
)

# 4. Calendar attributes of the work date, for temporal analysis
#    (dayofweek is numeric with Monday = 0, per pandas)
work_date = timekeeping_df['date'].dt
timekeeping_df['day_of_week'] = work_date.day_name()
timekeeping_df['day_of_week_num'] = work_date.dayofweek
timekeeping_df['month'] = work_date.month
timekeeping_df['month_name'] = work_date.month_name()

# 5. Attach payroll columns to every timekeeping row. A left join keeps all
#    timekeeping rows even when no payroll record matches the employee_id.
merged_df = timekeeping_df.merge(payroll_df, how='left', on='employee_id')

# 6. Analysis: Break Start Time Distribution
# One 2x2 figure; the sections below each fill one panel.
plt.figure(figsize=(15, 10))

# 6.1 Distribution of first break start times
plt.subplot(2, 2, 1)
# range=(0, 24) pins the 24 bins to whole clock hours. Without it the bins are
# spread over the data's own min..max, so the bars would not line up with the
# hourly tick marks set below.
plt.hist(timekeeping_df['break1_start_time'].dropna(), bins=24, range=(0, 24),
         color='skyblue', edgecolor='black')
plt.title('Distribution of First Break Start Times')
plt.xlabel('Hour of Day (24-hour format)')
plt.ylabel('Number of Breaks')
plt.xticks(range(0, 24))
plt.grid(axis='y', alpha=0.75)

# 6.2 Break start times by department
plt.subplot(2, 2, 2)
# NOTE(review): 'department' is read from timekeeping_df here, while merged_df
# (timekeeping joined with payroll) is not used - confirm the column lives on
# the timekeeping extract.
# Missing department values are dropped: comparing against NaN never matches,
# so they would only add an empty series and a 'nan' legend entry.
departments = timekeeping_df['department'].dropna().unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(departments)))

for dept, dept_color in zip(departments, colors):
    dept_data = timekeeping_df[timekeeping_df['department'] == dept]
    # Every department shares the same hour-aligned bins (range=(0, 24)) so the
    # overlaid histograms are directly comparable; without a fixed range each
    # department would get bin edges from its own min..max.
    plt.hist(dept_data['break1_start_time'].dropna(), bins=24, range=(0, 24),
             alpha=0.5, label=dept, color=dept_color)

plt.title('First Break Start Times by Department')
plt.xlabel('Hour of Day (24-hour format)')
plt.ylabel('Number of Breaks')
plt.xticks(range(0, 24))
plt.legend(fontsize='x-small')
plt.grid(axis='y', alpha=0.75)

# 6.3 Heatmap of breaks by day and hour
plt.subplot(2, 2, 3)
# Create day-hour combinations (a string key stored on the frame; the heatmap
# below is built from the numeric day/hour columns, not from this column)
timekeeping_df['break_day_hour'] = timekeeping_df['day_of_week_num'].astype(str) + '-' + timekeeping_df['break1_start_hour'].astype(str)

# Count breaks by day and hour (rows with a missing day or hour are left out
# by groupby's default NaN handling)
break_counts = timekeeping_df.groupby(['day_of_week_num', 'break1_start_hour']).size().reset_index(name='count')
# imshow places cells by position, not by label, so the grid must contain every
# weekday (0-6) and every hour (0-23) for the tick labels below to be correct.
# A plain pivot only holds the days/hours that occur in the data, which would
# shift the cells under the wrong labels. Combinations with no breaks become 0.
break_counts_pivot = (
    break_counts
    .pivot(index='day_of_week_num', columns='break1_start_hour', values='count')
    .reindex(index=range(7), columns=range(24))
    .fillna(0)
    .astype(float)
)

# Plot heatmap: row i is weekday i (Monday = 0), column j is hour j
plt.imshow(break_counts_pivot, cmap='YlOrRd', aspect='auto')
plt.colorbar(label='Number of Breaks')
plt.title('Heatmap of Break Times by Day of Week and Hour')
plt.xlabel('Hour of Day (24-hour format)')
plt.ylabel('Day of Week')
plt.yticks(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.xticks(range(0, 24, 2))

# 6.4 Monthly break patterns
month_axis = plt.subplot(2, 2, 4)
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# count() tallies the non-null first-break start times per month name; the
# reindex puts the months in calendar order (months absent from the data
# come out as NaN).
breaks_per_month = timekeeping_df.groupby('month_name')['break1_start_time'].count()
monthly_breaks = breaks_per_month.reindex(month_order)
month_axis.bar(monthly_breaks.index, monthly_breaks.values, color='green')
month_axis.set_title('Number of Breaks by Month')
month_axis.set_xlabel('Month')
month_axis.set_ylabel('Number of Breaks')
plt.xticks(rotation=45)
# Re-flow all four panels so titles and rotated labels do not overlap
plt.tight_layout()

# Write the finished figure to disk, then release it
plt.savefig('break_pattern_analysis.png')
plt.close()

  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df[col] = pd.to_datetime(timekeeping_df[col], errors='coerce')
  timekeeping_df['date'] = pd.to_datetime(timekeeping_df['date'], errors='coerce')
