# Park Cleaning Records Data Profile

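This notebook profiles the Daily Tasks Park Cleaning Records dataset: its structure and missing values, temporal patterns, activity types, reported waste and maintenance issues, geographic and workforce breakdowns, and overall data quality.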

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence every Python warning for the rest of the session so library notices
# do not clutter cell output. Note that this also hides pandas deprecation
# and parsing warnings.
warnings.simplefilter('ignore')

# Plot appearance: apply matplotlib's default style first, then choose the
# seaborn colour palette on top of it.
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Read the raw CSV export into the working frame `df`, which every later cell uses
csv_path = 'data/daily_tasks_park_cleaning_records_20250923.csv'
df = pd.read_csv(csv_path)

# Quick confirmation of what was loaded: dimensions and column names
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {[*df.columns]}")

## Dataset Overview

In [None]:
# Structural summary of the loaded frame: size, column dtypes, memory footprint
print("=== DATASET INFO ===")
print(f"Number of rows: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print("\nColumn names and types:")
print(df.dtypes)

# deep=True makes pandas inspect the contents of object (string) columns.
# The shallow default does not account for the Python objects those columns
# hold, so it under-reports the real footprint of text-heavy data.
memory_kb = df.memory_usage(deep=True).sum() / 1024
print(f"\nMemory usage: {memory_kb:.2f} KB")

In [None]:
# Preview the top of the frame as a quick sanity check on how the CSV parsed
print("=== FIRST 5 ROWS ===")
display(df.iloc[:5])

In [None]:
# Per-column missing-value report; only columns with at least one gap are shown
print("=== MISSING VALUES ===")
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# One row per column: name, absolute number of nulls, and share of all rows
missing_df = pd.DataFrame(
    {
        'Column': missing_data.index,
        'Missing Count': missing_data.to_numpy(),
        'Missing Percentage': missing_percent.to_numpy(),
    }
)
display(missing_df.loc[missing_df['Missing Count'].gt(0)])

## Temporal Analysis

In [None]:
# Date and time analysis
#
# Parses the date/time columns in place and derives two helper columns that
# later cells read:
#   - start_hour      (needs start_time)
#   - duration_hours  (needs both start_time and end_time)
# Each column is checked separately, so a dataset that lacks `end_time` no
# longer fails with a KeyError when `start_time` is present.
print("=== TEMPORAL ANALYSIS ===")

has_start_time = 'start_time' in df.columns
has_end_time = 'end_time' in df.columns

# Convert the work date so min/max/nunique operate on timestamps, not strings
if 'date_worked' in df.columns:
    df['date_worked'] = pd.to_datetime(df['date_worked'])
    print(f"Date range: {df['date_worked'].min()} to {df['date_worked'].max()}")
    print(f"Unique dates: {df['date_worked'].nunique()}")

if has_start_time:
    df['start_time'] = pd.to_datetime(df['start_time'])

    # Extract hour for shift analysis
    df['start_hour'] = df['start_time'].dt.hour
    print(f"\nWorking hours range: {df['start_hour'].min()}:00 to {df['start_hour'].max()}:00")

if has_end_time:
    df['end_time'] = pd.to_datetime(df['end_time'])

if has_start_time and has_end_time:
    # Elapsed time between start and end, in hours. Rows with a missing
    # timestamp give NaN, which mean/min/max skip.
    df['duration_hours'] = (df['end_time'] - df['start_time']).dt.total_seconds() / 3600
    print(f"Task duration - Mean: {df['duration_hours'].mean():.2f}h, Range: {df['duration_hours'].min():.2f}h - {df['duration_hours'].max():.2f}h")

## Activity Analysis

In [None]:
# Breakdown of records by the value of the `activity` column
print("=== ACTIVITY ANALYSIS ===")

if 'activity' in df.columns:
    # Frequency table of activity labels, most common first
    activity_counts = df['activity'].value_counts()
    print("Tasks by activity type:")
    display(activity_counts)

    n_unique_activities = df['activity'].nunique()
    print(f"\nTotal unique activities: {n_unique_activities}")

    # Share of rows whose activity is exactly 'Work', as a percentage of all rows
    n_work_rows = df['activity'].eq('Work').sum()
    work_pct = n_work_rows / len(df) * 100
    print(f"Work activities: {work_pct:.1f}%")

In [None]:
# Frequency of reported waste / maintenance issue flags
print("=== WASTE & MAINTENANCE ISSUES ===")

# Issue flag columns; a row counts as an occurrence when the value is 'Yes'.
# `waste_columns` is reused by later cells.
waste_columns = ['animal_waste', 'broken_glass', 'dumping', 'graffiti', 'medical_waste']
boolean_columns = list(waste_columns)

# Add a True/False companion column (<name>_bool) for each flag column present
for col in boolean_columns:
    if col not in df.columns:
        continue
    df[f'{col}_bool'] = df[col].eq('Yes')

# Count and percentage of 'Yes' rows for each flag column present
issue_stats = {}
total_rows = len(df)
for col in waste_columns:
    if col not in df.columns:
        continue
    yes_count = df[col].eq('Yes').sum()
    yes_pct = yes_count / total_rows * 100
    issue_stats[col] = dict(count=yes_count, percentage=yes_pct)
    print(f"{col}: {yes_count} occurrences ({yes_pct:.1f}%)")

# Flag with the highest 'Yes' count (first one listed wins a tie)
if issue_stats:
    top_issue = max(issue_stats, key=lambda name: issue_stats[name]['count'])
    most_common = (top_issue, issue_stats[top_issue])
    print(f"\nMost common issue: {most_common[0]} ({most_common[1]['count']} occurrences)")

## Geographic and Organizational Analysis

In [None]:
# District, sector and vehicle breakdowns; each part runs only if its column exists
print("=== GEOGRAPHIC ANALYSIS ===")

if 'district' in df.columns:
    district_counts = df['district'].value_counts()
    print("Tasks by district:")
    display(district_counts)

if 'sector_name' in df.columns:
    sector_counts = df['sector_name'].value_counts().head(10)
    print("\nTop 10 sectors by task count:")
    display(sector_counts)

# Vehicle utilization
if 'vehicle_number' in df.columns:
    # value_counts() drops missing values and sorts by frequency (descending),
    # so when it is non-empty the first entry is the most-used vehicle.
    vehicle_stats = df['vehicle_number'].value_counts()
    print("\nVehicle utilization:")
    print(f"Total vehicles used: {df['vehicle_number'].nunique()}")
    if vehicle_stats.empty:
        # The column exists but has no non-null values; indexing position 0
        # would raise IndexError, so report the situation instead.
        print("No vehicle assignments recorded")
    else:
        print(f"Most used vehicle: {vehicle_stats.index[0]} ({vehicle_stats.iloc[0]} tasks)")
        print(f"Average tasks per vehicle: {vehicle_stats.mean():.2f}")

## Workforce Analysis

In [None]:
# Summary statistics for the staffing-count columns, fixed posts, and hours
print("=== WORKFORCE ANALYSIS ===")

workforce_cols = ['napsw', 'ncpw', 'ncsa', 'npop', 'nnpw', 'ncrew']
workforce_data = {}

# Mean / max / total per staffing column; columns not in the frame are skipped
for col in workforce_cols:
    if col not in df.columns:
        continue
    values = df[col]
    col_summary = {
        'mean': values.mean(),
        'max': values.max(),
        'total': values.sum(),
    }
    workforce_data[col] = col_summary
    print(f"{col} - Mean: {col_summary['mean']:.2f}, Max: {col_summary['max']}, Total: {col_summary['total']}")

# Frequency of each value in the fixed_post column
if 'fixed_post' in df.columns:
    fixed_post_counts = df['fixed_post'].value_counts()
    print("\nFixed post assignments:")
    display(fixed_post_counts)

# Total, average and range of the nhours column
if 'nhours' in df.columns:
    hours = df['nhours']
    print("\nHours statistics:")
    print(f"Total hours: {hours.sum():.2f}")
    print(f"Average hours per task: {hours.mean():.2f}")
    print(f"Hour range: {hours.min():.2f} - {hours.max():.2f}")

## Data Quality Analysis

In [None]:
# Data quality checks: duplicates, time consistency, implausible values, and
# missing location keys. Every column-specific check is guarded, so a dataset
# without that column skips the check instead of raising KeyError.
print("=== DATA QUALITY ANALYSIS ===")

# Duplicate analysis
print(f"Total duplicate rows: {df.duplicated().sum()}")
if 'daily_task_id' in df.columns:
    print(f"Duplicate daily_task_ids: {df['daily_task_id'].duplicated().sum()}")

# Time consistency checks
if 'start_time' in df.columns and 'end_time' in df.columns:
    negative_duration = (df['end_time'] < df['start_time']).sum()
    print(f"Tasks with negative duration: {negative_duration}")

    # Compare the timestamps directly rather than reading the derived
    # `duration_hours` column, which exists only after the temporal-analysis
    # cell has run. For parsed timestamps, end == start is the same condition
    # as a duration of exactly zero.
    zero_duration = (df['end_time'] == df['start_time']).sum()
    print(f"Tasks with zero duration: {zero_duration}")

# Check for impossible values
if 'nhours' in df.columns:
    impossible_hours = (df['nhours'] > 24).sum()
    print(f"Tasks with >24 hours: {impossible_hours}")

# Missing critical data
if 'gispropnum' in df.columns:
    missing_locations = df['gispropnum'].isnull().sum()
    print(f"Records missing location data: {missing_locations}")

## Data Visualizations

In [None]:
# Build a 3x2 grid of overview charts. Each panel is drawn only when the column
# it needs is present; otherwise its axes are left empty.
fig, axes = plt.subplots(3, 2, figsize=(16, 18))
(ax_activity, ax_hours), (ax_duration, ax_issues), (ax_district, ax_crew) = axes

# Top-left: share of each activity type
if 'activity' in df.columns:
    activity_counts = df['activity'].value_counts()
    ax_activity.pie(
        activity_counts.values,
        labels=activity_counts.index,
        autopct='%1.1f%%',
    )
    ax_activity.set_title('Distribution of Activity Types')

# Top-right: number of tasks starting in each hour of the day
if 'start_hour' in df.columns:
    hour_counts = df['start_hour'].value_counts().sort_index()
    ax_hours.bar(hour_counts.index, hour_counts.values)
    ax_hours.set(
        title='Tasks by Start Hour',
        xlabel='Hour of Day',
        ylabel='Number of Tasks',
    )

# Middle-left: histogram of task durations in hours
if 'duration_hours' in df.columns:
    ax_duration.hist(df['duration_hours'], bins=30, alpha=0.7, edgecolor='black')
    ax_duration.set(
        title='Task Duration Distribution',
        xlabel='Duration (hours)',
        ylabel='Frequency',
    )

# Middle-right: number of 'Yes' rows per issue flag column
# (`waste_columns` comes from the waste-issue analysis cell)
present_issue_cols = [col for col in waste_columns if col in df.columns]
if present_issue_cols:
    issue_names = [col.replace('_', ' ').title() for col in present_issue_cols]
    issue_counts = [(df[col] == 'Yes').sum() for col in present_issue_cols]

    ax_issues.barh(issue_names, issue_counts)
    ax_issues.set(title='Frequency of Issues', xlabel='Number of Occurrences')

# Bottom-left: tasks per district, bars placed at integer positions and
# labelled with the district values
if 'district' in df.columns:
    district_counts = df['district'].value_counts()
    bar_positions = range(len(district_counts))
    ax_district.bar(bar_positions, district_counts.values)
    ax_district.set_xticks(bar_positions)
    ax_district.set_xticklabels(district_counts.index, rotation=45)
    ax_district.set(title='Tasks by District', ylabel='Number of Tasks')

# Bottom-right: how many tasks were recorded for each crew size
if 'ncrew' in df.columns:
    crew_counts = df['ncrew'].value_counts().sort_index()
    ax_crew.bar(crew_counts.index, crew_counts.values)
    ax_crew.set(
        title='Distribution of Crew Sizes',
        xlabel='Number of Crew Members',
        ylabel='Number of Tasks',
    )

plt.tight_layout()
plt.show()

In [None]:
# Daily task volume: line chart plus average / busiest / quietest day
if 'date_worked' in df.columns:
    print("=== TIME SERIES ANALYSIS ===")

    # Number of records per work date, in chronological order
    daily_counts = df['date_worked'].value_counts().sort_index()

    ts_fig, ts_ax = plt.subplots(figsize=(15, 6))
    ts_ax.plot(daily_counts.index, daily_counts.values, marker='o')
    ts_ax.set(
        title='Daily Task Volume Over Time',
        xlabel='Date',
        ylabel='Number of Tasks',
    )
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Headline figures for the same series
    busiest_day, busiest_count = daily_counts.idxmax(), daily_counts.max()
    quietest_day, quietest_count = daily_counts.idxmin(), daily_counts.min()
    print(f"Average daily tasks: {daily_counts.mean():.2f}")
    print(f"Peak day: {busiest_day} ({busiest_count} tasks)")
    print(f"Lowest day: {quietest_day} ({quietest_count} tasks)")

## Cross-Analysis & Correlations

In [None]:
# Cross-tabulation of district vs activity, then pairwise correlation of the
# issue flag columns (`waste_columns` comes from the waste-issue analysis cell)
print("=== CROSS-TABULATION ANALYSIS ===")

if 'district' in df.columns and 'activity' in df.columns:
    print("\nDistrict vs Activity:")
    crosstab1 = pd.crosstab(df['district'], df['activity'], margins=True)
    display(crosstab1)

# Issue correlation analysis
if all(col in df.columns for col in waste_columns):
    print("\n=== ISSUE CORRELATION MATRIX ===")

    # Encode the flags explicitly as numbers. `map` sends 'Yes' -> 1,
    # 'No' -> 0, and anything else (missing values, unexpected labels) -> NaN,
    # which `corr()` excludes pair-wise. This avoids relying on `replace()`
    # silently downcasting an object-dtype result (deprecated in pandas) and
    # keeps stray strings out of the matrix handed to `corr()`.
    yes_no_codes = {'Yes': 1, 'No': 0}
    issue_matrix = df[waste_columns].apply(lambda flags: flags.map(yes_no_codes))
    correlation_matrix = issue_matrix.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f')
    plt.title('Correlation Matrix of Waste/Maintenance Issues')
    plt.tight_layout()
    plt.show()

In [None]:
# Efficiency analysis: crew-hours per task by district, and task volume by
# start hour. Every column a section reads is checked first, and task counts
# come from group sizes rather than from an assumed `row_id` column, so a
# missing `district`, `row_id` or `nhours` column no longer raises KeyError.
print("=== EFFICIENCY ANALYSIS ===")

if 'nhours' in df.columns and 'ncrew' in df.columns:
    # Productivity metrics: hours multiplied by crew size for each record
    df['crew_hours'] = df['nhours'] * df['ncrew']

    if 'district' in df.columns:
        by_district = df.groupby('district')
        efficiency_by_district = by_district.agg({
            'crew_hours': 'sum',
            'nhours': 'mean'
        })
        # Number of records per district, inserted so the column order stays
        # crew_hours, total_tasks, nhours
        efficiency_by_district.insert(1, 'total_tasks', by_district.size())

        efficiency_by_district['hours_per_task'] = (
            efficiency_by_district['crew_hours'] / efficiency_by_district['total_tasks']
        )

        print("Efficiency by district:")
        display(efficiency_by_district.sort_values('hours_per_task'))

# Peak hours analysis
if 'start_hour' in df.columns:
    by_hour = df.groupby('start_hour')
    peak_hours = by_hour.size().to_frame('task_count')
    if 'nhours' in df.columns:
        peak_hours['total_hours'] = by_hour['nhours'].sum()

    print("\nActivity by hour:")
    display(peak_hours.sort_values('task_count', ascending=False).head(10))

## Data Quality Summary & Recommendations

In [None]:
def _print_numbered(items):
    """Print each item on its own line, numbered from 1 with a three-space indent."""
    for position, text in enumerate(items, 1):
        print(f"   {position}. {text}")


print("=== DATA QUALITY SUMMARY & RECOMMENDATIONS ===")

# 1. Share of missing values in the key columns (only columns with gaps are listed)
print("\n1. COMPLETENESS:")
critical_cols = ['gispropnum', 'date_worked', 'start_time', 'end_time', 'activity']
for col in critical_cols:
    if col not in df.columns:
        continue
    missing_pct = (df[col].isnull().sum() / len(df)) * 100
    if missing_pct > 0:
        print(f"   - {col}: {missing_pct:.1f}% missing values")

# 2. Start/end ordering, and the total absolute gap between the derived
#    duration and the recorded nhours
print("\n2. TEMPORAL CONSISTENCY:")
if 'start_time' in df.columns and 'end_time' in df.columns:
    not_after_start = df['end_time'] <= df['start_time']
    time_issues = not_after_start.sum()
    print(f"   - Tasks with end time before/equal to start time: {time_issues}")

    if 'nhours' in df.columns:
        hour_mismatch = (df['duration_hours'] - df['nhours']).abs().sum()
        print(f"   - Potential hour calculation mismatches: {hour_mismatch:.2f}")

# 3. Records with a crew size of zero or without a vehicle number
print("\n3. LOGICAL CONSISTENCY:")
if 'ncrew' in df.columns:
    zero_crew = df['ncrew'].eq(0).sum()
    print(f"   - Tasks with zero crew members: {zero_crew}")

if 'vehicle_number' in df.columns:
    missing_vehicles = df['vehicle_number'].isnull().sum()
    print(f"   - Tasks missing vehicle assignment: {missing_vehicles}")

# 4. Share of 'Work' rows and mean number of 'Yes' issue flags per record
#    (`waste_columns` comes from the waste-issue analysis cell)
print("\n4. OPERATIONAL INSIGHTS:")
if 'activity' in df.columns:
    work_pct = df['activity'].eq('Work').sum() / len(df) * 100
    print(f"   - Work vs non-work ratio: {work_pct:.1f}% work activities")

present_issue_cols = [col for col in waste_columns if col in df.columns]
if present_issue_cols:
    total_issues = 0
    for col in present_issue_cols:
        total_issues += (df[col] == 'Yes').sum()
    avg_issues_per_task = total_issues / len(df)
    print(f"   - Average issues per task: {avg_issues_per_task:.2f}")

# 5. Static list of follow-up suggestions
print("\n5. RECOMMENDATIONS:")
recommendations = [
    "Implement real-time data validation for start/end times",
    "Standardize missing value handling for location data",
    "Add data quality checks for crew and vehicle assignments",
    "Consider automated duration calculation based on start/end times",
    "Implement outlier detection for unusual task durations",
    "Create dashboard for monitoring daily productivity metrics",
    "Establish data retention policies for historical analysis",
    "Consider geocoding validation for location accuracy"
]
_print_numbered(recommendations)

# 6. Static list of metrics suggested for ongoing tracking
print("\n6. KEY METRICS FOR MONITORING:")
metrics = [
    "Daily task completion rates",
    "Average crew hours per task",
    "Issue frequency trends",
    "Vehicle utilization rates",
    "Sector-wise productivity",
    "Peak operational hours"
]
_print_numbered(metrics)