# Daily Tasks Data Profiling

This notebook provides basic profiling of the daily tasks data in `data/daily_tasks/`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides warnings raised while reading and parsing the CSVs below —
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Set up plotting style
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Marker so the reader can see that the setup cell ran.
print("Data Profiling Setup Complete")

In [None]:
# Discover the CSV files in the daily-tasks directory and report each one's size on disk
data_dir = Path('../data/daily_tasks')
data_files = [csv_path for csv_path in data_dir.glob('*.csv')]

bytes_per_mb = 1024 * 1024

print(f"Found {len(data_files)} data files:")
# List the files in sorted order; `data_files` itself keeps the order glob returned
for csv_path in sorted(data_files):
    print(f"  {csv_path.name}: {csv_path.stat().st_size / bytes_per_mb:.1f} MB")

In [None]:
# Peek at a single file to learn its layout before touching the full dataset
sample_file = data_dir / '2024.csv'
print(f"Loading sample from {sample_file.name}...")

# Only the first 1000 rows are parsed, so this cell stays cheap
sample_df = pd.read_csv(sample_file, nrows=1000)
column_names = sample_df.columns.tolist()

print(f"Sample shape: {sample_df.shape}")
print(f"\nColumns ({len(column_names)}):")
print(column_names)

In [None]:
# Column dtypes as inferred by pandas, then a preview of the leading rows.
# A single print with a newline separator emits the same text as separate calls.
print("Data Types:", sample_df.dtypes, "\nFirst few rows:", sep="\n")
sample_df.head()

In [None]:
# Per-column null counts and their share of the sample rows
print("Missing Values:")

n_rows = len(sample_df)
missing_counts = sample_df.isnull().sum()
missing_pct = missing_counts / n_rows * 100

missing_df = pd.DataFrame({'Missing Count': missing_counts, 'Missing %': missing_pct})
missing_df = missing_df.sort_values('Missing %', ascending=False)

# Report only the columns that actually contain nulls
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

In [None]:
# Cardinality of the key categorical columns; small value sets are listed in full
categorical_cols = [
    'district',
    'sector',
    'activity',
    'animal_waste',
    'broken_glass',
    'dumping',
    'graffiti',
    'medical_waste',
    'fixed_post',
]

print("Unique values for key categorical columns:")
for col in categorical_cols:
    # Columns absent from the sample are skipped silently
    if col not in sample_df.columns:
        continue

    column = sample_df[col]
    unique_count = column.nunique()
    print(f"{col}: {unique_count} unique values")
    # Enumerate the values only when there are at most 10 of them
    if unique_count <= 10:
        print(f"  Values: {column.unique()}")
    print()

In [None]:
# Numeric columns analysis: list the numeric columns and show their summary statistics
numeric_cols = sample_df.select_dtypes(include=[np.number]).columns
print(f"Numeric columns: {list(numeric_cols)}")

# Jupyter renders a bare expression only when it is the last top-level statement
# of the cell. An expression inside an `if` body is evaluated and thrown away, so
# the describe() table is stored here and displayed by the final line instead.
numeric_summary = None
if len(numeric_cols) > 0:
    print("\nNumeric columns summary:")
    numeric_summary = sample_df[numeric_cols].describe()

# Shows the summary table; displays nothing when there are no numeric columns (None)
numeric_summary

In [None]:
# Date analysis: show raw examples of each date/time column and, where the
# column parses as datetime, the range it covers in the sample
date_cols = ['date_worked', 'start_time', 'end_time']
for col in date_cols:
    if col in sample_df.columns:
        print(f"\n{col} examples:")
        print(sample_df[col].head())

        # Try to parse dates. Parsing stays best-effort, but `except Exception`
        # (rather than a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, and the failure reason is reported instead of hidden.
        try:
            parsed_dates = pd.to_datetime(sample_df[col])
        except Exception as exc:
            print(f"Could not parse as datetime: {type(exc).__name__}: {exc}")
        else:
            print(f"Date range: {parsed_dates.min()} to {parsed_dates.max()}")

In [None]:
# Per-file overview statistics: row count and size on disk.
# Each CSV is streamed in fixed-size chunks so a whole file is never held in memory.
chunk_size = 10000
file_stats = []

for file in sorted(data_files):
    print(f"Processing {file.name}...")

    # Add up the length of every chunk to get the file's total row count
    total_rows = sum(len(chunk) for chunk in pd.read_csv(file, chunksize=chunk_size))

    file_stats.append({
        'file': file.name,
        'year': file.stem,  # the file name without its extension is used as the year label
        'total_rows': total_rows,
        'size_mb': file.stat().st_size / (1024 * 1024),
    })

stats_df = pd.DataFrame(file_stats)
print("\nFile Statistics:")
stats_df

In [None]:
# Side-by-side bar charts of the per-file statistics: row count (left) and size (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, stats_df column, title, y-axis label) for each panel
panels = [
    (ax1, 'total_rows', 'Total Rows by Year', 'Number of Rows'),
    (ax2, 'size_mb', 'File Size by Year (MB)', 'Size (MB)'),
]

for ax, column, title, ylabel in panels:
    ax.bar(stats_df['year'], stats_df[column])
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Grand totals across every file
total_records = stats_df['total_rows'].sum()
total_size_mb = stats_df['size_mb'].sum()
print(f"Total records across all files: {total_records:,}")
print(f"Total data size: {total_size_mb:.1f} MB")

In [None]:
# Frequency of each activity value in the sample, as a bar chart and a printed table
if 'activity' in sample_df.columns:
    activity_counts = sample_df['activity'].value_counts()

    # Explicit figure/axes pair rather than the implicit pyplot "current axes"
    fig, ax = plt.subplots(figsize=(10, 6))
    activity_counts.plot(kind='bar', ax=ax)
    ax.set_title('Activity Distribution (Sample)')
    ax.set_xlabel('Activity Type')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

    print("Activity distribution:")
    print(activity_counts)

In [None]:
# Task counts per district in the sample: chart of the 20 largest, table of the 10 largest
if 'district' in sample_df.columns:
    district_counts = sample_df['district'].value_counts()
    top_twenty = district_counts.head(20)

    # Explicit figure/axes pair rather than the implicit pyplot "current axes"
    fig, ax = plt.subplots(figsize=(12, 6))
    top_twenty.plot(kind='bar', ax=ax)
    ax.set_title('Top 20 Districts by Task Count (Sample)')
    ax.set_xlabel('District')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

    print(f"Total unique districts: {sample_df['district'].nunique()}")
    print("\nTop 10 districts:")
    print(district_counts.head(10))

## Summary

This profiling provides an overview of the daily tasks data structure, including:

- **Data Volume**: Multiple years of data with hundreds of thousands of records
- **Key Columns**: Activity types, districts, sectors, timestamps, and various waste/maintenance flags
- **Data Quality**: Missing value analysis and data type verification, both computed on the first 1,000 rows of `2024.csv` rather than on the full dataset
- **Temporal Coverage**: Multi-year dataset spanning 2022-2025
- **Geographic Coverage**: Multiple districts and sectors across NYC

The data appears to be NYC Department of Sanitation daily task records with detailed location, timing, and activity information.