# Public Parks Toilets Data Profile

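This notebook profiles the *Directory of Toilets in Public Parks* dataset: its structure, missing values, duplicates, categorical distributions and text-field lengths, followed by visualizations and a summary of data-quality issues with recommendations.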

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence only deprecation-style noise emitted by the libraries.
# Every other category stays visible: warnings raised while reading or
# plotting the data can point at genuine data-quality problems, which is
# exactly what this notebook is meant to surface.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set style for better plots
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Read the parks-toilet directory CSV into the working DataFrame `df`
csv_path = 'data/directory_of_toilets_in_public_parks_20250923.csv'
df = pd.read_csv(csv_path)

# Confirm the load and report the frame's dimensions and column labels
print("Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", list(df.columns))

## Dataset Overview

In [None]:
# Basic information about the dataset: dimensions, dtypes and memory footprint
n_rows, n_cols = df.shape
print("=== DATASET INFO ===")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print("\nColumn names and types:")
print(df.dtypes)
# deep=True inspects the actual contents of object (string) columns; the
# default shallow mode does not, and so under-reports the real footprint
# of text-heavy data.
print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

In [None]:
# Preview the top of the table (first five records) with rich display
print("=== FIRST 5 ROWS ===")
first_rows = df.head(5)
display(first_rows)

In [None]:
# Tabulate, per column, how many entries are null and what share of rows that is
print("=== MISSING VALUES ===")
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame(
    {
        'Column': missing_data.index,
        'Missing Count': missing_data.to_numpy(),
        'Missing Percentage': missing_percent.to_numpy(),
    }
)
# Show only the columns that actually have gaps
has_missing = missing_df['Missing Count'].gt(0)
display(missing_df.loc[has_missing])

## Data Quality Analysis

In [None]:
# Count fully duplicated rows and repeated names; list the duplicated rows if any
print("=== DUPLICATE ANALYSIS ===")
duplicate_mask = df.duplicated()
duplicate_row_count = duplicate_mask.sum()
repeated_name_count = df['Name'].duplicated().sum()
print(f"Total duplicate rows: {duplicate_row_count}")
print(f"Duplicate names: {repeated_name_count}")
if duplicate_row_count > 0:
    print("\nDuplicate rows:")
    display(df[duplicate_mask])

In [None]:
# Frequency table and distinct-value count for each categorical field
print("=== CATEGORICAL COLUMN ANALYSIS ===")
categorical_cols = ['Open Year-Round', 'Handicap Accessible', 'Borough']

# Only report on the expected columns that are present in the loaded frame
present_categoricals = [name for name in categorical_cols if name in df.columns]
for col in present_categoricals:
    series = df[col]
    print(f"\n{col}:")
    # dropna=False keeps missing values as their own row in the table
    print(series.value_counts(dropna=False))
    print(f"Unique values: {series.nunique()}")

## Statistical Summary

In [None]:
# String length analysis for text columns (computed over non-missing values only)
print("=== TEXT COLUMN STATISTICS ===")
text_cols = ['Name', 'Location', 'Comments']

for col in text_cols:
    if col not in df.columns:
        continue
    # Drop missing entries first: astype(str) would turn NaN into the literal
    # string 'nan' and count it as a 3-character value, skewing the statistics
    # for sparsely populated columns.
    lengths = df[col].dropna().astype(str).str.len()
    print(f"\n{col} length statistics:")
    print(f"  Non-missing values: {len(lengths)} of {len(df)}")
    if lengths.empty:
        # Nothing to summarize; avoid printing NaN statistics
        print("  No non-missing values")
        continue
    print(f"  Mean: {lengths.mean():.2f}")
    print(f"  Min: {lengths.min()}")
    print(f"  Max: {lengths.max()}")
    print(f"  Median: {lengths.median():.2f}")

## Data Visualizations

In [None]:
# 2x2 overview figure: borough share, year-round availability,
# handicap accessibility and name-length distribution.
# Each panel is drawn only when its source column exists.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Borough distribution
if 'Borough' in df.columns:
    borough_counts = df['Borough'].value_counts()
    axes[0, 0].pie(borough_counts.values, labels=borough_counts.index, autopct='%1.1f%%')
    axes[0, 0].set_title('Distribution by Borough')

# Open year-round distribution
if 'Open Year-Round' in df.columns:
    open_counts = df['Open Year-Round'].value_counts()
    axes[0, 1].bar(open_counts.index, open_counts.values)
    axes[0, 1].set_title('Open Year-Round Distribution')
    axes[0, 1].set_ylabel('Count')

# Handicap accessibility (missing values kept as their own bar)
if 'Handicap Accessible' in df.columns:
    handicap_counts = df['Handicap Accessible'].value_counts(dropna=False)
    # Plot against integer positions and label afterwards, so a NaN
    # category can still be placed on the axis.
    axes[1, 0].bar(range(len(handicap_counts)), handicap_counts.values)
    axes[1, 0].set_xticks(range(len(handicap_counts)))
    axes[1, 0].set_xticklabels(handicap_counts.index, rotation=45)
    axes[1, 0].set_title('Handicap Accessibility')
    axes[1, 0].set_ylabel('Count')

# Name length distribution
if 'Name' in df.columns:
    # Exclude missing names: astype(str) would turn NaN into 'nan' and add
    # a spurious spike at length 3.
    name_lengths = df['Name'].dropna().astype(str).str.len()
    axes[1, 1].hist(name_lengths, bins=20, alpha=0.7)
    axes[1, 1].set_title('Distribution of Name Lengths')
    axes[1, 1].set_xlabel('Character Length')
    axes[1, 1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Cross-tabulation analysis: borough against availability and accessibility
print("=== CROSS-TABULATION ANALYSIS ===")

if 'Borough' in df.columns and 'Open Year-Round' in df.columns:
    print("\nBorough vs Open Year-Round:")
    crosstab1 = pd.crosstab(df['Borough'], df['Open Year-Round'], margins=True)
    display(crosstab1)

if 'Borough' in df.columns and 'Handicap Accessible' in df.columns:
    print("\nBorough vs Handicap Accessible:")
    # Give missing accessibility values an explicit label so they always show
    # up as their own column. Depending on the installed pandas version,
    # crosstab may leave NaN keys out even with dropna=False, which would
    # silently hide those rows from the table and the margins.
    handicap_labeled = df['Handicap Accessible'].fillna('Missing')
    crosstab2 = pd.crosstab(df['Borough'], handicap_labeled, margins=True, dropna=False)
    display(crosstab2)

## Data Quality Issues & Recommendations

In [None]:
# Data-quality summary: completeness, consistency, duplicates and recommendations.
# Every column-specific check is guarded so the cell still runs when an
# expected column is absent from the loaded frame.

# Share of missing 'Comments' above which the column is flagged as mostly empty
COMMENTS_SPARSE_THRESHOLD = 0.8

# Computed once and reused by the duplicates section and the recommendations
duplicate_row_count = df.duplicated().sum()

print("=== DATA QUALITY SUMMARY ===")
print("\n1. COMPLETENESS:")
for col in df.columns:
    missing_pct = (df[col].isnull().sum() / len(df)) * 100
    if missing_pct > 0:
        print(f"   - {col}: {missing_pct:.1f}% missing values")

print("\n2. CONSISTENCY:")
if 'Open Year-Round' in df.columns:
    unique_open = df['Open Year-Round'].unique()
    print(f"   - Open Year-Round values: {unique_open}")
if 'Handicap Accessible' in df.columns:
    unique_handicap = df['Handicap Accessible'].dropna().unique()
    print(f"   - Handicap Accessible values: {unique_handicap}")

print("\n3. DUPLICATES:")
print(f"   - Duplicate rows: {duplicate_row_count}")
if 'Name' in df.columns:
    print(f"   - Duplicate names: {df['Name'].duplicated().sum()}")

print("\n4. RECOMMENDATIONS:")
recommendations = []
if 'Handicap Accessible' in df.columns and df['Handicap Accessible'].isnull().any():
    recommendations.append("Fill missing 'Handicap Accessible' values with 'Unknown' or investigate further")
if duplicate_row_count > 0:
    recommendations.append("Review and remove duplicate records")
if 'Comments' in df.columns and df['Comments'].isnull().sum() > len(df) * COMMENTS_SPARSE_THRESHOLD:
    recommendations.append("Comments column has many missing values - consider if it's necessary")

if not recommendations:
    # Make the "nothing to report" outcome explicit instead of printing nothing
    print("   None - no issues detected by these checks")
for i, rec in enumerate(recommendations, 1):
    print(f"   {i}. {rec}")