# LinkNYC Permits Analysis

Comprehensive analysis of LinkNYC new site permit applications, including status tracking, geographic distribution, and timeline analysis.


## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style sheet combined with
# seaborn's "viridis" palette
plt.style.use('default')
sns.set_palette("viridis")
# IPython magic: render figures inline in the notebook output
%matplotlib inline

In [None]:
# Read the permit-application export from disk into a DataFrame
data_file = '../data/LinkNYC_New_Site_Permit_Applications_20250924.csv'
df = pd.read_csv(data_file)

# Header cells may carry stray leading/trailing whitespace; normalize them
df.columns = df.columns.str.strip()

# Parse the milestone columns that are actually present.
# errors='coerce' turns unparseable values into NaT instead of raising.
date_cols = [
    'Final Submission (A)',
    'GF DoITT Submitted to CB (A)',
    'GF CB Comment Period Ends (A)',
]
present_date_cols = [name for name in date_cols if name in df.columns]
for name in present_date_cols:
    df[name] = pd.to_datetime(df[name], errors='coerce')

print("Data loaded successfully!")
print(f"Shape: {df.shape}")
df.head()

## Dataset Overview

In [None]:
# Headline figures for the loaded DataFrame: dimensions and memory footprint
banner = "=" * 80
n_rows, n_cols = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(banner)
print("LINKNYC PERMITS DATA PROFILE")
print(banner)

print("\n📊 DATASET OVERVIEW")
print('─' * 40)
print(f"Shape: {n_rows:,} rows × {n_cols} columns")
print(f"Memory usage: {memory_mb:.2f} MB")

## Data Quality Assessment

In [None]:
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0.

    Guards the ratio computations below so an empty DataFrame produces
    "0.0%" instead of a ZeroDivisionError / NaN.
    """
    return (part / whole) * 100 if whole else 0.0


# Per-column profile: dtype, null count and share, number of distinct values
print(f"\n📋 COLUMNS")
print(f"{'─' * 70}")
row_count = len(df)
for position, col in enumerate(df.columns, start=1):
    dtype = df[col].dtype
    null_count = df[col].isnull().sum()
    null_pct = _percent(null_count, row_count)
    unique_count = df[col].nunique()
    print(f"{position:2d}. {col[:35]:<35} {str(dtype):<15} {null_count:>6} nulls ({null_pct:5.1f}%) {unique_count:>6} unique")

# Data quality metrics over the whole table
print(f"\n🔍 DATA QUALITY")
print(f"{'─' * 40}")
total_cells = df.shape[0] * df.shape[1]
missing_cells = df.isnull().sum().sum()
# Rows without a single missing value; computed once and reused below
complete_rows = len(df.dropna())
print(f"Total cells: {total_cells:,}")
print(f"Missing cells: {missing_cells:,} ({_percent(missing_cells, total_cells):.2f}%)")
print(f"Complete rows: {complete_rows:,} ({_percent(complete_rows, row_count):.1f}%)")

# Show missing data pattern: only columns that actually have gaps,
# ordered from most to fewest missing values
missing_data = df.isnull().sum().sort_values(ascending=False)
missing_data = missing_data[missing_data > 0]

if len(missing_data) > 0:
    plt.figure(figsize=(12, 6))
    missing_data.plot(kind='bar', color='coral', edgecolor='darkred')
    plt.title('Missing Data by Column', fontweight='bold')
    plt.ylabel('Number of Missing Values')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Installation Status Analysis

In [None]:
print(f"\n🔧 INSTALLATION STATUS")
print(f"{'─' * 40}")
# Column names are stripped of surrounding whitespace when the data is
# loaded, so a lookup key with a trailing space ('Installation Status ')
# can never match. Compare on the stripped name instead, which works
# whether or not the headers were normalized.
status_col = next(
    (name for name in df.columns if str(name).strip() == 'Installation Status'),
    None,
)
if status_col is not None:
    status_counts = df[status_col].value_counts()
    print("Installation status distribution:")
    for status, count in status_counts.items():
        # Share of all rows (rows with a missing status are not listed)
        pct = (count / len(df)) * 100
        print(f"• {status}: {count:,} ({pct:.1f}%)")

    # Visualize installation status: proportions (left) and counts (right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Pie chart
    status_counts.plot(kind='pie', ax=ax1, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Installation Status Distribution', fontweight='bold')
    ax1.set_ylabel('')

    # Bar chart
    status_counts.plot(kind='bar', ax=ax2, color='lightblue', edgecolor='navy')
    ax2.set_title('Installation Status Count', fontweight='bold')
    ax2.set_ylabel('Number of Permits')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Geographic Distribution

In [None]:
print("\n🗺️  GEOGRAPHIC DISTRIBUTION")
print('─' * 40)
if 'Borough' in df.columns:
    borough_counts = df['Borough'].value_counts()
    total_permits = len(df)

    # Text summary: one bullet per borough with its share of all rows
    print("Borough distribution:")
    for borough, count in borough_counts.items():
        share = (count / total_permits) * 100
        print(f"• {borough}: {count:,} ({share:.1f}%)")

    # Side-by-side charts: absolute counts (left) and proportions (right)
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

    borough_counts.plot(kind='bar', ax=bar_ax, color='lightgreen', edgecolor='darkgreen')
    bar_ax.set_title('Permits by Borough', fontweight='bold')
    bar_ax.set_ylabel('Number of Permits')
    bar_ax.tick_params(axis='x', rotation=45)
    bar_ax.grid(True, alpha=0.3)

    borough_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Borough Distribution', fontweight='bold')
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## Kiosk Type Analysis

In [None]:
if 'Planned Kiosk Type' in df.columns:
    print("\n📡 KIOSK TYPES")
    print('─' * 40)
    type_counts = df['Planned Kiosk Type'].value_counts()
    total_permits = len(df)
    for kiosk_type, count in type_counts.items():
        share = (count / total_permits) * 100
        print(f"• {kiosk_type}: {count:,} ({share:.1f}%)")

    # Bar chart of permits per planned kiosk type
    colors = ['skyblue', 'lightcoral', 'lightgreen']
    fig, type_ax = plt.subplots(figsize=(10, 6))
    type_counts.plot(
        kind='bar',
        ax=type_ax,
        color=colors[:len(type_counts)],
        edgecolor='black',
    )
    type_ax.set_title('Permit Kiosk Type Distribution', fontsize=14, fontweight='bold')
    type_ax.set_ylabel('Number of Permits')
    type_ax.tick_params(axis='x', rotation=45)
    type_ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Stacked bars: kiosk-type mix within each borough
    if 'Borough' in df.columns:
        cross_tab = pd.crosstab(df['Borough'], df['Planned Kiosk Type'])

        fig, mix_ax = plt.subplots(figsize=(10, 6))
        cross_tab.plot(kind='bar', stacked=True, ax=mix_ax)
        mix_ax.set_title('Kiosk Types by Borough', fontweight='bold')
        mix_ax.set_ylabel('Number of Permits')
        mix_ax.set_xlabel('Borough')
        mix_ax.tick_params(axis='x', rotation=45)
        # Legend sits outside the plot area, anchored to the top-right corner
        mix_ax.legend(title='Kiosk Type', bbox_to_anchor=(1.05, 1), loc='upper left')
        mix_ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

## Community Board and Council District Analysis

In [None]:
# Top-10 tallies. An empty Series stands in for a missing column so the
# plotting code below never references an undefined name.
cb_counts = pd.Series(dtype='int64')
cd_counts = pd.Series(dtype='int64')

if 'Community Board' in df.columns:
    print(f"\n🏛️  COMMUNITY BOARDS")
    print(f"{'─' * 40}")
    cb_counts = df['Community Board'].value_counts().head(10)
    print("Top 10 Community Boards:")
    for cb, count in cb_counts.items():
        print(f"• CB {cb}: {count:,} permits")

if 'Council District' in df.columns:
    print(f"\n🏛️  COUNCIL DISTRICTS")
    print(f"{'─' * 40}")
    cd_counts = df['Council District'].value_counts().head(10)
    print("Top 10 Council Districts:")
    for cd, count in cd_counts.items():
        print(f"• District {cd}: {count:,} permits")

# Visualize community boards and council districts whenever at least one
# of the two tallies has data; a panel without data is hidden.
if len(cb_counts) > 0 or len(cd_counts) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Community boards
    if len(cb_counts) > 0:
        cb_counts.plot(kind='bar', ax=ax1, color='orange', edgecolor='darkorange')
        ax1.set_title('Top Community Boards', fontweight='bold')
        ax1.set_ylabel('Number of Permits')
        ax1.set_xlabel('Community Board')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(True, alpha=0.3)
    else:
        ax1.set_visible(False)

    # Council districts
    if len(cd_counts) > 0:
        # 'darkpurple' is not a Matplotlib named color and raises
        # ValueError; 'indigo' is a valid dark purple.
        cd_counts.plot(kind='bar', ax=ax2, color='purple', edgecolor='indigo')
        ax2.set_title('Top Council Districts', fontweight='bold')
        ax2.set_ylabel('Number of Permits')
        ax2.set_xlabel('Council District')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True, alpha=0.3)
    else:
        ax2.set_visible(False)

    plt.tight_layout()
    plt.show()

## Timeline Analysis

In [None]:
if 'Final Submission (A)' in df.columns:
    print(f"\n📅 PERMIT TIMELINE")
    print(f"{'─' * 40}")
    # .copy() yields an independent frame, so adding the helper column
    # below is a plain assignment rather than a write into a derived slice.
    submission_df = df.dropna(subset=['Final Submission (A)']).copy()
    if len(submission_df) > 0:
        submitted = submission_df['Final Submission (A)']
        print(f"Permits with submission dates: {len(submission_df):,}")
        print(f"First submission: {submitted.min().strftime('%Y-%m-%d')}")
        print(f"Latest submission: {submitted.max().strftime('%Y-%m-%d')}")

        # Submissions by year, in chronological order
        submission_df['Submission_Year'] = submitted.dt.year
        yearly_submissions = submission_df['Submission_Year'].value_counts().sort_index()
        print("\nSubmissions by year:")
        for year, count in yearly_submissions.items():
            print(f"• {int(year)}: {count:,} permits")

        # Visualize timeline: yearly totals (top) and monthly series (bottom)
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        # Submissions by year.
        # 'darkteal' is not a Matplotlib named color and raises ValueError;
        # 'darkslategray' is a valid dark teal-grey.
        yearly_submissions.plot(kind='bar', ax=ax1, color='teal', edgecolor='darkslategray')
        ax1.set_title('Permit Submissions by Year', fontweight='bold')
        ax1.set_ylabel('Number of Submissions')
        ax1.grid(True, alpha=0.3)

        # Monthly timeline.
        # NOTE(review): pandas 2.2+ prefers the 'ME' alias for month-end
        # resampling; 'M' is kept for compatibility with older pandas.
        # Confirm against the installed version.
        submission_df.set_index('Final Submission (A)').resample('M').size().plot(
            ax=ax2, kind='line', marker='o', linewidth=2, color='blue')
        ax2.set_title('Monthly Permit Submission Timeline', fontweight='bold')
        ax2.set_ylabel('Number of Submissions')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## Business Improvement Districts and Historic Districts

In [None]:
# Historic district analysis.
# Start from an empty tally so the plotting code in the BID section can
# fall back to its "no data" panel when the historic column is absent,
# instead of failing on an undefined name.
historic_counts = pd.Series(dtype='int64')
if 'Link In Historic District' in df.columns:
    print(f"\n🏛️  HISTORIC DISTRICTS")
    print(f"{'─' * 40}")
    historic_counts = df['Link In Historic District'].value_counts()
    for status, count in historic_counts.items():
        pct = (count / len(df)) * 100
        print(f"• {status}: {count:,} ({pct:.1f}%)")

# BID analysis
if 'Site in Business Improvement District (BID)?' in df.columns:
    print(f"\n💼 BUSINESS IMPROVEMENT DISTRICTS")
    print(f"{'─' * 40}")
    bid_counts = df['Site in Business Improvement District (BID)?'].value_counts()
    for status, count in bid_counts.items():
        pct = (count / len(df)) * 100
        print(f"• {status}: {count:,} ({pct:.1f}%)")

    # Visualize BID and Historic District status side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # BID status
    bid_counts.plot(kind='pie', ax=ax1, autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightcoral'])
    ax1.set_title('Business Improvement District Status', fontweight='bold')
    ax1.set_ylabel('')

    # Historic district status (placeholder text when no data is available)
    if len(historic_counts) > 0 and historic_counts.sum() > 0:
        historic_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%', startangle=90, colors=['lightgreen', 'lightyellow'])
        ax2.set_title('Historic District Status', fontweight='bold')
        ax2.set_ylabel('')
    else:
        ax2.text(0.5, 0.5, 'No Historic District\nData Available',
                 ha='center', va='center', transform=ax2.transAxes, fontsize=12)
        ax2.set_title('Historic District Status', fontweight='bold')

    plt.tight_layout()
    plt.show()

## Geographic Coordinates Analysis

In [None]:
# Geographic analysis: coordinate coverage plus a scatter plot of locations
if 'Latitude' in df.columns and 'Longitude' in df.columns:
    print(f"\n🗺️  GEOGRAPHIC COORDINATES")
    print(f"{'─' * 40}")

    # 'Borough' is optional: select it only when present so its absence
    # does not raise KeyError, and so the un-grouped scatter below is
    # actually reachable.
    has_borough = 'Borough' in df.columns
    coord_cols = ['Latitude', 'Longitude'] + (['Borough'] if has_borough else [])

    # Only a missing coordinate disqualifies a row. A missing borough must
    # not, otherwise "Permits with coordinates" is under-reported.
    coords_df = df[coord_cols].dropna(subset=['Latitude', 'Longitude'])
    if has_borough:
        # Give rows without a borough an explicit group so they are plotted
        coords_df = coords_df.assign(Borough=coords_df['Borough'].fillna('Unknown'))

    print(f"Permits with coordinates: {len(coords_df):,}")
    if len(coords_df) > 0:
        print(f"Latitude range: {coords_df['Latitude'].min():.6f} to {coords_df['Latitude'].max():.6f}")
        print(f"Longitude range: {coords_df['Longitude'].min():.6f} to {coords_df['Longitude'].max():.6f}")

        # Scatter plot of permit locations
        plt.figure(figsize=(12, 8))

        if has_borough:
            # One color per borough, sampled evenly from the Set1 colormap
            boroughs = coords_df['Borough'].unique()
            colors = plt.cm.Set1(np.linspace(0, 1, len(boroughs)))

            for color, borough in zip(colors, boroughs):
                borough_data = coords_df[coords_df['Borough'] == borough]
                plt.scatter(borough_data['Longitude'], borough_data['Latitude'],
                            c=[color], label=borough, alpha=0.7, s=50)
        else:
            plt.scatter(coords_df['Longitude'], coords_df['Latitude'], alpha=0.7, s=50)

        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.title('LinkNYC Permit Application Locations', fontweight='bold')
        if has_borough:
            plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

## Permit Process Analysis

In [None]:
# Analyze permit process timeline: coverage of each milestone date and the
# elapsed time between final submission and submission to the CB
date_columns = ['Final Submission (A)', 'GF DoITT Submitted to CB (A)', 'GF CB Comment Period Ends (A)']
available_dates = [col for col in date_columns if col in df.columns]

if len(available_dates) > 1:
    print(f"\n📋 PERMIT PROCESS TIMELINE")
    print(f"{'─' * 40}")

    # Rows that have at least one of the milestone dates filled in
    process_df = df[available_dates].dropna(how='all')
    print(f"Permits with process dates: {len(process_df):,}")

    for col in available_dates:
        non_null_count = df[col].notna().sum()
        print(f"• {col}: {non_null_count} permits ({non_null_count/len(df)*100:.1f}%)")

    # Calculate processing times (if we have sequential dates)
    if 'Final Submission (A)' in df.columns and 'GF DoITT Submitted to CB (A)' in df.columns:
        processing_time = df['GF DoITT Submitted to CB (A)'] - df['Final Submission (A)']
        valid_times = processing_time.dropna()

        if len(valid_times) > 0:
            # Timedelta.days is the whole-day component only, so using it
            # would truncate the statistics (3.7 days would print as 3.0).
            # Dividing by a one-day Timedelta keeps the fractional part.
            one_day = pd.Timedelta(days=1)
            mean_days = valid_times.mean() / one_day
            median_days = valid_times.median() / one_day

            print(f"\nProcessing time analysis:")
            print(f"• Average time to CB submission: {mean_days:.1f} days")
            print(f"• Median time to CB submission: {median_days:.1f} days")

            # Histogram of processing times (whole days per permit)
            plt.figure(figsize=(10, 6))
            plt.hist(valid_times.dt.days, bins=20, alpha=0.7, edgecolor='black', color='lightblue')
            plt.title('Distribution of Processing Times (Final Submission to CB)', fontweight='bold')
            plt.xlabel('Days')
            plt.ylabel('Number of Permits')
            plt.grid(True, alpha=0.3)
            plt.show()

## Summary Dashboard

In [None]:
# Create summary dashboard: a 2x2 grid recapping the main distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('LinkNYC Permits Analysis Dashboard', fontsize=16, fontweight='bold')

# 1. Installation status pie chart.
# Column names are stripped of surrounding whitespace when the data is
# loaded, so a key with a trailing space ('Installation Status ') can
# never match. Compare on the stripped name instead.
status_col = next(
    (name for name in df.columns if str(name).strip() == 'Installation Status'),
    None,
)
if status_col is not None:
    status_counts = df[status_col].value_counts()
    axes[0,0].pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%', startangle=90)
    axes[0,0].set_title('Installation Status Distribution')

# 2. Borough distribution bar chart
if 'Borough' in df.columns:
    borough_counts = df['Borough'].value_counts()
    axes[0,1].bar(borough_counts.index, borough_counts.values, color='lightgreen', edgecolor='darkgreen')
    axes[0,1].set_title('Permits by Borough')
    axes[0,1].tick_params(axis='x', rotation=45)
    axes[0,1].grid(True, alpha=0.3)

# 3. Timeline of submissions
if 'Final Submission (A)' in df.columns:
    # .copy() yields an independent frame, so adding the helper column is
    # a plain assignment rather than a write into a derived slice.
    submission_df = df.dropna(subset=['Final Submission (A)']).copy()
    if len(submission_df) > 0:
        submission_df['Submission_Year'] = submission_df['Final Submission (A)'].dt.year
        yearly_submissions = submission_df['Submission_Year'].value_counts().sort_index()
        # 'darkteal' is not a Matplotlib named color and raises ValueError;
        # 'darkslategray' is a valid dark teal-grey.
        axes[1,0].bar(yearly_submissions.index, yearly_submissions.values, color='teal', edgecolor='darkslategray')
        axes[1,0].set_title('Permit Submissions by Year')
        axes[1,0].set_xlabel('Year')
        axes[1,0].set_ylabel('Number of Submissions')
        axes[1,0].grid(True, alpha=0.3)

# 4. Kiosk type distribution
if 'Planned Kiosk Type' in df.columns:
    type_counts = df['Planned Kiosk Type'].value_counts()
    colors = ['skyblue', 'lightcoral', 'lightgreen']
    # Bars are placed at integer positions and labelled afterwards
    axes[1,1].bar(range(len(type_counts)), type_counts.values,
                  color=colors[:len(type_counts)], edgecolor='black')
    axes[1,1].set_xticks(range(len(type_counts)))
    axes[1,1].set_xticklabels(type_counts.index, rotation=45)
    axes[1,1].set_title('Permit Kiosk Types')
    axes[1,1].set_ylabel('Number of Permits')
    axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Analysis Summary

In [None]:
# Closing recap: run timestamp, dataset size, completeness and top categories
print("\n" + "="*60)
print("PERMITS ANALYSIS SUMMARY")
print("="*60)
print(f"Analysis completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Dataset: LinkNYC New Site Permit Applications")
print(f"Total permits analyzed: {len(df):,}")
# Completeness = share of cells that are not missing
print(f"Data completeness: {((df.shape[0] * df.shape[1] - df.isnull().sum().sum()) / (df.shape[0] * df.shape[1]) * 100):.1f}%")

if 'Borough' in df.columns:
    borough_ranking = df['Borough'].value_counts()
    # value_counts() is empty when the column has no non-null values;
    # indexing [0] would then raise IndexError, so skip the line instead.
    if not borough_ranking.empty:
        top_borough = borough_ranking.index[0]
        print(f"Primary focus: {top_borough} has the most permit applications")

if 'Planned Kiosk Type' in df.columns:
    type_ranking = df['Planned Kiosk Type'].value_counts()
    # Same guard as above for an empty or all-missing column
    if not type_ranking.empty:
        top_type = type_ranking.index[0]
        print(f"Most common type: {top_type} permits are most frequent")

print(f"Key insight: Recent permit activity suggests continued LinkNYC network expansion")