# Exploratory Data Analysis

This notebook provides a template for performing exploratory data analysis on your datasets.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src directory to path to import custom modules
sys.path.append('../src')
from data_loader import load_csv, load_excel, get_data_info
from analysis import (
    set_plot_style, explore_distributions, correlation_analysis,
    outlier_detection, create_boxplots, categorical_analysis
)

# Apply the project's shared plotting style (helper imported from analysis above)
set_plot_style()

# Set the column-count, total-width and per-cell-width display options to None
# so wide frames are not truncated when displayed
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [13]:
# Load the parks properties extract, list its columns, and preview the
# address/location fields of the first rows
df = pd.read_csv("../data/ext_data/parks_properties_20250929.csv")
print(df.columns)

location_fields = ['ADDRESS', 'BOROUGH', 'ZIPCODE', 'LOCATION']
df.loc[:, location_fields].head()


Index(['ACQUISITIONDATE', 'ACRES', 'ADDRESS', 'BOROUGH', 'CLASS',
       'COMMUNITYBOARD', 'COUNCILDISTRICT', 'DEPARTMENT', 'EAPPLY', 'GISOBJID',
       'GISPROPNUM', 'GlobalID', 'JURISDICTION', 'LOCATION', 'MAPPED',
       'NAME311', 'NYS_ASSEMBLY', 'NYS_SENATE', 'OBJECTID', 'OMPPROPID',
       'PARENTID', 'PERMIT', 'PERMITDISTRICT', 'PERMITPARENT', 'PIP_RATABLE',
       'PRECINCT', 'RETIRED', 'SIGNNAME', 'SUBCATEGORY', 'TYPECATEGORY',
       'US_CONGRESS', 'WATERFRONT', 'ZIPCODE', 'multipolygon'],
      dtype='object')


Unnamed: 0,ADDRESS,BOROUGH,ZIPCODE,LOCATION
0,88-02 ATLANTIC AVENUE,Q,11416,"Atlantic Ave., 95 Ave. bet. 88 St., 89 St."
1,3324 RESERVOIR OVAL EAST,X,10467,"Van Cortlandt Ave. East, Resevoir Oval E"
2,,Q,11417,133 Ave. bet. 82 St. and 86 St.
3,,M,"10024, 10025","Riverside Dr., W. 91 St. To W. 95 St."
4,675 RIVERSIDE DRIVE,M,"10027, 10031","Riverside Dr to Henry Hudson Pkwy, W 153 St"


In [None]:
# Load the toilets directory; every later cell that uses `df` analyzes this
# dataset, because it replaces the parks frame loaded in the previous cell.
df = pd.read_csv("../data/ext_data/directory_of_toilets_.csv")
print(df.columns)

# The preview columns were carried over from the parks cell and may not all
# exist in this file, so select only the ones that are present instead of
# failing with a KeyError and stopping a Run All.
preview_columns = ['ADDRESS', 'BOROUGH', 'ZIPCODE', 'LOCATION']
available_columns = [col for col in preview_columns if col in df.columns]
missing_columns = [col for col in preview_columns if col not in df.columns]
if missing_columns:
    print(f"Preview columns not found in this file: {missing_columns}")

# Fall back to a plain head() when none of the expected columns exist
df[available_columns].head() if available_columns else df.head()

## 2. Dataset Overview

In [None]:
# Get comprehensive dataset information
# `get_data_info` is the project helper imported from data_loader in the first cell;
# `df` is whichever frame the most recent load cell assigned.
# NOTE(review): the helper's output format is not visible from this notebook —
# confirm whether it prints a report or returns a value that Jupyter then displays.
get_data_info(df)

## 3. Distribution Analysis

Explore the distributions of numeric variables.

In [None]:
# Collect the names of all numeric columns, leaving out a column literally
# named 'id' (an identifier, not a measurement) if one is present
numeric_columns = [
    column_name
    for column_name in df.select_dtypes(include=[np.number]).columns.tolist()
    if column_name != 'id'
]

# Plot the distributions of the selected columns via the project helper
explore_distributions(df, columns=numeric_columns)

## 4. Correlation Analysis

In [None]:
# Compute the correlation matrix with the project helper; the result is used
# below as a square DataFrame indexed by variable name
corr_matrix = correlation_analysis(df)

# Rank every unordered pair of variables by the magnitude of its correlation
if not corr_matrix.empty:
    print("\nStrongest correlations (excluding self-correlations):")

    labels = corr_matrix.columns
    n_vars = len(labels)
    # Walk the strict upper triangle so each pair appears exactly once and the
    # diagonal (self-correlation) is skipped
    corr_pairs = [
        (labels[i], labels[j], corr_matrix.iloc[i, j])
        for i in range(n_vars)
        for j in range(i + 1, n_vars)
    ]

    corr_pairs_df = pd.DataFrame(corr_pairs, columns=['Var1', 'Var2', 'Correlation'])
    # Order rows by absolute correlation, strongest first, keeping the signed value
    by_strength = corr_pairs_df['Correlation'].abs().sort_values(ascending=False).index
    corr_pairs_df = corr_pairs_df.reindex(by_strength)
    print(corr_pairs_df.head(10))

## 5. Outlier Detection

In [None]:
# Flag outliers with the IQR method via the project helper
outlier_df = outlier_detection(df, columns=numeric_columns, method='iqr')

# Tally the flags: keep only the columns whose name ends in '_outlier',
# sum each one, and list the columns with the most flags first
outlier_columns = list(filter(lambda name: name.endswith('_outlier'), outlier_df.columns))
outlier_summary = outlier_df.loc[:, outlier_columns].sum().sort_values(ascending=False)
print("Outlier counts by column:", outlier_summary, sep="\n")

# Box plots of the same numeric columns for a visual check
create_boxplots(df, columns=numeric_columns)

## 6. Categorical Variable Analysis

In [None]:
# Analyze categorical variables
# `categorical_analysis` comes from the project's analysis module (imported in the first cell).
# NOTE(review): max_categories=20 presumably caps how many distinct values are
# reported or plotted per column — confirm against the helper's implementation.
categorical_analysis(df, max_categories=20)

## 7. Relationships Between Variables

Explore relationships between different types of variables.

In [None]:
# Example: numeric vs. categorical relationships.
# Relies on `df` and `numeric_columns` defined in earlier cells.

# Upper bound on distinct values for a column to be usable as a plot grouping;
# matches the max_categories=20 passed to categorical_analysis earlier.
MAX_PLOT_CATEGORIES = 20

categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Object-dtype columns are often free text (addresses, dates, ids). Grouping a
# box/violin plot by such a column draws one group per distinct value, which is
# unreadable and very slow, so only low-cardinality columns are considered.
plottable_cat_cols = [
    col for col in categorical_cols
    if 1 <= df[col].nunique() <= MAX_PLOT_CATEGORIES
]

if plottable_cat_cols and len(numeric_columns) > 0:
    # Pick the first suitable categorical and the first numeric column for demonstration
    cat_col = plottable_cat_cols[0]
    num_col = numeric_columns[0]

    # Explicit axes instead of the pyplot state machine: each plot is bound to its panel
    fig, (box_ax, violin_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Box plot
    sns.boxplot(data=df, x=cat_col, y=num_col, ax=box_ax)
    box_ax.set_title(f'{num_col} by {cat_col}')
    box_ax.tick_params(axis='x', rotation=45)

    # Violin plot
    sns.violinplot(data=df, x=cat_col, y=num_col, ax=violin_ax)
    violin_ax.set_title(f'{num_col} by {cat_col} (Violin Plot)')
    violin_ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    plt.show()
else:
    print(
        f"No categorical column with at most {MAX_PLOT_CATEGORIES} distinct values "
        "(or no numeric column) - skipping relationship plots."
    )

## 8. Summary and Next Steps

Based on the exploratory analysis, document your key findings and next steps.

### Key Findings:
1. Dataset contains X rows and Y columns
2. Missing values found in: [list columns]
3. Outliers detected in: [list columns]
4. Strong correlations between: [list variable pairs]
5. Categorical variables distribution: [describe patterns]

### Next Steps:
1. Data cleaning and preprocessing
2. Feature engineering based on findings
3. Model selection and training
4. Further analysis of specific relationships

### Questions for Further Investigation:
1. [Add questions based on your findings]
2. [Continue list as needed]