# In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import os

# NOTE(review): this silences every warning category for the whole process,
# not just noisy ones — consider narrowing the filter.
warnings.filterwarnings('ignore')

# =============================================================================
# SETUP
# =============================================================================

# Create reports directory
REPORTS_DIR = 'reports'
try:
    # Attempt the creation directly instead of checking first, so there is no
    # window between the existence check and the mkdir call.
    os.makedirs(REPORTS_DIR)
except FileExistsError:
    # The path is already present; nothing to create or announce.
    pass
else:
    print(f"Created directory: {REPORTS_DIR}/\n")

# =============================================================================
# LOAD DATA
# =============================================================================

# Announce the run with a three-line banner (rule, title, rule).
print("=" * 80, "GENERATING ANALYTICS REPORT", "=" * 80, sep="\n")

# Both inputs are read from the current working directory.
df = pd.read_csv('nyc_housing_processed.csv')
df_res = pd.read_csv('nyc_housing_residential_units.csv')

# Row counts of the two frames, thousands-separated.
print("Total records: {:,}".format(len(df)))
print("Residential units: {:,}\n".format(len(df_res)))

# =============================================================================
# GENERATE REPORT
# =============================================================================

# Fixed display orders shared by several report sections.
_BOROUGHS = ('Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island')
_PRICE_TIERS = ('Under $300K', '$300K-600K', '$600K-900K',
                '$900K-1.5M', '$1.5M-3M', 'Over $3M')
_AGE_CATEGORIES = ('New (0-10)', 'Recent (10-25)', 'Modern (25-50)',
                   'Mature (50-75)', 'Old (75-100)', 'Historic (100+)')
_RULE = "=" * 80
# Spelled as an escape so the bullet cannot be corrupted by a mis-encoded
# save of this source file.
_BULLET = "\u2022"


def _pct(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _section_header(title):
    """Return the three banner lines (rule, title, rule) that open a section."""
    return [_RULE, title, _RULE]


def _distribution_lines(column, order, total, indent, label_width, count_width):
    """Format count and share for each label of *order* that occurs in *column*."""
    counts = column.value_counts()
    return [
        f"{indent}{label:<{label_width}}: {counts[label]:>{count_width},} "
        f"({_pct(counts[label], total):>5.1f}%)"
        for label in order
        if label in counts.index
    ]


def _borough_detail_lines(borough, b_data, b_pps, total_count):
    """Return the detailed-analysis lines for one borough.

    *b_data* holds the borough's rows of the full dataset, *b_pps* its
    non-null residential price-per-sqft values, and *total_count* the row
    count of the full dataset (used for the market share).
    """
    n_rows = len(b_data)
    prices = b_data['sale_price']

    out = ["", "-" * 80, f"  {borough.upper()}", "-" * 80, ""]

    # Basic stats
    out += [
        f"  Total Properties: {n_rows:,}",
        f"  Market Share: {_pct(n_rows, total_count):.1f}%",
        f"  Total Market Value: ${prices.sum():,.0f}",
        "",
    ]

    # Price statistics
    out += [
        "  PRICE STATISTICS:",
        f"    Minimum Price: ${prices.min():,.0f}",
        f"    25th Percentile: ${prices.quantile(0.25):,.0f}",
        f"    Median Price: ${prices.median():,.0f}",
        f"    75th Percentile: ${prices.quantile(0.75):,.0f}",
        f"    Maximum Price: ${prices.max():,.0f}",
        f"    Mean Price: ${prices.mean():,.0f}",
        f"    Std Deviation: ${prices.std():,.0f}",
        "",
    ]

    # Price per sqft (from residential data); omitted when there is no sample
    if len(b_pps) > 0:
        out += [
            "  PRICE PER SQFT (Residential Only):",
            f"    Sample Size: {len(b_pps):,}",
            f"    Minimum: ${b_pps.min():,.0f}",
            f"    25th Percentile: ${b_pps.quantile(0.25):,.0f}",
            f"    Median: ${b_pps.median():,.0f}",
            f"    75th Percentile: ${b_pps.quantile(0.75):,.0f}",
            f"    Maximum: ${b_pps.max():,.0f}",
            f"    Mean: ${b_pps.mean():,.0f}",
            "",
        ]

    # Ten most frequent building categories; medians come from a single
    # groupby rather than one boolean filter per category.
    out.append("  BUILDING CATEGORIES:")
    cat_medians = b_data.groupby('building_category')['sale_price'].median()
    for cat, count in b_data['building_category'].value_counts().head(10).items():
        out.append(
            f"    {cat:<25} {count:>6,} ({_pct(count, n_rows):>5.1f}%)"
            f"  Median: ${cat_medians.loc[cat]:>12,.0f}"
        )
    out.append("")

    # Price tier distribution for this borough
    out.append("  PRICE TIER DISTRIBUTION:")
    out += _distribution_lines(b_data['price_tier'], _PRICE_TIERS, n_rows, "    ", 15, 6)
    out.append("")

    # Building age statistics (if available)
    if 'building_age' in b_data.columns:
        ages = b_data['building_age'].dropna()
        if len(ages) > 0:
            out += [
                "  BUILDING AGE:",
                f"    Median Age: {ages.median():.0f} years",
                f"    Mean Age: {ages.mean():.0f} years",
                f"    Oldest: {ages.max():.0f} years",
                f"    Newest: {ages.min():.0f} years",
                "",
            ]

    # Age category distribution (if available)
    if 'age_category' in b_data.columns:
        out.append("  AGE CATEGORY DISTRIBUTION:")
        out += _distribution_lines(b_data['age_category'], _AGE_CATEGORIES, n_rows, "    ", 20, 6)
        out.append("")

    # Building size stats; only rows with a positive area are considered
    sizes = b_data.loc[b_data['bldgarea'] > 0, 'bldgarea']
    if len(sizes) > 0:
        out += [
            "  BUILDING SIZE (sqft):",
            f"    Median: {sizes.median():,.0f}",
            f"    Mean: {sizes.mean():,.0f}",
            f"    Range: {sizes.min():,.0f} - {sizes.max():,.0f}",
            "",
        ]

    # Top 5 highest sales
    out.append("  TOP 5 HIGHEST SALES:")
    top = b_data.nlargest(5, 'sale_price')
    top_rows = zip(top['sale_price'], top['building_category'], top['bldgarea'])
    for rank, (price, cat, area) in enumerate(top_rows, 1):
        sqft = f"{area:,.0f} sqft" if pd.notna(area) and area > 0 else "N/A"
        out.append(f"    {rank}. ${price:>12,.0f}  ({cat}, {sqft})")
    out.append("")

    return out


def generate_report(data=None, residential=None):
    """Build the NYC housing market analytics report as one string.

    Args:
        data: DataFrame of all sales. The report reads its ``borough_name``,
            ``sale_price``, ``building_category``, ``price_tier`` and
            ``bldgarea`` columns; ``building_age`` and ``age_category`` are
            used only when present. Defaults to the module-level ``df``.
        residential: DataFrame used for price-per-sqft figures; its
            ``borough_name`` and ``price_per_sqft`` columns are read.
            Defaults to the module-level ``df_res``.

    Returns:
        The report text: lines joined with newlines, no trailing newline.
    """
    # Fall back to the module-level frames so zero-argument calls keep working.
    data = df if data is None else data
    residential = df_res if residential is None else residential

    total = len(data)
    prices = data['sale_price']

    # Slice each borough once; sections 2, 3 and 6 all reuse these.
    borough_frames = {name: data[data['borough_name'] == name] for name in _BOROUGHS}
    borough_pps = {
        name: residential.loc[
            residential['borough_name'] == name, 'price_per_sqft'
        ].dropna()
        for name in _BOROUGHS
    }

    lines = [
        _RULE,
        "NYC HOUSING MARKET ANALYTICS REPORT",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        _RULE,
        "",
    ]

    # =========================================================================
    # 1. EXECUTIVE SUMMARY
    # =========================================================================
    valid_pps = residential['price_per_sqft'].dropna()
    lines += _section_header("1. EXECUTIVE SUMMARY")
    lines += [
        "",
        f"Total Properties: {total:,}",
        f"Residential Units (for $/sqft analysis): {len(residential):,}",
        f"Total Market Value: ${prices.sum():,.0f}",
        "",
        f"Median Sale Price: ${prices.median():,.0f}",
        f"Mean Sale Price: ${prices.mean():,.0f}",
        "",
        f"Median Price/SqFt (Residential): ${valid_pps.median():,.0f}",
        f"Mean Price/SqFt (Residential): ${valid_pps.mean():,.0f}",
        "",
    ]

    # =========================================================================
    # 2. BOROUGH OVERVIEW
    # =========================================================================
    lines += _section_header("2. BOROUGH OVERVIEW")
    lines += [
        "",
        f"{'Borough':<20} {'Count':>10} {'Median Price':>15} {'Median $/SqFt':>15}",
        "-" * 65,
    ]
    for borough in _BOROUGHS:
        b_data = borough_frames[borough]
        b_pps = borough_pps[borough]
        median = b_data['sale_price'].median()
        # A borough without residential samples is shown as $0/sqft.
        pps = b_pps.median() if len(b_pps) > 0 else 0
        lines.append(
            f"{borough:<20} {len(b_data):>10,} ${median:>13,.0f} ${pps:>13,.0f}"
        )
    lines.append("")

    # =========================================================================
    # 3. DETAILED BOROUGH ANALYSIS
    # =========================================================================
    lines += _section_header("3. DETAILED BOROUGH ANALYSIS")
    for borough in _BOROUGHS:
        lines += _borough_detail_lines(
            borough, borough_frames[borough], borough_pps[borough], total
        )

    # =========================================================================
    # 4. BUILDING TYPE ANALYSIS
    # =========================================================================
    lines += _section_header("4. BUILDING TYPE ANALYSIS")
    lines += [
        "",
        f"{'Category':<30} {'Count':>10} {'%':>7} {'Median Price':>15}",
        "-" * 65,
    ]
    cat_stats = (
        data.groupby('building_category')['sale_price']
        .agg(['count', 'median'])
        .sort_values('count', ascending=False)
    )
    for cat, count, median in zip(cat_stats.index, cat_stats['count'], cat_stats['median']):
        lines.append(
            f"{cat:<30} {int(count):>10,} {_pct(count, total):>6.1f}% ${median:>13,.0f}"
        )
    lines.append("")

    # =========================================================================
    # 5. PRICE TIER DISTRIBUTION (OVERALL)
    # =========================================================================
    lines += _section_header("5. PRICE TIER DISTRIBUTION (OVERALL)")
    lines.append("")
    lines += _distribution_lines(data['price_tier'], _PRICE_TIERS, total, "  ", 15, 8)
    lines.append("")

    # =========================================================================
    # 6. BOROUGH COMPARISON SUMMARY
    # =========================================================================
    lines += _section_header("6. BOROUGH COMPARISON SUMMARY")
    lines.append("")

    # Median price ranking, most expensive first
    borough_medians = (
        data.groupby('borough_name')['sale_price'].median().sort_values(ascending=False)
    )
    lines.append("  MEDIAN PRICE RANKING:")
    for rank, (borough, median) in enumerate(borough_medians.items(), 1):
        lines.append(f"    {rank}. {borough:<20} ${median:>12,.0f}")
    lines.append("")

    # Price per sqft ranking; boroughs without residential samples are left out
    lines.append("  PRICE PER SQFT RANKING (Residential):")
    pps_medians = {
        name: pps.median() for name, pps in borough_pps.items() if len(pps) > 0
    }
    pps_sorted = sorted(pps_medians.items(), key=lambda item: item[1], reverse=True)
    for rank, (borough, pps) in enumerate(pps_sorted, 1):
        lines.append(f"    {rank}. {borough:<20} ${pps:>8,.0f}/sqft")
    lines.append("")

    # Property count ranking
    lines.append("  PROPERTY COUNT RANKING:")
    borough_counts = data['borough_name'].value_counts()
    for rank, (borough, count) in enumerate(borough_counts.items(), 1):
        lines.append(f"    {rank}. {borough:<20} {count:>8,} ({_pct(count, total):.1f}%)")
    lines.append("")

    # =========================================================================
    # 7. KEY INSIGHTS
    # =========================================================================
    lines += _section_header("7. KEY INSIGHTS")
    lines.append("")

    # idxmax/idxmin raise on an empty series, so skip the borough insights
    # when there is nothing to rank.
    if not borough_medians.empty:
        most_expensive_borough = borough_medians.idxmax()
        least_expensive_borough = borough_medians.idxmin()
        price_gap = borough_medians.max() / borough_medians.min()
        most_properties_borough = borough_counts.idxmax()

        lines += [
            f"  {_BULLET} Most Expensive Borough: {most_expensive_borough}",
            f"    (Median: ${borough_medians.max():,.0f})",
            "",
            f"  {_BULLET} Most Affordable Borough: {least_expensive_borough}",
            f"    (Median: ${borough_medians.min():,.0f})",
            "",
            f"  {_BULLET} Price Gap: {most_expensive_borough} is {price_gap:.1f}x"
            f" more expensive than {least_expensive_borough}",
            "",
            f"  {_BULLET} Highest Transaction Volume: {most_properties_borough}",
            f"    ({borough_counts.max():,} properties)",
            "",
        ]

    # Most common property type overall (single value_counts pass)
    type_counts = data['building_category'].value_counts()
    if not type_counts.empty:
        most_common_type = type_counts.idxmax()
        most_common_count = type_counts.max()
        lines += [
            f"  {_BULLET} Most Common Property Type: {most_common_type}",
            f"    ({most_common_count:,} properties, {_pct(most_common_count, total):.1f}%)",
            "",
        ]

    # =========================================================================
    # END
    # =========================================================================
    lines += _section_header("END OF REPORT")

    return "\n".join(lines)

# =============================================================================
# SAVE REPORT
# =============================================================================

report = generate_report()

report_path = os.path.join(REPORTS_DIR, 'nyc_housing_analytics_report.txt')
# Write as UTF-8 explicitly: the report text contains non-ASCII characters
# and the platform default encoding differs between systems.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)

print(report)
# \U0001F4BE is the floppy-disk emoji; escaped so it cannot be mangled by a
# mis-encoded save of this source file.
print(f"\n\U0001F4BE Saved: {report_path}")