# 01. Data Exploration and Understanding

**Author**: Rafsamjani Anugrah  
**Date**: 2024  
**Project**: Credit Risk Prediction - ID/X Partners  

## Tujuan Notebook

Notebook ini berfokus pada:
1. Memuat dan memahami dataset loan data 2007-2014
2. Analisis awal struktur data dan karakteristik
3. Identifikasi masalah data quality
4. Eksplorasi distribusi variabel target
5. Persiapan untuk tahap cleaning selanjutnya

## Dataset Information

- **Source**: Lending Club Loan Data (2007-2014)
- **Format**: CSV
- **Purpose**: Memprediksi risiko kredit (default vs fully paid)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os

# Visual theme: apply the matplotlib style sheet first, then set the seaborn palette on top of it
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# pandas display options, applied from a single table
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 50),
    ('display.float_format', lambda x: '%.3f' % x),
):
    pd.set_option(option_name, option_value)

# Report the session context (timestamp and current working directory)
session_started = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
working_dir = os.getcwd()
print("üìö Libraries imported successfully!")
print(f"üìÖ Analysis date: {session_started}")
print(f"üìÅ Working directory: {working_dir}")

## 1. Load Dataset

In [None]:
# Candidate locations of the raw CSV, probed in this order
data_paths = [
    '../data/raw/loan_data_2007_2014.csv',
    '../../data/raw/loan_data_2007_2014.csv',
    'data/raw/loan_data_2007_2014.csv',
    'loan_data_2007_2014.csv'
]

# First candidate that exists on disk, or None when none of them does
data_path = next(
    (candidate for candidate in data_paths if os.path.exists(candidate)),
    None,
)

# `df` stays None unless the CSV is found and parsed successfully
df = None

if not data_path:
    print("‚ùå Dataset not found in any of the expected paths:")
    for candidate in data_paths:
        print(f"   - {candidate}")
    print("\nüí° Please ensure the dataset is available in the data/raw/ directory")
else:
    print(f"‚úÖ Dataset found at: {data_path}")
    print("üìä Loading dataset...")

    try:
        # The whole file is read in one pass (no sampling); low_memory=False
        # makes pandas infer each column's dtype from the full column
        df = pd.read_csv(data_path, low_memory=False)

        row_count, column_count = df.shape
        memory_mb = df.memory_usage(deep=True).sum() / 1024**2

        print(f"‚úÖ Dataset loaded successfully!")
        print(f"üìè Shape: {row_count:,} rows √ó {column_count} columns")
        print(f"üíæ Memory usage: {memory_mb:.1f} MB")

    except Exception as load_error:
        # Any failure while reading or reporting leaves the notebook without a dataset
        print(f"‚ùå Error loading dataset: {load_error}")
        df = None

In [None]:
# Show basic information about the loaded dataset: size, a preview of the
# first rows, the number of columns per dtype and the compact info() report.
if df is not None:
    print("="*80)
    print("DATASET OVERVIEW")
    print("="*80)

    print(f"\nüìä Basic Statistics:")
    print(f"   Total Records: {df.shape[0]:,}")
    print(f"   Total Features: {df.shape[1]}")
    # Size of the CSV on disk, as opposed to the in-memory footprint
    print(f"   File Size: {os.path.getsize(data_path) / 1024**2:.1f} MB")

    # Preview of the first rows
    print(f"\nüëÄ First 3 rows:")
    display(df.head(3))

    # Number of columns per dtype
    print(f"\nüî¢ Data Types:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"   {dtype}: {count} columns")

    # Compact column summary. DataFrame.info() prints the report itself and
    # returns None, so it is called directly: wrapping it in display() would
    # only add a stray "None" to the cell output.
    print(f"\nüìã Column Information:")
    df.info(verbose=False)
else:
    print("‚ùå No dataset available for exploration")

## 2. Column Analysis and Structure

In [None]:
if df is None:
    print("‚ùå No dataset available")
else:
    print("="*80)
    print("COLUMN ANALYSIS")
    print("="*80)

    # Numbered list of every column
    print(f"\nüìù All Columns ({len(df.columns)} total):")
    for position, column_name in enumerate(df.columns, 1):
        print(f"   {position:2d}. {column_name}")

    def _columns_matching(keywords):
        """Return the columns whose lower-cased name contains any of *keywords*."""
        return [name for name in df.columns
                if any(keyword in name.lower() for keyword in keywords)]

    # Keyword-based grouping; a column can land in more than one group
    loan_cols = _columns_matching(['loan', 'amount', 'term', 'rate', 'purpose', 'grade'])
    borrower_cols = _columns_matching(['annual', 'emp', 'home', 'verification', 'dti'])
    credit_cols = _columns_matching(['fico', 'credit', 'delinq', 'revol', 'pub_rec', 'earliest'])
    status_cols = _columns_matching(['status', 'issue', 'last', 'next'])

    print(f"\nüóÇÔ∏è Column Categories:")

    # (heading, matched columns) pairs, printed in the same order as they were built
    category_sections = [
        ("üí∞ Loan Characteristics", loan_cols),
        ("üë§ Borrower Information", borrower_cols),
        ("üìà Credit History", credit_cols),
        ("üìÖ Status and Dates", status_cols),
    ]
    for heading, members in category_sections:
        print(f"\n{heading} ({len(members)}):")
        for member in members:
            print(f"   - {member}")

## 3. Target Variable Analysis (Loan Status)

In [None]:
if df is None or 'loan_status' not in df.columns:
    print("‚ùå Loan status column not found or no dataset available")
else:
    print("="*80)
    print("TARGET VARIABLE ANALYSIS - LOAN STATUS")
    print("="*80)

    # Frequency of each status and its share of all rows in the dataset
    status_counts = df['loan_status'].value_counts()
    status_percentages = (status_counts / len(df) * 100).round(2)

    print(f"\nüìä Loan Status Distribution:")
    print("="*50)
    for (status, count), percentage in zip(status_counts.items(), status_percentages):
        print(f"   {status:25} : {count:>7,} ({percentage:>6.2f}%)")

    # 2x2 figure: counts, proportions, ten most frequent statuses, log-scaled counts
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    count_ax, share_ax, top_ax, log_ax = axes.ravel()
    title_style = {'fontsize': 14, 'fontweight': 'bold'}

    # Top-left: vertical bar chart of counts
    status_counts.plot(kind='bar', ax=count_ax, color='skyblue')
    count_ax.set_title('Loan Status Distribution - Count', **title_style)
    count_ax.set_xlabel('Loan Status')
    count_ax.set_ylabel('Count')
    count_ax.tick_params(axis='x', rotation=45)

    # Top-right: pie chart, one Set3 colour per status
    slice_colors = plt.cm.Set3(np.linspace(0, 1, len(status_counts)))
    share_ax.pie(
        status_counts,
        labels=status_counts.index,
        autopct='%1.1f%%',
        colors=slice_colors,
        startangle=90,
    )
    share_ax.set_title('Loan Status Proportion', **title_style)

    # Bottom-left: horizontal bars for the ten most frequent statuses
    leading_statuses = status_counts.head(10)
    bar_rows = range(len(leading_statuses))
    top_ax.barh(bar_rows, leading_statuses.values, color='lightcoral')
    top_ax.set_yticks(bar_rows)
    top_ax.set_yticklabels(leading_statuses.index)
    top_ax.invert_yaxis()  # most frequent status at the top
    top_ax.set_xlabel('Count')
    top_ax.set_title('Top 10 Loan Status Types', **title_style)

    # Bottom-right: same counts on a logarithmic y-axis
    log_ax.bar(range(len(status_counts)), status_counts.values, color='gold')
    log_ax.set_yscale('log')
    log_ax.set_xlabel('Loan Status (sorted)')
    log_ax.set_ylabel('Count (log scale)')
    log_ax.set_title('Loan Status Distribution (Log Scale)', **title_style)
    log_ax.set_xticks([])  # tick labels hidden for clarity

    plt.tight_layout()
    plt.show()

    # Restrict to loans with a final outcome
    completed_loans = ['Fully Paid', 'Charged Off']
    df_completed = df[df['loan_status'].isin(completed_loans)]
    completed_total = len(df_completed)
    is_fully_paid = df_completed['loan_status'] == 'Fully Paid'
    is_charged_off = df_completed['loan_status'] == 'Charged Off'

    print(f"\nüéØ Completed Loans Analysis:")
    print("="*40)
    print(f"   Completed loans (Fully Paid + Charged Off): {completed_total:,} ({completed_total/len(df)*100:.1f}%)")
    print(f"   Fully Paid: {is_fully_paid.sum():,}")
    print(f"   Charged Off: {is_charged_off.sum():,}")

    # Default rate = share of charged-off loans among completed loans
    if completed_total > 0:
        default_rate = is_charged_off.mean() * 100
        print(f"   Default rate among completed loans: {default_rate:.2f}%")
        print(f"   Success rate among completed loans: {100-default_rate:.2f}%")

## 4. Missing Values Analysis

In [None]:
# Missing-values report: per-column null counts and percentages, two summary
# charts, and handling recommendations grouped by how much data is missing.
if df is not None:
    print("="*80)
    print("MISSING VALUES ANALYSIS")
    print("="*80)

    # Per-column number and percentage of null entries
    missing_counts = df.isnull().sum()
    missing_percentages = (missing_counts / len(df) * 100).round(2)

    # One row per column, most incomplete columns first
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'Missing Count': missing_counts.values,
        'Missing %': missing_percentages.values,
        'Data Type': df.dtypes.values
    })
    missing_df = missing_df.sort_values('Missing %', ascending=False)

    # Summary statistics
    total_columns = len(df.columns)
    columns_with_missing = (missing_counts > 0).sum()
    columns_with_high_missing = (missing_percentages > 50).sum()

    print(f"\nüìä Missing Values Summary:")
    print(f"   Total columns: {total_columns}")
    print(f"   Columns with missing values: {columns_with_missing} ({columns_with_missing/total_columns*100:.1f}%)")
    print(f"   Columns with >50% missing: {columns_with_high_missing}")
    print(f"   Total missing values: {missing_counts.sum():,}")

    # Only the columns that actually contain nulls
    missing_cols = missing_df[missing_df['Missing Count'] > 0]

    if len(missing_cols) > 0:
        print(f"\nüìã Columns with Missing Values ({len(missing_cols)} columns):")
        print("="*80)

        # Text table with a bar of one block character per 2% missing
        for _, row in missing_cols.iterrows():
            missing_bar = '‚ñà' * int(row['Missing %'] / 2)
            print(f"{row['Column']:<30} | {row['Missing Count']:>8,} | {row['Missing %']:>6.2f}% | {missing_bar}")

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

        # Left chart: the 20 columns with the highest missing percentage
        top_missing = missing_cols.head(20)
        ax1.barh(range(len(top_missing)), top_missing['Missing %'], color='salmon')
        ax1.set_yticks(range(len(top_missing)))
        ax1.set_yticklabels(top_missing['Column'], fontsize=8)
        ax1.invert_yaxis()
        ax1.set_xlabel('Missing Percentage (%)')
        ax1.set_title('Top 20 Columns by Missing Values', fontsize=14, fontweight='bold')

        # Right chart: how many columns (all columns, including complete ones)
        # fall into each missing-percentage interval.
        # np.histogram returns one count per interval, i.e. len(missing_bins) - 1
        # values; all of them are plotted so the 75-100% interval is not dropped.
        missing_bins = [0, 1, 5, 10, 25, 50, 75, 100]
        missing_hist, _ = np.histogram(missing_percentages, bins=missing_bins)
        bin_positions = range(len(missing_hist))

        ax2.bar(bin_positions, missing_hist, color='lightblue', edgecolor='navy')
        ax2.set_xlabel('Missing Percentage Range')
        ax2.set_ylabel('Number of Columns')
        ax2.set_title('Distribution of Missing Values', fontsize=14, fontweight='bold')
        ax2.set_xticks(bin_positions)
        ax2.set_xticklabels(
            [f'{low}-{high}%' for low, high in zip(missing_bins[:-1], missing_bins[1:])],
            rotation=45,
        )

        # Value labels above the non-empty bars
        for i, v in enumerate(missing_hist):
            if v > 0:
                ax2.text(i, v + 0.5, str(int(v)), ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        # Recommendations, grouped by severity
        print(f"\nüí° Missing Values Handling Recommendations:")
        print("="*60)

        high_missing = missing_df[missing_df['Missing %'] > 50]
        if len(high_missing) > 0:
            print(f"\nüî¥ HIGH MISSING (>50%): Consider dropping these columns")
            for _, row in high_missing.iterrows():
                print(f"   - {row['Column']}: {row['Missing %']:.1f}%")

        medium_missing = missing_df[(missing_df['Missing %'] > 10) & (missing_df['Missing %'] <= 50)]
        if len(medium_missing) > 0:
            print(f"\nüü° MEDIUM MISSING (10-50%): Requires careful imputation strategy")
            # Only the five worst columns are listed; the rest are summarised
            for _, row in medium_missing.head(5).iterrows():
                print(f"   - {row['Column']}: {row['Missing %']:.1f}% ({row['Data Type']})")
            if len(medium_missing) > 5:
                print(f"   ... and {len(medium_missing)-5} more columns")

        low_missing = missing_df[(missing_df['Missing %'] > 0) & (missing_df['Missing %'] <= 10)]
        if len(low_missing) > 0:
            print(f"\nüü¢ LOW MISSING (0-10%): Simple imputation should work")
            for _, row in low_missing.head(5).iterrows():
                print(f"   - {row['Column']}: {row['Missing %']:.1f}% ({row['Data Type']})")
            if len(low_missing) > 5:
                print(f"   ... and {len(low_missing)-5} more columns")

    else:
        print("\n‚úÖ No missing values found in the dataset!")

else:
    print("‚ùå No dataset available for missing values analysis")

## 5. Data Types Analysis

In [None]:
# Data-type report: columns per dtype, value ranges of numeric columns,
# cardinality of categorical columns, heuristic type-conversion candidates
# and two charts of the dtype distribution.
if df is not None:
    print("="*80)
    print("DATA TYPES ANALYSIS")
    print("="*80)

    # Number of columns per dtype, computed once and reused for the table and the charts
    dtype_counts = df.dtypes.value_counts()
    dtype_analysis = pd.DataFrame({
        'Data Type': dtype_counts.index,
        'Count': dtype_counts.values
    })

    print(f"\nüìä Data Types Distribution:")
    print("="*40)
    for _, row in dtype_analysis.iterrows():
        percentage = row['Count'] / len(df.columns) * 100
        # dtype objects fall back to object.__format__, which raises TypeError
        # for a non-empty format spec such as '<20'; format their string form instead
        print(f"   {str(row['Data Type']):<20} : {row['Count']:>3} columns ({percentage:>5.1f}%)")

    print(f"\nüîç Detailed Data Type Analysis:")
    print("="*40)

    # Numerical columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print(f"\nüìä Numerical Columns ({len(numeric_cols)}):")

    # Min / max / mean for the first ten numerical columns
    if len(numeric_cols) > 0:
        numeric_stats = df[numeric_cols].describe().T
        print(f"   Range of values:")
        for col in numeric_cols[:10]:
            min_val = numeric_stats.loc[col, 'min']
            max_val = numeric_stats.loc[col, 'max']
            mean_val = numeric_stats.loc[col, 'mean']
            print(f"     {col:<25}: {min_val:>12,.2f} to {max_val:>12,.2f} (avg: {mean_val:>8,.2f})")
        if len(numeric_cols) > 10:
            print(f"     ... and {len(numeric_cols)-10} more numerical columns")

    # Categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"\nüìù Categorical Columns ({len(categorical_cols)}):")

    if len(categorical_cols) > 0:
        print(f"   Cardinality analysis:")
        for col in categorical_cols[:15]:  # first 15 only
            unique_count = df[col].nunique()
            sample_values = df[col].dropna().unique()[:3]  # up to three example values
            sample_str = ', '.join([str(v) for v in sample_values])
            # Mark the sample as truncated when more distinct values exist
            if len(sample_values) < unique_count:
                sample_str += ', ...'
            print(f"     {col:<25}: {unique_count:>3} unique values")
            print(f"     {'':25}   Sample: {sample_str[:50]}")
        if len(categorical_cols) > 15:
            print(f"     ... and {len(categorical_cols)-15} more categorical columns")

    # DateTime columns (if any)
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    print(f"\nüìÖ DateTime Columns ({len(datetime_cols)}):")
    for col in datetime_cols:
        min_date = df[col].min()
        max_date = df[col].max()
        print(f"     {col:<25}: {min_date} to {max_date}")

    print(f"\nüîß Potential Type Conversions:")
    print("="*40)

    # Heuristic scan of object columns, based on the first five non-null values:
    # (column, suggested conversion, sampled values)
    conversion_candidates = []

    for col in df.columns:
        if df[col].dtype == 'object':
            sample_vals = df[col].dropna().head(5).astype(str)
            # An all-null column has no sample to inspect; without this guard the
            # `.all()` date check below is vacuously true and would flag it as a date
            if sample_vals.empty:
                continue
            # Any sampled value containing a percent sign
            if sample_vals.str.contains('%').any():
                conversion_candidates.append((col, 'percentage', sample_vals.tolist()))
            # Every sampled value starting with three word characters, a dash and two digits
            elif sample_vals.str.match(r'\w{3}-\d{2}').all():
                conversion_candidates.append((col, 'date', sample_vals.tolist()))
            # Any sampled value containing a dollar sign or a comma
            elif sample_vals.str.contains(r'\$|,').any():
                conversion_candidates.append((col, 'currency', sample_vals.tolist()))

    if conversion_candidates:
        print(f"   Found {len(conversion_candidates)} columns that may need type conversion:")
        for col, conv_type, samples in conversion_candidates[:10]:
            print(f"     - {col} ({conv_type}): {samples[:2]}")
        if len(conversion_candidates) > 10:
            print(f"     ... and {len(conversion_candidates)-10} more columns")
    else:
        print("   No obvious type conversion candidates found.")

    # Charts of the dtype distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plain-string labels so both charts get text rather than dtype objects
    dtype_labels = [str(dtype) for dtype in dtype_counts.index]
    colors = plt.cm.Set3(np.linspace(0, 1, len(dtype_counts)))

    # Left: share of columns per dtype
    ax1.pie(dtype_counts.values, labels=dtype_labels, autopct='%1.1f%%',
           colors=colors, startangle=90)
    ax1.set_title('Data Type Distribution', fontsize=14, fontweight='bold')

    # Right: number of columns per dtype
    ax2.bar(range(len(dtype_counts)), dtype_counts.values, color=colors)
    ax2.set_xlabel('Data Type')
    ax2.set_ylabel('Number of Columns')
    ax2.set_title('Column Count by Data Type', fontsize=14, fontweight='bold')
    ax2.set_xticks(range(len(dtype_counts)))
    ax2.set_xticklabels(dtype_labels, rotation=45)

    # Value labels above the bars
    for i, v in enumerate(dtype_counts.values):
        ax2.text(i, v + 0.5, str(int(v)), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

else:
    print("‚ùå No dataset available for data type analysis")

## 6. Initial Statistical Summary

In [None]:
if df is None:
    print("‚ùå No dataset available for statistical analysis")
else:
    print("="*80)
    print("INITIAL STATISTICAL SUMMARY")
    print("="*80)

    # ---- Numerical columns ----
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        print(f"\nüìä Numerical Columns Statistics (Top 15 by range):")
        print("="*60)

        numeric_frame = df[numeric_cols]

        # describe() output with one row per column, extended with the value
        # range and the share of missing / zero entries
        stats_df = numeric_frame.describe().T
        stats_df['range'] = stats_df['max'] - stats_df['min']
        stats_df['missing_pct'] = (numeric_frame.isnull().sum() / len(df) * 100)
        stats_df['zeros_pct'] = ((numeric_frame == 0).sum() / len(df) * 100)

        # Widest value range first
        stats_df = stats_df.sort_values('range', ascending=False)

        # Column list kept from the original cell; it is not referenced below
        display_cols = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'range', 'missing_pct', 'zeros_pct']

        # Four-line summary for each of the 15 widest-ranging columns
        for col, row in stats_df.head(15).iterrows():
            interquartile = row['75%'] - row['25%']
            print(f"\nüî¢ {col}:")
            print(f"   Count: {int(row['count']):,} | Missing: {row['missing_pct']:.1f}% | Zeros: {row['zeros_pct']:.1f}%")
            print(f"   Range: {row['min']:,.2f} to {row['max']:,.2f} (spread: {row['range']:,.2f})")
            print(f"   Central tendency: Mean={row['mean']:,.2f}, Median={row['50%']:,.2f}")
            print(f"   Dispersion: Std={row['std']:,.2f}, IQR={interquartile:,.2f}")

    # ---- Categorical columns ----
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    if len(categorical_cols) > 0:
        print(f"\n\nüìù Categorical Columns Statistics:")
        print("="*50)

        # Only the first ten categorical columns are profiled
        for col in categorical_cols[:10]:
            column_values = df[col]
            unique_count = column_values.nunique()
            missing_count = column_values.isnull().sum()

            # Most frequent value, or 'N/A' when the column has no mode
            modes = column_values.mode()
            most_common = 'N/A' if modes.empty else modes.iloc[0]
            most_common_pct = (column_values == most_common).mean() * 100

            print(f"\nüìã {col}:")
            print(f"   Unique values: {unique_count:,}")
            print(f"   Missing: {missing_count:,} ({missing_count/len(df)*100:.1f}%)")
            print(f"   Most common: '{most_common}' ({most_common_pct:.1f}%)")

            # Three most frequent values with their share of all rows
            print(f"   Top 3 values:")
            for val, count in column_values.value_counts().head(3).items():
                pct = count / len(df) * 100
                print(f"     '{val}': {count:,} ({pct:.1f}%)")

        if len(categorical_cols) > 10:
            print(f"\n   ... and {len(categorical_cols)-10} more categorical columns")

    # ---- Quick outlier screen ----
    print(f"\n\n‚ö†Ô∏è  Potential Outliers (based on IQR method):")
    print("="*50)

    outliers_detected = 0
    # Only the first 20 numeric columns are screened
    for col in numeric_cols[:20]:
        series = df[col]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        lower_bound = first_quartile - 1.5 * spread
        upper_bound = third_quartile + 1.5 * spread

        # Values outside the 1.5 * IQR fences
        outliers = ((series < lower_bound) | (series > upper_bound)).sum()
        outlier_pct = outliers / len(df) * 100

        # Report a column only when more than 1% of all rows are flagged
        if outlier_pct > 1:
            print(f"   {col}: {outliers:,} outliers ({outlier_pct:.1f}%)")
            outliers_detected += 1

    if outliers_detected == 0:
        print("   No significant outliers detected in the sample columns.")

## 7. Summary and Next Steps

In [None]:
# Final recap: headline dataset figures, data-quality counts, feature mix,
# the planned cleaning steps, and a JSON export of the key numbers.
if df is not None:
    print("="*80)
    print("DATA EXPLORATION SUMMARY")
    print("="*80)

    # Key findings
    print(f"\nüìä DATASET SUMMARY:")
    print(f"   ‚Ä¢ Total Records: {df.shape[0]:,}")
    print(f"   ‚Ä¢ Total Features: {df.shape[1]}")
    print(f"   ‚Ä¢ Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Defaults keep the export below well-defined when there is no
    # loan_status column or no completed loan
    completed_count = 0
    default_rate = 0.0
    if 'loan_status' in df.columns:
        completed_loans = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
        completed_count = len(completed_loans)
        if completed_count > 0:
            # Share of charged-off loans among completed loans, as a plain float
            default_rate = float((completed_loans['loan_status'] == 'Charged Off').mean() * 100)
        print(f"   ‚Ä¢ Completed Loans: {completed_count:,} ({completed_count/len(df)*100:.1f}%)")
        print(f"   ‚Ä¢ Default Rate: {default_rate:.2f}%")

    # Data quality issues. Null counts are computed once; the totals are cast to
    # built-in int so they are written to JSON as numbers rather than strings.
    null_counts = df.isnull().sum()
    missing_cols = int((null_counts > 0).sum())
    high_missing_cols = int(((null_counts / len(df) * 100) > 50).sum())
    total_missing = int(null_counts.sum())

    print(f"\nüîç DATA QUALITY ISSUES:")
    print(f"   ‚Ä¢ Columns with missing values: {missing_cols} ({missing_cols/len(df.columns)*100:.1f}%)")
    print(f"   ‚Ä¢ Columns with >50% missing: {high_missing_cols}")
    print(f"   ‚Ä¢ Total missing values: {total_missing:,}")

    # Feature categories
    numeric_count = len(df.select_dtypes(include=[np.number]).columns)
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)

    print(f"\nüìã FEATURE DISTRIBUTION:")
    print(f"   ‚Ä¢ Numerical features: {numeric_count}")
    print(f"   ‚Ä¢ Categorical features: {categorical_count}")
    print(f"   ‚Ä¢ DateTime features: {len(df.select_dtypes(include=['datetime64']).columns)}")

    print(f"\nüéØ NEXT STEPS FOR DATA CLEANING:")
    print("="*50)
    print("   1. Filter dataset to completed loans only (Fully Paid + Charged Off)")
    print("   2. Handle missing values based on importance and percentage")
    print("   3. Convert data types (percentages, dates, currencies)")
    print("   4. Create derived features (FICO average, financial ratios)")
    print("   5. Handle outliers in key numerical variables")
    print("   6. Encode categorical variables for modeling")

    # Key numbers of this exploration, using only JSON-native types
    exploration_summary = {
        'dataset_shape': df.shape,
        'total_records': len(df),
        'total_features': len(df.columns),
        'missing_columns': missing_cols,
        'high_missing_columns': high_missing_cols,
        'numeric_features': numeric_count,
        'categorical_features': categorical_count,
        'completed_loans': completed_count,
        'default_rate': default_rate,
        'exploration_date': datetime.now().isoformat()
    }

    # Best-effort save: a failure is reported with its reason but does not stop the notebook
    import json
    summary_path = '../data/exploration_summary.json'
    try:
        # Serialise before opening the file so a serialisation error cannot
        # leave a truncated file behind
        payload = json.dumps(exploration_summary, indent=2)
        with open(summary_path, 'w') as f:
            f.write(payload)
        print(f"\nüíæ Exploration summary saved to '{summary_path}'")
    except (OSError, TypeError, ValueError) as save_error:
        print(f"\n‚ö†Ô∏è  Could not save exploration summary file: {save_error}")

    print(f"\n‚úÖ Data exploration completed successfully!")
    print(f"üìù Ready to proceed to '02_data_cleaning.ipynb'")

else:
    print("‚ùå No dataset available for summary")
    print("\nüí° Please ensure the dataset is available and try again")
    print("   Expected location: data/raw/loan_data_2007_2014.csv")