# Comprehensive Exploratory Data Analysis (EDA)
## South German Credit Dataset - Complete Visualization Guide

This notebook provides an in-depth exploratory data analysis with multiple visualizations to understand:
- Target class distribution and imbalance
- Numerical feature distributions and skewness
- Categorical feature frequencies
- Feature correlations and multicollinearity
- Predictive power of features
- Outlier identification and analysis

## 1. Import Required Libraries

In [None]:
# Core data manipulation libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# For advanced visualizations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Pandas display: lift the column and row limits so printed DataFrames
# are never truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Plot appearance: matplotlib style sheet first, then the seaborn palette
# (same relative order as before, since both touch the plotting defaults).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úì All libraries imported successfully")

## 2. Load and Explore Data

In [None]:
# Column names applied to the loaded data, in file order.
# 'Credit_Risk' (last column) is the target variable summarised below.
column_names = [
    'Status_Checking_Account', 'Duration_Months', 'Credit_History', 'Purpose',
    'Credit_Amount', 'Savings_Account', 'Employment_Since', 'Installment_Rate',
    'Gender_Status', 'Other_Debtors', 'Residence_Years', 'Property', 'Age',
    'Other_Installments', 'Housing', 'Existing_Credits', 'Job', 'Dependents',
    'Telephone', 'Foreign_Worker', 'Credit_Risk'
]

# Load dataset.
# - header=0 together with names=... discards the file's own header row and
#   substitutes the descriptive names defined above.
# - sep=r'\s+' splits fields on runs of whitespace. It is the documented
#   replacement for delim_whitespace=True, which is deprecated in recent
#   pandas releases (the deprecation warning is hidden here because the
#   notebook silences all warnings).
df = pd.read_csv('SouthGermanCredit.asc', sep=r'\s+', header=0, names=column_names)

# Display basic information
print("="*80)
print("DATASET OVERVIEW")
print("="*80)
print(f"\nüìä Dataset Shape: {df.shape}")
print(f"   Rows: {df.shape[0]}")
print(f"   Columns: {df.shape[1]}")

# Per-column dtypes
print(f"\nüìã Data Types:")
print(df.dtypes)

# Number of missing values per column
print(f"\n‚ùå Missing Values:")
print(df.isnull().sum())

# Descriptive statistics, rounded to 3 decimals
print(f"\nüìà Statistical Summary:")
print(df.describe().round(3))

# Class counts of the target, ordered by class label
print(f"\nüéØ Target Variable Distribution:")
print(df['Credit_Risk'].value_counts().sort_index())

## 3. Target Distribution Visualization
### Analyzing the severe class imbalance between Good Risk and Bad Risk loans

In [None]:
# Prepare data for visualization.
# target_counts is indexed by class label and sorted ascending, i.e. (0, 1).
target_counts = df['Credit_Risk'].value_counts().sort_index()
target_labels = ['Good Risk (1)', 'Bad Risk (0)']
target_colors = ['#2ecc71', '#e74c3c']

# target_labels / target_colors list class 1 first and class 0 second, while
# target_counts is ordered (0, 1). Re-order the counts for plotting so each
# bar / wedge receives the count of the class named on its label.
plot_counts = target_counts.reindex([1, 0], fill_value=0)

# Percentages follow the order of target_counts:
# percentages[0] -> class 0 (Bad Risk), percentages[1] -> class 1 (Good Risk).
percentages = (target_counts.values / target_counts.sum()) * 100

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Bar Chart with counts
ax1 = axes[0, 0]
bars = ax1.bar(target_labels, plot_counts.values, color=target_colors, alpha=0.8, edgecolor='black', linewidth=2)
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Target Distribution - Bar Chart (Counts)', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

# 2. Pie Chart with percentages (wedge order matches target_labels)
ax2 = axes[0, 1]
wedges, texts, autotexts = ax2.pie(plot_counts.values, labels=target_labels, colors=target_colors,
                                     autopct='%1.1f%%', startangle=90, textprops={'fontsize': 11, 'weight': 'bold'},
                                     explode=(0.05, 0.05), shadow=True)
ax2.set_title('Target Distribution - Pie Chart (Percentages)', fontsize=14, fontweight='bold')

# 3. Count and percentage table
ax3 = axes[1, 0]
ax3.axis('tight')
ax3.axis('off')

# Rows look counts up by class label and percentages by position (see above),
# so each row is internally consistent.
table_data = [
    ['Class', 'Count', 'Percentage', 'Ratio'],
    ['Good Risk (1)', f'{target_counts[1]}', f'{percentages[1]:.2f}%', f'1 : {target_counts[0]/target_counts[1]:.2f}'],
    ['Bad Risk (0)', f'{target_counts[0]}', f'{percentages[0]:.2f}%', 'Baseline']
]

table = ax3.table(cellText=table_data, cellLoc='center', loc='center',
                 colWidths=[0.25, 0.25, 0.25, 0.25])
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 2.5)

# Style header row
for i in range(4):
    table[(0, i)].set_facecolor('#3498db')
    table[(0, i)].set_text_props(weight='bold', color='white')

# Alternate row colors
for i in range(1, 3):
    for j in range(4):
        if i % 2 == 0:
            table[(i, j)].set_facecolor('#ecf0f1')
        else:
            table[(i, j)].set_facecolor('#ffffff')

ax3.set_title('Target Distribution Summary Table', fontsize=14, fontweight='bold', pad=20)

# 4. Imbalance visualization
ax4 = axes[1, 1]
# Ratio of class-0 (Bad) count to class-1 (Good) count; its reciprocal is
# displayed as "Bad : Good = 1 : x".
imbalance_ratio = target_counts[0] / target_counts[1]
ax4.text(0.5, 0.7, 'Class Imbalance Ratio', ha='center', fontsize=16, fontweight='bold', transform=ax4.transAxes)
ax4.text(0.5, 0.5, f'Bad Risk : Good Risk = 1 : {1/imbalance_ratio:.2f}',
         ha='center', fontsize=14, transform=ax4.transAxes, bbox=dict(boxstyle='round', facecolor='#fff3cd', alpha=0.8))
ax4.text(0.5, 0.3, f'Minority Class (Bad Risk) = {percentages[0]:.2f}%',
         ha='center', fontsize=12, fontweight='bold', transform=ax4.transAxes, color='#e74c3c')
ax4.text(0.5, 0.1, '‚ö†Ô∏è This severe imbalance requires SMOTE or class weighting',
         ha='center', fontsize=11, transform=ax4.transAxes, style='italic', color='#c0392b')
ax4.axis('off')

plt.tight_layout()
plt.savefig('01_Target_Distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n" + "="*80)
print("TARGET DISTRIBUTION ANALYSIS")
print("="*80)
print(f"‚úì Good Risk (Class 1): {target_counts[1]} samples ({percentages[1]:.2f}%)")
print(f"‚úì Bad Risk (Class 0): {target_counts[0]} samples ({percentages[0]:.2f}%)")
print(f"‚úì Imbalance Ratio: 1 : {1/imbalance_ratio:.2f}")
# The class-0 share is labelled SEVERE below 35%, MODERATE otherwise.
print(f"‚úì Severity: {'SEVERE' if percentages[0] < 35 else 'MODERATE'} - Resampling recommended!")

## 4. Numerical Distributions Analysis
### Histograms and KDE plots for numerical features (e.g. Credit Amount and Duration) to detect skewness

In [None]:
# Imported once at the top of the cell instead of on every loop iteration.
from scipy.stats import gaussian_kde

# Identify key numerical features
key_numerical = ['Credit_Amount', 'Duration_Months', 'Age', 'Installment_Rate']

# Calculate skewness and kurtosis
print("\n" + "="*80)
print("NUMERICAL FEATURES - DISTRIBUTION ANALYSIS")
print("="*80)

skewness_data = []
for col in key_numerical:
    # Drop missing values first so skewness/kurtosis are computed on the same
    # data the KDE below uses (a NaN in the input would otherwise yield NaN).
    values = df[col].dropna()
    skew = stats.skew(values)
    kurt = stats.kurtosis(values)

    # A skew of exactly zero (or an undefined one) is neither left- nor
    # right-skewed, so it gets its own label.
    if np.isnan(skew):
        interpretation = 'Undefined'
    elif skew > 0:
        interpretation = 'Right-skewed'
    elif skew < 0:
        interpretation = 'Left-skewed'
    else:
        interpretation = 'Symmetric'

    skewness_data.append({
        'Feature': col,
        'Skewness': skew,
        'Interpretation': interpretation,
        'Kurtosis': kurt,
        'Mean': df[col].mean(),
        'Median': df[col].median(),
        'Std': df[col].std()
    })

skew_df = pd.DataFrame(skewness_data)
print("\n" + skew_df.to_string(index=False))

# All numeric-dtype columns except the target (kept available under the same
# name as before).
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features = [col for col in numerical_features if col != 'Credit_Risk']

# Create comprehensive histogram visualization.
# The figure shows the key numerical features analysed above rather than the
# first six numeric-dtype columns: a dtype filter also selects any categorical
# attribute that is stored as an integer code, for which a histogram/KDE is
# not meaningful.
# NOTE(review): presumably the categorical columns are integer-coded in this
# file - confirm against the df.dtypes output.
n_cols = 2
n_rows = -(-len(key_numerical) // n_cols)  # ceiling division
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(n_rows, n_cols, hspace=0.3, wspace=0.3)

for idx, feature in enumerate(key_numerical):
    ax = fig.add_subplot(gs[idx // n_cols, idx % n_cols])

    # One NaN-free series feeds the histogram, the KDE and the statistics
    data = df[feature].dropna()

    # Histogram (normalised to a density so it is comparable to the KDE)
    ax.hist(data, bins=50, alpha=0.6, color='steelblue', edgecolor='black', density=True, label='Histogram')

    # Add KDE plot, evaluated on 200 points across the observed range
    kde = gaussian_kde(data)
    x_range = np.linspace(data.min(), data.max(), 200)
    ax.plot(x_range, kde(x_range), 'r-', linewidth=2.5, label='KDE')

    # Add statistics
    skewness = stats.skew(data)
    mean_val = data.mean()
    median_val = data.median()

    ax.axvline(mean_val, color='green', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')

    ax.set_xlabel(feature, fontsize=11, fontweight='bold')
    ax.set_ylabel('Density', fontsize=11, fontweight='bold')
    ax.set_title(f'{feature} Distribution (Skewness: {skewness:.3f})', fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

plt.savefig('02_Numerical_Distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Visualization saved: 02_Numerical_Distributions.png")