# 📊 Bank Loan and Credit Risk Data Analysis
This notebook performs detailed Exploratory Data Analysis (EDA) on the **Bank Loan and Credit Risk Dataset**.

- ✅ Handling missing values
- ✅ Outlier detection and treatment
- ✅ Feature encoding and transformation
- ✅ Correlation analysis and feature selection
- ✅ Visualization for pattern identification
- ✅ Assumption checking and data preparation

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [None]:

# Load the dataset
# The path is relative, so the CSV is looked up in the notebook's working directory.
df = pd.read_csv('Bank_Data_RealWorld.csv')

# Dataset Overview
print("Dataset Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()
# Last expression of the cell: rendered as a rich table preview of the first rows.
df.head()


In [None]:

# --- Handling Missing Values ---
# Define imputer for numerical columns with mean strategy
num_imputer = SimpleImputer(strategy='mean')
num_cols = ['Income', 'LoanAmount', 'TransactionCount']
# fit_transform returns a plain array; assigning it back by column list keeps
# the original column labels and row index of df.
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Fill missing values for CreditHistory with mode.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df['CreditHistory']: that chained in-place call acts
# on an intermediate object, triggers a FutureWarning on pandas 2.x and leaves
# df unchanged once Copy-on-Write is active.
credit_history_mode = df['CreditHistory'].mode()
if not credit_history_mode.empty:
    # mode() can return several values on ties; the first one is used.
    df['CreditHistory'] = df['CreditHistory'].fillna(credit_history_mode.iloc[0])
# else: the column has no observed values at all, so there is no mode to fill
# with; the gap is left visible in the summary below instead of raising.
print("Missing values after treatment:\n", df.isnull().sum())


In [None]:

# --- Outlier Detection and Treatment ---
# Z-Score to identify outliers: a row is kept only if every numerical column
# lies strictly within Z_SCORE_THRESHOLD standard deviations of its column mean.
Z_SCORE_THRESHOLD = 3
z_scores = np.abs(stats.zscore(df[num_cols]))

# Work on a plain float array so the mask logic does not depend on the
# container type returned by zscore.
z_values = np.asarray(z_scores, dtype=float)
# A column whose z-score cannot be computed (e.g. zero variance -> 0/0 = NaN)
# must not flag anything: NaN < threshold is False, which would otherwise
# discard every row of the dataset.
within_threshold = (z_values < Z_SCORE_THRESHOLD) | np.isnan(z_values)
# .copy() gives an independent frame, so later column assignments on
# df_no_outliers do not raise SettingWithCopyWarning.
df_no_outliers = df.loc[within_threshold.all(axis=1)].copy()

print(f"Rows after outlier treatment: {df_no_outliers.shape[0]}")


In [None]:

# --- Correlation Analysis ---
# Calculate correlation matrix on numeric/boolean columns only.
# DataFrame.corr() no longer drops non-numeric columns silently (pandas >= 2.0
# defaults to numeric_only=False) and raises on text values, so the numeric
# subset is selected explicitly. NOTE(review): 'PropertyArea' and
# 'MaritalStatus' are presumably text-valued here, since they are one-hot
# encoded only in a later step -- confirm against the data.
numeric_features = df_no_outliers.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_features.corr()

# Draw on an explicit Axes so the title is attached to this heatmap.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:

# --- One-Hot Encoding for Categorical Variables ---
# Each listed column is expanded into indicator columns; drop_first=True omits
# the first level of every variable so the indicators are not redundant.
df_encoded = pd.get_dummies(
    df_no_outliers,
    columns=['PropertyArea', 'MaritalStatus'],
    drop_first=True,
)

# Scaling numerical features
# Fit the scaler on the numerical columns, then standardise those same columns
# and write the result back over them.
scaler = StandardScaler()
scaler.fit(df_encoded[num_cols])
num_scaled = scaler.transform(df_encoded[num_cols])
df_encoded[num_cols] = num_scaled

print("Final dataset shape after encoding and scaling:", df_encoded.shape)
