# Paisabazaar Credit Score Analysis - Data Visualization

This notebook contains visualizations to understand credit score patterns and factors affecting creditworthiness.

## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## Load Processed Data

In [None]:
# Read the processed dataset from disk.
# NOTE(review): the directory name "proccessed" looks misspelled — confirm it matches
# the folder that actually exists before changing it.
processed_csv = "../data/proccessed/paisabazaar_processed.csv"
df = pd.read_csv(processed_csv)

# Quick sanity check: dimensions and column names, then the first rows as cell output
print("Dataset shape: {}".format(df.shape))
print("\nColumns: {}".format(df.columns.tolist()))
df.head()

## 1. Credit Score Distribution Analysis

Understanding the distribution of credit scores is crucial for assessing the overall creditworthiness of customers.

In [None]:
# Credit Score Distribution
# One fixed colour per credit-score category. value_counts() returns the categories
# ordered by frequency, so colours are looked up by label rather than by position;
# a positional list would paint whichever category is most common green.
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Count plot (bars stay in frequency order, most common category first)
credit_score_counts = df['Credit_Score'].value_counts()
colors = [score_palette.get(label, fallback_color) for label in credit_score_counts.index]
axes[0].bar(credit_score_counts.index, credit_score_counts.values, color=colors)
axes[0].set_title('Credit Score Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Credit Score Category')
axes[0].set_ylabel('Number of Customers')
axes[0].grid(axis='y', alpha=0.3)

# Add value labels on bars (bar i sits at x position i on the categorical axis)
for i, v in enumerate(credit_score_counts.values):
    axes[0].text(i, v + 1000, str(v), ha='center', va='bottom', fontweight='bold')

# Pie chart, using the same label -> colour mapping as the bars
axes[1].pie(credit_score_counts.values, labels=credit_score_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Credit Score Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("Credit Score Distribution:")
print(credit_score_counts)
print("\nPercentage Distribution:")
# Divide by the number of counted (non-missing) labels so the shares sum to 100 and
# match the pie chart; value_counts() excludes missing values, len(df) does not.
print(credit_score_counts / credit_score_counts.sum() * 100)

## 2. Income Analysis and Credit Score Relationship

Annual income is a key factor in determining creditworthiness.

In [None]:
# Income vs Credit Score
# Display order and one fixed colour per credit-score category. Bar colours are looked
# up by label because the mean-sorted series below is not guaranteed to come out in
# Good/Standard/Poor order.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Box plot - Annual Income by Credit Score
income_by_score = [df.loc[df['Credit_Score'] == score, 'Annual_Income'].dropna()
                   for score in score_order]
axes[0, 0].boxplot(income_by_score, patch_artist=True)
# Tick labels are set on the axis instead of through boxplot's label keyword, which
# was renamed between matplotlib releases.
axes[0, 0].set_xticklabels(score_order)
axes[0, 0].set_title('Annual Income Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Annual Income ($)')
axes[0, 0].grid(axis='y', alpha=0.3)

# Violin plot - Monthly Inhand Salary by Credit Score
sns.violinplot(data=df, x='Credit_Score', y='Monthly_Inhand_Salary',
               order=score_order, ax=axes[0, 1])
axes[0, 1].set_title('Monthly Salary Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Score')
axes[0, 1].set_ylabel('Monthly Inhand Salary ($)')

# Average Income by Credit Score (bars sorted from highest to lowest mean)
avg_income = df.groupby('Credit_Score')['Annual_Income'].mean().sort_values(ascending=False)
colors_income = [score_palette.get(label, fallback_color) for label in avg_income.index]
axes[1, 0].bar(avg_income.index, avg_income.values, color=colors_income)
axes[1, 0].set_title('Average Annual Income by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Credit Score')
axes[1, 0].set_ylabel('Average Annual Income ($)')
axes[1, 0].grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(avg_income.values):
    axes[1, 0].text(i, v + 500, f'${v:,.0f}', ha='center', va='bottom', fontweight='bold')

# Age vs Income colored by Credit Score
for score in score_order:
    data = df[df['Credit_Score'] == score]
    axes[1, 1].scatter(data['Age'], data['Annual_Income'], alpha=0.5, label=score,
                       color=score_palette[score], s=20)
axes[1, 1].set_title('Age vs Annual Income by Credit Score', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Age')
axes[1, 1].set_ylabel('Annual Income ($)')
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Average Income by Credit Score:")
print(avg_income)

## 3. Credit Card and Banking Behavior

Analyzing credit card usage and banking patterns to understand risk factors.

In [None]:
# Credit Card Usage Analysis
# One fixed colour per credit-score category. The bar charts below are sorted by mean
# value, so colours are looked up by label; a positional colour list would attach
# green to whichever category has the highest mean.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Number of Credit Cards by Credit Score (bars sorted from highest to lowest mean)
credit_card_avg = df.groupby('Credit_Score')['Num_Credit_Card'].mean().sort_values(ascending=False)
card_colors = [score_palette.get(label, fallback_color) for label in credit_card_avg.index]
axes[0, 0].bar(credit_card_avg.index, credit_card_avg.values, color=card_colors)
axes[0, 0].set_title('Average Number of Credit Cards by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Average Number of Credit Cards')
axes[0, 0].grid(axis='y', alpha=0.3)
for i, v in enumerate(credit_card_avg.values):
    axes[0, 0].text(i, v + 0.05, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

# Credit Utilization Ratio by Credit Score
sns.boxplot(data=df, x='Credit_Score', y='Credit_Utilization_Ratio',
            order=score_order, ax=axes[0, 1])
axes[0, 1].set_title('Credit Utilization Ratio by Credit Score', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Score')
axes[0, 1].set_ylabel('Credit Utilization Ratio (%)')

# Number of Bank Accounts by Credit Score (bars sorted from highest to lowest mean)
bank_acc_avg = df.groupby('Credit_Score')['Num_Bank_Accounts'].mean().sort_values(ascending=False)
bank_colors = [score_palette.get(label, fallback_color) for label in bank_acc_avg.index]
axes[1, 0].bar(bank_acc_avg.index, bank_acc_avg.values, color=bank_colors)
axes[1, 0].set_title('Average Number of Bank Accounts by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Credit Score')
axes[1, 0].set_ylabel('Average Number of Bank Accounts')
axes[1, 0].grid(axis='y', alpha=0.3)
for i, v in enumerate(bank_acc_avg.values):
    axes[1, 0].text(i, v + 0.05, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

# Outstanding Debt by Credit Score
sns.violinplot(data=df, x='Credit_Score', y='Outstanding_Debt',
               order=score_order, ax=axes[1, 1])
axes[1, 1].set_title('Outstanding Debt Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Credit Score')
axes[1, 1].set_ylabel('Outstanding Debt ($)')

plt.tight_layout()
plt.show()

print("Credit Card Statistics by Credit Score:")
print(df.groupby('Credit_Score')[['Num_Credit_Card', 'Credit_Utilization_Ratio', 'Outstanding_Debt']].mean())

## 4. Payment Behavior Analysis

Payment behavior is a critical indicator of creditworthiness and default risk.

In [None]:
# Payment Behavior Analysis
# One fixed colour per credit-score category, always looked up by label:
#  - crosstab returns its columns in sorted label order (Good, Poor, Standard), so a
#    positional green/orange/red list would swap the Poor and Standard colours;
#  - the sorted means are not guaranteed to come out in Good/Standard/Poor order.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Payment Behaviour Distribution by Credit Score
# normalize='columns': each credit-score column sums to 100 %
payment_behaviour = pd.crosstab(df['Payment_Behaviour'], df['Credit_Score'], normalize='columns') * 100
behaviour_colors = [score_palette.get(label, fallback_color) for label in payment_behaviour.columns]
payment_behaviour.plot(kind='bar', ax=axes[0, 0], color=behaviour_colors)
axes[0, 0].set_title('Payment Behaviour Distribution by Credit Score (%)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Payment Behaviour')
axes[0, 0].set_ylabel('Percentage (%)')
axes[0, 0].legend(title='Credit Score')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(axis='y', alpha=0.3)

# Delayed Payments by Credit Score (bars sorted from lowest to highest mean)
delay_avg = df.groupby('Credit_Score')['Num_of_Delayed_Payment'].mean().sort_values()
delay_colors = [score_palette.get(label, fallback_color) for label in delay_avg.index]
axes[0, 1].barh(delay_avg.index, delay_avg.values, color=delay_colors)
axes[0, 1].set_title('Average Number of Delayed Payments', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Average Number of Delayed Payments')
axes[0, 1].set_ylabel('Credit Score')
axes[0, 1].grid(axis='x', alpha=0.3)
for i, v in enumerate(delay_avg.values):
    axes[0, 1].text(v + 0.3, i, f'{v:.2f}', va='center', fontweight='bold')

# Delay from Due Date by Credit Score
sns.boxplot(data=df, x='Credit_Score', y='Delay_from_due_date',
            order=score_order, ax=axes[1, 0])
axes[1, 0].set_title('Days Delayed from Due Date by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Credit Score')
axes[1, 0].set_ylabel('Days Delayed from Due Date')

# Payment of Minimum Amount by Credit Score
# normalize='index': each credit-score row sums to 100 %
min_payment = pd.crosstab(df['Credit_Score'], df['Payment_of_Min_Amount'], normalize='index') * 100
# The first two colours are the original pair; the extra ones are only used if the
# column holds more than two distinct values, so that no two legend entries share a
# colour. NOTE(review): the number of distinct values is not visible from here.
min_payment_palette = ['#3498db', '#e67e22', '#9b59b6', '#1abc9c']
min_payment.plot(kind='bar', ax=axes[1, 1], color=min_payment_palette[:min_payment.shape[1]])
axes[1, 1].set_title('Payment of Minimum Amount by Credit Score (%)', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Credit Score')
axes[1, 1].set_ylabel('Percentage (%)')
axes[1, 1].legend(title='Pays Min Amount')
axes[1, 1].tick_params(axis='x', rotation=0)
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("Payment Statistics by Credit Score:")
print(df.groupby('Credit_Score')[['Num_of_Delayed_Payment', 'Delay_from_due_date']].describe())

## 5. Loan and Credit Mix Analysis

Understanding loan patterns and credit mix is essential for risk assessment.

In [None]:
# Loan and Credit Mix Analysis
# One fixed colour per credit-score category, always looked up by label:
#  - the loan means are sorted by value, not by category;
#  - crosstab returns its columns in sorted label order (Good, Poor, Standard), so a
#    positional green/orange/red list would swap the Poor and Standard colours.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Number of Loans by Credit Score (bars sorted from highest to lowest mean)
loan_avg = df.groupby('Credit_Score')['Num_of_Loan'].mean().sort_values(ascending=False)
loan_colors = [score_palette.get(label, fallback_color) for label in loan_avg.index]
axes[0, 0].bar(loan_avg.index, loan_avg.values, color=loan_colors)
axes[0, 0].set_title('Average Number of Loans by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Average Number of Loans')
axes[0, 0].grid(axis='y', alpha=0.3)
for i, v in enumerate(loan_avg.values):
    axes[0, 0].text(i, v + 0.1, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

# Credit Mix Distribution by Credit Score
# normalize='columns': each credit-score column sums to 100 %
credit_mix = pd.crosstab(df['Credit_Mix'], df['Credit_Score'], normalize='columns') * 100
mix_colors = [score_palette.get(label, fallback_color) for label in credit_mix.columns]
credit_mix.plot(kind='bar', ax=axes[0, 1], color=mix_colors)
axes[0, 1].set_title('Credit Mix Distribution by Credit Score (%)', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Mix')
axes[0, 1].set_ylabel('Percentage (%)')
axes[0, 1].legend(title='Credit Score')
axes[0, 1].tick_params(axis='x', rotation=0)
axes[0, 1].grid(axis='y', alpha=0.3)

# Interest Rate by Credit Score
sns.violinplot(data=df, x='Credit_Score', y='Interest_Rate',
               order=score_order, ax=axes[1, 0])
axes[1, 0].set_title('Interest Rate Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Credit Score')
axes[1, 0].set_ylabel('Interest Rate (%)')

# Total EMI per Month by Credit Score
sns.boxplot(data=df, x='Credit_Score', y='Total_EMI_per_month',
            order=score_order, ax=axes[1, 1])
axes[1, 1].set_title('Total EMI per Month by Credit Score', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Credit Score')
axes[1, 1].set_ylabel('Total EMI per Month ($)')

plt.tight_layout()
plt.show()

print("Loan Statistics by Credit Score:")
print(df.groupby('Credit_Score')[['Num_of_Loan', 'Interest_Rate', 'Total_EMI_per_month']].mean())

## 6. Credit History and Inquiries

Credit history age and inquiries are important factors in credit score determination.

In [None]:
# Credit History and Inquiries Analysis
# One fixed colour per credit-score category. The bar charts below are sorted by mean
# value, so colours are looked up by label instead of relying on the sort happening
# to produce Good/Standard/Poor order.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Credit History Age by Credit Score
sns.boxplot(data=df, x='Credit_Score', y='Credit_History_Age',
            order=score_order, ax=axes[0, 0])
axes[0, 0].set_title('Credit History Age by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Credit History Age (months)')

# Average Credit History Age by Credit Score (bars sorted from highest to lowest mean)
history_avg = df.groupby('Credit_Score')['Credit_History_Age'].mean().sort_values(ascending=False)
history_colors = [score_palette.get(label, fallback_color) for label in history_avg.index]
axes[0, 1].bar(history_avg.index, history_avg.values, color=history_colors)
axes[0, 1].set_title('Average Credit History Age by Credit Score', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Score')
axes[0, 1].set_ylabel('Average Credit History Age (months)')
axes[0, 1].grid(axis='y', alpha=0.3)
for i, v in enumerate(history_avg.values):
    axes[0, 1].text(i, v + 5, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

# Number of Credit Inquiries by Credit Score (bars sorted from lowest to highest mean)
inquiry_avg = df.groupby('Credit_Score')['Num_Credit_Inquiries'].mean().sort_values()
inquiry_colors = [score_palette.get(label, fallback_color) for label in inquiry_avg.index]
axes[1, 0].barh(inquiry_avg.index, inquiry_avg.values, color=inquiry_colors)
axes[1, 0].set_title('Average Number of Credit Inquiries by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Average Number of Credit Inquiries')
axes[1, 0].set_ylabel('Credit Score')
axes[1, 0].grid(axis='x', alpha=0.3)
for i, v in enumerate(inquiry_avg.values):
    axes[1, 0].text(v + 0.1, i, f'{v:.2f}', va='center', fontweight='bold')

# Credit History Age Distribution (one histogram series per category, shared bins)
history_by_score = [df.loc[df['Credit_Score'] == score, 'Credit_History_Age'].dropna()
                    for score in score_order]
axes[1, 1].hist(history_by_score, bins=30, label=score_order,
                color=[score_palette[score] for score in score_order], alpha=0.7)
axes[1, 1].set_title('Credit History Age Distribution', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Credit History Age (months)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].legend()
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("Credit History Statistics by Credit Score:")
print(df.groupby('Credit_Score')[['Credit_History_Age', 'Num_Credit_Inquiries']].describe())

## 7. Financial Management Indicators

Analyzing monthly balance and investment behavior.

In [None]:
# Financial Management Indicators
# One fixed colour per credit-score category. The bar charts below are sorted by mean
# value, so colours are looked up by label instead of relying on the sort happening
# to produce Good/Standard/Poor order.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Monthly Balance by Credit Score
sns.violinplot(data=df, x='Credit_Score', y='Monthly_Balance',
               order=score_order, ax=axes[0, 0])
axes[0, 0].set_title('Monthly Balance Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Monthly Balance ($)')

# Amount Invested Monthly by Credit Score
sns.boxplot(data=df, x='Credit_Score', y='Amount_invested_monthly',
            order=score_order, ax=axes[0, 1])
axes[0, 1].set_title('Monthly Investment Amount by Credit Score', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Score')
axes[0, 1].set_ylabel('Amount Invested Monthly ($)')

# Average Monthly Balance by Credit Score (bars sorted from highest to lowest mean)
balance_avg = df.groupby('Credit_Score')['Monthly_Balance'].mean().sort_values(ascending=False)
balance_colors = [score_palette.get(label, fallback_color) for label in balance_avg.index]
axes[1, 0].bar(balance_avg.index, balance_avg.values, color=balance_colors)
axes[1, 0].set_title('Average Monthly Balance by Credit Score', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Credit Score')
axes[1, 0].set_ylabel('Average Monthly Balance ($)')
axes[1, 0].grid(axis='y', alpha=0.3)
for i, v in enumerate(balance_avg.values):
    axes[1, 0].text(i, v + 10, f'${v:.2f}', ha='center', va='bottom', fontweight='bold')

# Average Amount Invested by Credit Score (bars sorted from highest to lowest mean)
investment_avg = df.groupby('Credit_Score')['Amount_invested_monthly'].mean().sort_values(ascending=False)
investment_colors = [score_palette.get(label, fallback_color) for label in investment_avg.index]
axes[1, 1].bar(investment_avg.index, investment_avg.values, color=investment_colors)
axes[1, 1].set_title('Average Monthly Investment by Credit Score', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Credit Score')
axes[1, 1].set_ylabel('Average Amount Invested Monthly ($)')
axes[1, 1].grid(axis='y', alpha=0.3)
for i, v in enumerate(investment_avg.values):
    axes[1, 1].text(i, v + 10, f'${v:.2f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("Financial Management Statistics by Credit Score:")
print(df.groupby('Credit_Score')[['Monthly_Balance', 'Amount_invested_monthly']].describe())

## 8. Occupation and Demographic Analysis

Understanding how occupation and demographics relate to credit scores.

In [None]:
# Occupation and Age Analysis
# One fixed colour per credit-score category, always looked up by label:
#  - the age means are sorted by value, not by category;
#  - crosstab returns its columns in sorted label order (Good, Poor, Standard), so a
#    positional green/orange/red list would swap the Poor and Standard colours.
score_order = ['Good', 'Standard', 'Poor']
score_palette = {'Good': '#2ecc71', 'Standard': '#f39c12', 'Poor': '#e74c3c'}
fallback_color = '#95a5a6'  # grey for any label other than the three listed above

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Age Distribution by Credit Score
sns.violinplot(data=df, x='Credit_Score', y='Age',
               order=score_order, ax=axes[0, 0])
axes[0, 0].set_title('Age Distribution by Credit Score', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Credit Score')
axes[0, 0].set_ylabel('Age (years)')

# Average Age by Credit Score (bars sorted from highest to lowest mean)
age_avg = df.groupby('Credit_Score')['Age'].mean().sort_values(ascending=False)
age_colors = [score_palette.get(label, fallback_color) for label in age_avg.index]
axes[0, 1].bar(age_avg.index, age_avg.values, color=age_colors)
axes[0, 1].set_title('Average Age by Credit Score', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Credit Score')
axes[0, 1].set_ylabel('Average Age (years)')
axes[0, 1].grid(axis='y', alpha=0.3)
for i, v in enumerate(age_avg.values):
    axes[0, 1].text(i, v + 0.5, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

# Top 10 Occupations by Count
top_occupations = df['Occupation'].value_counts().head(10)
axes[1, 0].barh(range(len(top_occupations)), top_occupations.values, color='#3498db')
axes[1, 0].set_yticks(range(len(top_occupations)))
axes[1, 0].set_yticklabels(top_occupations.index)
axes[1, 0].set_title('Top 10 Occupations in Dataset', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Number of Customers')
axes[1, 0].grid(axis='x', alpha=0.3)

# Credit Score Distribution by Top 5 Occupations
# The top 5 are the first five entries of the top-10 counts computed above, so the
# value_counts() pass over the column is not repeated.
top_5_occupations = top_occupations.head(5).index
occupation_credit = df[df['Occupation'].isin(top_5_occupations)]
# normalize='index': each occupation row sums to 100 %
occupation_pivot = pd.crosstab(occupation_credit['Occupation'],
                               occupation_credit['Credit_Score'],
                               normalize='index') * 100
occupation_colors = [score_palette.get(label, fallback_color) for label in occupation_pivot.columns]
occupation_pivot.plot(kind='bar', ax=axes[1, 1], color=occupation_colors)
axes[1, 1].set_title('Credit Score Distribution - Top 5 Occupations (%)', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Occupation')
axes[1, 1].set_ylabel('Percentage (%)')
axes[1, 1].legend(title='Credit Score')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("Age Statistics by Credit Score:")
print(df.groupby('Credit_Score')['Age'].describe())

## 9. Correlation Heatmap

Analyzing pairwise correlations among key numeric features. The categorical `Credit_Score` label is not part of this matrix.

In [None]:
# Correlation Heatmap
# Select key numeric features
numeric_features = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
                    'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
                    'Num_of_Delayed_Payment', 'Credit_Utilization_Ratio', 'Credit_History_Age',
                    'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance',
                    'Num_Credit_Inquiries', 'Outstanding_Debt']

# Calculate correlation matrix (DataFrame.corr() with its default method)
correlation_matrix = df[numeric_features].corr()

# Create heatmap; center=0 puts the midpoint of the diverging colormap at zero correlation
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap of Key Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Find top correlated features
print("Top 10 Positive Correlations (excluding diagonal):")
# Keep only the cells strictly above the diagonal. This removes the self-correlations
# and keeps each feature pair exactly once: the matrix is symmetric, so filtering on
# "value < 1.0" alone would list (A, B) and (B, A) as two separate rows.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
# dropna() discards the masked-out cells regardless of how stack() treats missing
# values in the installed pandas version.
corr_pairs = correlation_matrix.where(upper_triangle).stack().dropna()
sorted_pairs = corr_pairs.sort_values(ascending=False)
print(sorted_pairs.head(10))

## 10. Key Risk Indicators Summary

Summary visualization of critical risk factors for loan default prediction.