# Importing Required Libraries

In [11]:
import pandas as pd
import numpy as np

# Loading the Required Data

In [12]:
# Read the cleaned loan records into the working frame `df`.
csv_path = "cleaned_loan_data.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns) as a quick sanity check, then preview the first rows.
print("Data loaded. Shape:", df.shape)
df.head()

Data loaded. Shape: (10000, 12)


Unnamed: 0,id,age,cartype,loanamount,deposit,area,apr,applicationoutcome,funded,age_bucket,depositratio,apr_pct
0,9997,61,Convertible,24108.0,881.0,rural,0.15,1,0,55+,0.036544,15%
1,9996,42,Saloon,5988.0,967.0,rural,0.2,1,0,35-44,0.16149,20%
2,9995,57,Saloon,5099.0,973.0,urban,0.15,1,0,55+,0.190822,15%
3,9994,32,Convertible,20774.0,1478.0,rural,0.0,0,0,25-34,0.071147,0%
4,9993,29,Saloon,4612.0,774.0,urban,0.0,0,0,25-34,0.167823,0%


# Data Quality Checks

## Completeness Check

In [13]:
"""
Completeness Check:
- Measure, per column, the fraction of rows whose value is missing.
- If no column has any missing value, print 'Completeness check passed.';
  otherwise print the per-column missing fractions.
"""

# Column name -> share of rows that are null (0.0 means fully populated).
completeness = df.isna().mean().to_dict()

# The worst column decides the outcome: any non-zero share means gaps exist.
worst_missing_share = max(completeness.values())

if worst_missing_share != 0:
    print("Missing values found.")
    print(completeness)
else:
    print("Completeness check passed.")


Completeness check passed.


## Uniqueness Check

In [14]:
"""
Uniqueness Check:
- Compare the number of rows with the number of distinct rows.
- If no row is repeated, print 'Uniqueness check passed.';
  otherwise print the percentage of rows that are duplicates.
"""

total_rows = df.shape[0]
distinct_rows = df.drop_duplicates().shape[0]

# Share of rows (in percent) that would be removed by de-duplication.
dup_pct = (1 - distinct_rows / total_rows) * 100

if dup_pct != 0:
    print(f"{dup_pct:.2f}% duplicate rows found.")
else:
    print("Uniqueness check passed.")


Uniqueness check passed.


## Validity Check

In [15]:
"""
Validity Check:
- age must be between 18 and 90 (inclusive).
- loanamount must be > 0.
- deposit must be >= 0.
- apr must be between 0% and 50%. The apr column stores a fraction
  (0.15 means 15%), so the upper bound is 0.50.
- A rule whose column is absent from the frame is reported as skipped
  instead of being ignored silently.
"""

# apr is a fraction in this dataset (apr 0.15 pairs with apr_pct "15%"),
# so the 50% ceiling is expressed as 0.50.
APR_MAX = 0.50

# Column name -> (key used in `validity`, function returning a boolean mask
# that is True for rows violating the rule).
validity_rules = {
    "age": ("invalid_age_count", lambda s: (s < 18) | (s > 90)),
    # The dataset's column is "loanamount" (no underscore); the guard and the
    # lookup must use the same spelling or the rule never runs.
    "loanamount": ("invalid_loan_amount_count", lambda s: s <= 0),
    "deposit": ("invalid_deposit_count", lambda s: s < 0),
    "apr": ("invalid_apr_count", lambda s: (s < 0) | (s > APR_MAX)),
}

validity = {}
skipped_validity_columns = []

for column, (result_key, find_invalid) in validity_rules.items():
    if column not in df.columns:
        # Remember the gap so a vacuous "passed" cannot hide a missing column.
        skipped_validity_columns.append(column)
        continue
    validity[result_key] = int(find_invalid(df[column]).sum())

if skipped_validity_columns:
    print("Validity rules skipped (column not found):", skipped_validity_columns)

if all(v == 0 for v in validity.values()):
    print("Validity check passed.")
else:
    print("Invalid values found.")
    print(validity)


Validity check passed.


## Accuracy Check

In [16]:
"""
Accuracy Check:
- A deposit larger than the loan amount is treated as suspicious.
- Counts such rows; prints 'Accuracy check passed.' when there are none.
"""

accuracy = {}

# The rule needs both columns; it is skipped when either one is absent.
needed_columns = {"deposit", "loanamount"}
if needed_columns.issubset(df.columns):
    deposit_exceeds_loan = df["deposit"] > df["loanamount"]
    accuracy["deposit_gt_loan_count"] = len(df.loc[deposit_exceeds_loan])

has_accuracy_issue = any(count != 0 for count in accuracy.values())

if has_accuracy_issue:
    print("Potential accuracy issues found.")
    print(accuracy)
else:
    print("Accuracy check passed.")


Accuracy check passed.


## Integrity Check

In [17]:
"""
Integrity Check:
- A row may have funded == 1 only when applicationoutcome == 1.
- Counts rows with funded == 1 and applicationoutcome == 0;
  prints 'Integrity check passed.' when there are none.
"""

integrity = {}

# The rule needs both columns; it is skipped when either one is absent.
if "applicationoutcome" in df.columns and "funded" in df.columns:
    is_funded = df["funded"] == 1
    not_approved = df["applicationoutcome"] == 0
    integrity["funded_without_approval"] = len(df.loc[is_funded & not_approved])

has_integrity_violation = any(count != 0 for count in integrity.values())

if has_integrity_violation:
    print("Integrity violations found.")
    print(integrity)
else:
    print("Integrity check passed.")


Integrity check passed.


# Describe Data Distribution

In [18]:
"""
Data Description:
- Numeric columns: min, max, mean, median, mode.
- All other columns: list of unique values, number of unique values, mode.
- The combined report lists numeric columns first, then the rest.
"""

numeric_report = {}
categorical_report = {}

for col in df.columns:
    series = df[col]

    # Compute the mode once; an empty result means no mode is available.
    modes = series.mode()
    most_common = None if modes.empty else modes[0]

    if not pd.api.types.is_numeric_dtype(series):
        categorical_report[col] = {
            "unique_values": series.unique().tolist(),
            "n_unique": series.nunique(),
            "mode": most_common,
        }
        continue

    numeric_report[col] = {
        "min": series.min(),
        "max": series.max(),
        "mean": series.mean(),
        "median": series.median(),
        "mode": most_common,
    }

# Numeric entries go in first so they appear above the categorical ones.
desc_report_sorted = dict(numeric_report)
desc_report_sorted.update(categorical_report)

# One row per column, one field per statistic.
summary_table = pd.DataFrame(desc_report_sorted).transpose()
print(summary_table)

                         min       max        mean   median      mode  \
id                       1.0   10000.0      5000.5   5000.5       1.0   
age                     18.0      65.0     40.4774     40.0      25.0   
loanamount            2502.0   26986.0  11920.4302  11218.5    8624.0   
deposit                500.0    6000.0   2668.1421   2364.5     812.0   
apr                      0.0      0.25     0.09628      0.1       0.0   
applicationoutcome       0.0       1.0       0.647      1.0       1.0   
funded                   0.0       1.0      0.3069      0.0       0.0   
depositratio        0.020134  0.999573    0.266187   0.2204  0.094156   
cartype                  NaN       NaN         NaN      NaN       SUV   
area                     NaN       NaN         NaN      NaN     rural   
age_bucket               NaN       NaN         NaN      NaN     25-34   
apr_pct                  NaN       NaN         NaN      NaN        0%   

                                      unique_value