In [1]:
import pandas as pd

# Load the raw application table (path is relative to the notebook's directory).
df = pd.read_csv("../data/raw/application_train.csv")

# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly to appear alongside the preview.
print(df.shape)
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# A) Load & basic stats
# Unpack the frame dimensions once and report them separately.
n_rows, n_cols = df.shape
print("Rows:", n_rows)
print("Columns:", n_cols)

# Relative frequency of each TARGET value (default rate)
print("\nTarget distribution:")
target_share = df['TARGET'].value_counts(normalize=True).rename('proportion')
print(target_share)

Rows: 307511
Columns: 122

Target distribution:
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


In [8]:
# B) Missingness
# Percentage of missing values per column, sorted from most to least missing.
# `missing` is reused by later cells (threshold flagging and report export).
missing = df.isnull().mean().sort_values(ascending=False) * 100
print("\nTop 30 columns by missingness:")
display(missing.head(30))


Top 30 columns by missingess:


COMMONAREA_MEDI             69.872297
COMMONAREA_AVG              69.872297
COMMONAREA_MODE             69.872297
NONLIVINGAPARTMENTS_MODE    69.432963
NONLIVINGAPARTMENTS_AVG     69.432963
NONLIVINGAPARTMENTS_MEDI    69.432963
FONDKAPREMONT_MODE          68.386172
LIVINGAPARTMENTS_MODE       68.354953
LIVINGAPARTMENTS_AVG        68.354953
LIVINGAPARTMENTS_MEDI       68.354953
FLOORSMIN_AVG               67.848630
FLOORSMIN_MODE              67.848630
FLOORSMIN_MEDI              67.848630
YEARS_BUILD_MEDI            66.497784
YEARS_BUILD_MODE            66.497784
YEARS_BUILD_AVG             66.497784
OWN_CAR_AGE                 65.990810
LANDAREA_MEDI               59.376738
LANDAREA_MODE               59.376738
LANDAREA_AVG                59.376738
BASEMENTAREA_MEDI           58.515956
BASEMENTAREA_AVG            58.515956
BASEMENTAREA_MODE           58.515956
EXT_SOURCE_1                56.381073
NONLIVINGAREA_MODE          55.179164
NONLIVINGAREA_AVG           55.179164
NONLIVINGARE

In [9]:
# Flag columns whose missing percentage exceeds 80
# (`missing` holds per-column percentages on a 0-100 scale).
over_threshold = missing.gt(80)
high_missing = missing.loc[over_threshold]
print("\nColumns with >80% missing values:")
display(high_missing)


Columns with >80% missing values:


Series([], dtype: float64)

In [11]:
# C) Duplicates
# Number of rows that exactly repeat an earlier row
n_duplicate_rows = df.duplicated().sum()
print("\nDuplicate rows:", n_duplicate_rows)

# Number of repeated SK_ID_CURR values (expected to be 0)
n_duplicate_ids = df['SK_ID_CURR'].duplicated().sum()
print("Duplicate SK_ID_CURR:", n_duplicate_ids)


Duplicate rows: 0
Duplicate SK_ID_CURR: 0


In [12]:
# D) Simple leakage scan (heuristic)
# A column is flagged when its upper-cased name contains any of the keywords
# below as a substring. This only flags candidates; nothing is dropped here.
keywords = ['TARGET', 'DEFAULT', 'OVERDUE', 'DELINQ', 'DPD', 'LATE', 'PAST_DUE']


def _mentions_keyword(column_name):
    """Return True if the upper-cased column name contains any keyword."""
    upper_name = column_name.upper()
    return any(keyword in upper_name for keyword in keywords)


leakage_cols = [column for column in df.columns if _mentions_keyword(column)]
print("\nFlagged potential leakage columns:")
display(leakage_cols)


Flagged potential leakage columns:


['TARGET']

In [13]:
# E) Split strategy recommendation
# No clean time variable is available for a chronological split, so the
# recommendation for this project is a stratified split plus cross-validation,
# with the rationale documented for governance.
recommendation_lines = (
    "\nSplit Strategy Recommendation:",
    "Home Credit data lacks a clean time variable for chronological split.",
    "Use stratified train-test split and cross-validation to maintain target balance.",
    "Document this choice for governance and reproducibility.",
)
for line in recommendation_lines:
    print(line)


Split Strategy Recommendation:
Home Credit data lacks a clean time variable for chronological split.
Use stratified train-test split and cross-validation to maintain target balance.
Document this choice for governance and reproducibility.


In [15]:
# Output reminder: checklist of artefacts to capture from this notebook
reminder_lines = [
    "\nSave screenshots or exports of:",
    "1. Missingness table",
    "2. Target distribution",
    "3. Flagged leakage columns",
    "4. Conclusion section",
]
print(*reminder_lines, sep="\n")


Save screenshots or exports of:
1. Missingness table
2. Target distribution
3. Flagged leakage columns
4. Conclusion section


In [17]:
import os

# Create the reports directory used by the export cell; with exist_ok=True
# an already-existing directory is not an error.
reports_dir = "../reports"
os.makedirs(reports_dir, exist_ok=True)

In [21]:
# Save outputs for documentation
# Writes three CSV exports plus a plain-text summary into ../reports.
# Relies on `df`, `missing` and `leakage_cols` from earlier cells, and on the
# ../reports directory already existing.
missing.head(30).to_csv("../reports/missingness_top30.csv")
df['TARGET'].value_counts(normalize=True).rename('proportion').to_csv("../reports/target_distribution.csv")
pd.Series(leakage_cols, name='flagged_leakage_columns').to_csv("../reports/leakage_columns.csv")

# Count the >80% missing columns directly from `missing` so the summary states
# the real figure; missingness_top30.csv holds the top 30 columns by
# missingness, not the columns above the 80% threshold.
n_high_missing = int((missing > 80).sum())

# Explicit encoding keeps the report identical across platforms.
with open("../reports/conclusions.txt", "w", encoding="utf-8") as f:
    f.write("Data Quality Summary\n")
    f.write(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}\n")
    f.write(f"Duplicate rows: {df.duplicated().sum()}\n")
    f.write(f"Duplicate SK_ID_CURR: {df['SK_ID_CURR'].duplicated().sum()}\n")
    f.write(f"Columns >80% missing: {n_high_missing}\n")
    f.write("Top 30 columns by missingness saved in missingness_top30.csv\n")
    f.write("Recommended split: Stratified split + cross-validation\n")
    f.write("Reason: No clean time variable, governance consistency\n")

print("Reports saved in ../reports")

Reports saved in ../reports


In [30]:
# Run the project's data_load.py script through IPython's `!` shell escape.
# The recorded output below shows a shape tuple and a table of target proportions.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's interpreter — confirm.
!python ../src/data_load.py

(307511, 122)
target
0    0.919271
1    0.080729
Name: proportion, dtype: float64


In [29]:
# Run the project's validate.py script through IPython's `!` shell escape.
# The recorded output below reports a data_quality.json file written under the reports directory.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's interpreter — confirm.
!python ../src/validate.py

Wrote report to /Users/shpresimsadiku/risk-ml/reports/data_quality.json


In [32]:
import importlib.util

# Check whether the `shap` package can be located without importing it;
# find_spec returns None when no such module is found.
shap_spec = importlib.util.find_spec("shap")
print("no" if shap_spec is None else "yes")

no
