In [4]:
# ================================
# Notebook 1: Basic Data Understanding
# ================================

import os

import numpy as np
import pandas as pd

# Dataset location. Set the CREDIT_RISK_DATA_PATH environment variable to point
# at another copy of the CSV; when it is unset, the path below is used, so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "CREDIT_RISK_DATA_PATH",
    "/Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/DATA/credit_risk_dataset.csv",
)
# Name of the label column.
TARGET = "loan_status"   # 1 = default, 0 = repaid

# 1) Load data
# Read the CSV at DATA_PATH into `df`, then report where it came from,
# its (rows, cols) shape and its first five rows.
df = pd.read_csv(DATA_PATH, low_memory=False)
print(f"File loaded: {DATA_PATH}")
print(f"Shape: {df.shape}")
display(df.iloc[:5])

# 2) Basic schema
# Per-column dtypes first, then pandas' own non-null/dtype summary.
print("\n--- Dtypes ---", df.dtypes, sep="\n")

print("\n--- .info() ---")
df.info()

# deep=True asks pandas to measure the contents of object columns as well.
BYTES_PER_MB = 1024 ** 2
total_bytes = df.memory_usage(deep=True).sum()
mem_mb = total_bytes / BYTES_PER_MB
print("\nApprox memory usage: {:.2f} MB".format(mem_mb))

# 3) Target sanity checks
# Stop immediately when the label column is absent; everything below reads it.
if TARGET not in df.columns:
    raise ValueError(f"Target '{TARGET}' not found in columns!")

target_series = df[TARGET]
observed_labels = target_series.dropna().unique()   # distinct non-missing labels

print("\n--- Target checks ---")
print("Unique values in target:", sorted(observed_labels))
print("Is target numeric?", pd.api.types.is_numeric_dtype(target_series))

# The label should contain nothing but 0 and 1 (missing values are ignored here).
is_binary = set(observed_labels) <= {0, 1}
print("Is target strictly binary 0/1?", is_binary)

# Class balance: raw counts (NaN kept as its own row) and share of all rows.
print("\nTarget distribution:")
target_counts = target_series.value_counts(dropna=False).sort_index()
target_pct = target_counts.div(len(df)).mul(100).round(2)
display(pd.DataFrame({"count": target_counts, "percent": target_pct}))

# 4) Missing values overview
# Percentage of missing cells per column, worst columns first.
print("\n--- Missingness overview ---")
null_share = df.isna().mean()
missing_pct = null_share.mul(100).sort_values(ascending=False)
has_missing = missing_pct.gt(0).any()
print("Any missing values?", has_missing)
display(missing_pct.iloc[:20].to_frame(name="missing_%"))

# 5) Split columns into numeric vs categorical (initial guess)
# Every non-target column lands in exactly one list: numeric when pandas
# reports a numeric dtype, categorical otherwise.
is_numeric = pd.api.types.is_numeric_dtype
feature_cols = [col for col in df.columns if col != TARGET]
numeric_cols = [col for col in feature_cols if is_numeric(df[col])]
categorical_cols = [col for col in feature_cols if not is_numeric(df[col])]

print("\n# numeric (excl. target):", len(numeric_cols))
print("# categorical:", len(categorical_cols))

# 6) Uniqueness to spot ID-like columns
# For each column: number of distinct non-missing values, that number as a
# percentage of all rows, and the dtype. Columns whose percentage exceeds the
# threshold are reported as possible identifiers.
print("\n--- Uniqueness stats ---")
ID_LIKE_PCT_THRESHOLD = 90   # flag columns with more than this % distinct values
n_rows = len(df)
uniq_rows = []
for c in df.columns:
    n_unique = df[c].nunique(dropna=True)   # NaN is not counted as a value
    # With zero rows (e.g. a header-only CSV) the plain division would raise
    # ZeroDivisionError, so report 0% instead of crashing the notebook.
    pct_unique = (n_unique / n_rows) * 100 if n_rows else 0.0
    uniq_rows.append([c, n_unique, round(pct_unique, 2), str(df[c].dtype)])

uniqueness = pd.DataFrame(uniq_rows, columns=["column", "n_unique", "pct_unique", "dtype"])
uniqueness = uniqueness.sort_values(["pct_unique", "n_unique"], ascending=False)
display(uniqueness.head(20))

# The target itself is never reported as ID-like.
is_id_like = (uniqueness["pct_unique"] > ID_LIKE_PCT_THRESHOLD) & (uniqueness["column"] != TARGET)
id_like_cols = uniqueness.loc[is_id_like, "column"].tolist()
print("Potential ID-like columns:", id_like_cols if id_like_cols else "None")

# 7) Constant / near-constant / duplicate columns
print("\n--- Constant / near-constant / duplicate columns ---")
# A column is constant when it holds at most one distinct value, with NaN
# counted as a value (dropna=False).
const_cols = [col for col in df.columns if df[col].nunique(dropna=False) <= 1]
print("Constant columns:", const_cols or "None")

# Numeric features where a single value (NaN included) covers more than 99% of rows.
near_const_numeric = []
for col in numeric_cols:
    shares = df[col].value_counts(normalize=True, dropna=False)
    if len(shares) == 0:
        continue
    # value_counts orders by frequency, so the first entry is the dominant value.
    top_share = shares.iloc[0]
    if top_share > 0.99:
        near_const_numeric.append(col)
print("Near-constant numeric (>99% same value):", near_const_numeric or "None")

# Duplicate-column detection: two columns count as duplicates when their values,
# rendered as strings, match row for row. The first column seen with a given
# content is the reference; later matches are recorded as (col, duplicate_of).
dup_pairs = []
first_seen = {}
for col in df.columns:
    fingerprint = tuple(df[col].astype(str))
    if fingerprint not in first_seen:
        first_seen[fingerprint] = col
        continue
    dup_pairs.append((col, first_seen[fingerprint]))
print("Duplicate columns (col, duplicate_of):", dup_pairs or "None")

# 8) Quick numeric health (mins/max/percentiles)
# One row per numeric feature: missing-value count plus min, 1st percentile,
# median, 99th percentile and max, all computed while ignoring NaN.
print("\n--- Numeric health ---")
STAT_NAMES = ("min", "p01", "median", "p99", "max")
num_health_rows = []
for col in numeric_cols:
    values = pd.to_numeric(df[col], errors="coerce")
    if values.notna().any():
        stats = {
            "min": float(np.nanmin(values)),
            "p01": float(np.nanpercentile(values, 1)),
            "median": float(np.nanmedian(values)),
            "p99": float(np.nanpercentile(values, 99)),
            "max": float(np.nanmax(values)),
        }
    else:
        # Nothing but NaN in this column: every statistic is NaN.
        stats = dict.fromkeys(STAT_NAMES, np.nan)
    num_health_rows.append({"column": col, "n_nan": int(values.isna().sum()), **stats})
num_health = pd.DataFrame(num_health_rows)
display(num_health.iloc[:15])

# 9) Simple leakage name scan (just a heuristic)
# Flags every non-target column whose lower-cased name contains one of the
# keywords below as a substring. Matching is on names only, never on values.
print("\n--- Potential leakage by name (heuristic) ---")
leak_words = [
    "default", "charge", "charged", "writeoff", "dpd", "delinq", "overdue",
    "paid", "payoff", "repaid", "settled", "status", "collection", "recover",
    "npa", "bucket",
]
leak_suspects = [
    col
    for col in df.columns
    if col != TARGET and any(word in col.lower() for word in leak_words)
]
print("Leakage suspects:", leak_suspects or "None")

# 10) Basic describe (non-visual)
# Summary statistics, one row per numeric feature (at most the first 15).
print("\n--- Numeric describe() ---")
if not numeric_cols:
    print("No numeric features found.")
else:
    display(df[numeric_cols].describe().transpose().iloc[:15])

# For up to ten categorical columns, list the twelve most frequent values
# (NaN included) with their counts and their share of all rows.
print("\n--- Top values for first few categorical columns ---")
n_total = len(df)
for col in categorical_cols[:10]:
    top_counts = df[col].value_counts(dropna=False).iloc[:12]
    top_share = (top_counts / n_total * 100).round(2)
    summary = pd.DataFrame(
        {
            col: top_counts.index.astype(str),
            "count": top_counts.to_numpy(),
            "percent": top_share.to_numpy(),
        }
    )
    print(f"\nTop values for {col}:")
    display(summary)

# Closing banner: marks the end of this notebook and names the next one.
print(
    "\n✅ Basic Data Understanding done.",
    "Next notebook: EDA (visuals, correlations, target-wise cuts).",
    sep="\n",
)


File loaded: /Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/DATA/credit_risk_dataset.csv
Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4



--- Dtypes ---
person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object

--- .info() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-nul

Unnamed: 0_level_0,count,percent
loan_status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25473,78.18
1,7108,21.82



--- Missingness overview ---
Any missing values? True


Unnamed: 0,missing_%
loan_int_rate,9.563856
person_emp_length,2.747
person_age,0.0
person_income,0.0
person_home_ownership,0.0
loan_intent,0.0
loan_grade,0.0
loan_amnt,0.0
loan_status,0.0
loan_percent_income,0.0



# numeric (excl. target): 7
# categorical: 4

--- Uniqueness stats ---


Unnamed: 0,column,n_unique,pct_unique,dtype
1,person_income,4295,13.18,int64
6,loan_amnt,753,2.31,int64
7,loan_int_rate,348,1.07,float64
9,loan_percent_income,77,0.24,float64
0,person_age,58,0.18,int64
3,person_emp_length,36,0.11,float64
11,cb_person_cred_hist_length,29,0.09,int64
5,loan_grade,7,0.02,object
4,loan_intent,6,0.02,object
2,person_home_ownership,4,0.01,object


Potential ID-like columns: None

--- Constant / near-constant / duplicate columns ---
Constant columns: None
Near-constant numeric (>99% same value): None
Duplicate columns (col, duplicate_of): None

--- Numeric health ---


Unnamed: 0,column,n_nan,min,p01,median,p99,max
0,person_age,0,20.0,21.0,26.0,50.0,144.0
1,person_income,0,4000.0,14400.0,55000.0,225200.0,6000000.0
2,person_emp_length,895,0.0,0.0,4.0,18.0,123.0
3,loan_amnt,0,500.0,1000.0,8000.0,29800.0,35000.0
4,loan_int_rate,3116,5.42,5.42,10.99,18.62,23.22
5,loan_percent_income,0,0.0,0.02,0.15,0.5,0.83
6,cb_person_cred_hist_length,0,2.0,2.0,4.0,17.0,30.0



--- Potential leakage by name (heuristic) ---
Leakage suspects: ['cb_person_default_on_file']

--- Numeric describe() ---


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,32581.0,27.7346,6.348078,20.0,23.0,26.0,30.0,144.0
person_income,32581.0,66074.84847,61983.119168,4000.0,38500.0,55000.0,79200.0,6000000.0
person_emp_length,31686.0,4.789686,4.14263,0.0,2.0,4.0,7.0,123.0
loan_amnt,32581.0,9589.371106,6322.086646,500.0,5000.0,8000.0,12200.0,35000.0
loan_int_rate,29465.0,11.011695,3.240459,5.42,7.9,10.99,13.47,23.22
loan_percent_income,32581.0,0.170203,0.106782,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,32581.0,5.804211,4.055001,2.0,3.0,4.0,8.0,30.0



--- Top values for first few categorical columns ---

Top values for person_home_ownership:


Unnamed: 0,person_home_ownership,count,percent
0,RENT,16446,50.48
1,MORTGAGE,13444,41.26
2,OWN,2584,7.93
3,OTHER,107,0.33



Top values for loan_intent:


Unnamed: 0,loan_intent,count,percent
0,EDUCATION,6453,19.81
1,MEDICAL,6071,18.63
2,VENTURE,5719,17.55
3,PERSONAL,5521,16.95
4,DEBTCONSOLIDATION,5212,16.0
5,HOMEIMPROVEMENT,3605,11.06



Top values for loan_grade:


Unnamed: 0,loan_grade,count,percent
0,A,10777,33.08
1,B,10451,32.08
2,C,6458,19.82
3,D,3626,11.13
4,E,964,2.96
5,F,241,0.74
6,G,64,0.2



Top values for cb_person_default_on_file:


Unnamed: 0,cb_person_default_on_file,count,percent
0,N,26836,82.37
1,Y,5745,17.63



✅ Basic Data Understanding done.
Next notebook: EDA (visuals, correlations, target-wise cuts).
