In [4]:
import pandas as pd
import numpy as np

In [5]:
# Absolute location of the raw loan CSV.
path = "/content/loan_data(1).csv"
# Load the file into the working frame that the following cells transform.
df = pd.read_csv(filepath_or_buffer=path)

In [6]:
# Normalise headers to snake_case in three passes:
#   1. trim surrounding whitespace and lowercase,
#   2. collapse every run of characters outside [a-z0-9] into one underscore,
#   3. strip underscores left dangling at either end.
_trimmed_lower = df.columns.str.strip().str.lower()
_underscored = _trimmed_lower.str.replace(r"[^a-z0-9]+", "_", regex=True)
df.columns = _underscored.str.strip("_")

In [7]:
# Remove rows that are exact duplicates of an earlier row (first occurrence wins).
df = df.drop_duplicates(keep="first")

In [8]:
# Share of missing values in each column.
null_ratio = df.isnull().mean()
# Columns that are more than 40% missing are discarded.
drop_cols = [name for name, share in null_ratio.items() if share > 0.40]
df = df.drop(columns=drop_cols)

In [10]:
# Name of the label column; later cells refer to it through this variable.
target_col = "loan_status"
if target_col in df.columns:
    # Keep the untouched column: values the text lookup does not recognise
    # fall back to a plain numeric parse of the original value.
    _raw_target = df[target_col]
    _label_map = {"1": 1, "0": 0, "yes": 1, "no": 0, "true": 1, "false": 0}

    # Text lookup on a trimmed, lowercased string view of the column.
    _mapped_target = _raw_target.astype(str).str.strip().str.lower().map(_label_map)

    # One vectorised pd.to_numeric call replaces the previous per-row
    # Series.apply(pd.to_numeric, ...); unparseable values still become NaN.
    df[target_col] = _mapped_target.fillna(
        pd.to_numeric(_raw_target, errors="coerce")
    )

In [11]:
# Finalise the label column as integers.
# Guarded like the mapping step above so a dataset without the label column
# does not raise KeyError here.
if target_col in df.columns:
    _has_label = df[target_col].notna()
    _n_unlabelled = int((~_has_label).sum())
    if _n_unlabelled:
        # A missing/unparseable label is not evidence of class 0; imputing it
        # would fabricate training targets, so those rows are removed instead.
        print(f"Dropping {_n_unlabelled} rows with missing or unparseable {target_col}")
        df = df.loc[_has_label].copy()
    df[target_col] = df[target_col].astype(int)

In [12]:
# dtype-based split of the columns as they stand before manual type coercion.
num_cols_auto = list(df.select_dtypes(include="number").columns)
cat_cols_auto = list(df.select_dtypes(exclude="number").columns)


In [13]:
# Columns expected to hold categorical text; each is normalised if present.
likely_cats = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file",
]
for c in likely_cats:
    if c not in df.columns:
        continue

    _original = df[c]
    # astype(str) turns NaN into the literal text "nan", which would then be
    # treated as a real category and skipped by the later "unknown" fill.
    # Re-blank the positions that were missing in the source column.
    _cleaned = (
        _original.astype(str).str.strip().str.lower().where(_original.notna())
    )

    if c == "previous_loan_defaults_on_file":
        # yes/no -> 1/0; any other text is kept for a numeric parse and
        # becomes NaN when it is not a number. to_numeric leaves an
        # already-numeric series unchanged, so no dtype check is needed.
        _flag = _cleaned.map({"yes": 1, "no": 0})
        _cleaned = pd.to_numeric(_flag.fillna(_cleaned), errors="coerce")

    df[c] = _cleaned

In [14]:
# Columns expected to be numeric; values that cannot be parsed become NaN.
likely_nums = [
    "person_age", "person_income", "person_emp_exp", "loan_amnt", "loan_int_rate",
    "loan_percent_income", "cb_person_cred_hist_length", "credit_score",
]
for _num_name in (name for name in likely_nums if name in df.columns):
    df[_num_name] = pd.to_numeric(df[_num_name], errors="coerce")

In [16]:
# Plausible (low, high) limits per column; None means "no limit on that side".
# Values outside the limits are treated as data-entry errors and blanked to NaN.
bounds = {
    "person_age": (18, 100),
    "person_emp_exp": (0, 60),
    "person_income": (0, None),
    "loan_amnt": (0, None),
    "loan_int_rate": (0, 40),
    "loan_percent_income": (0, 1.5),
    "cb_person_cred_hist_length": (0, 50),
    "credit_score": (300, 850),
}
for c, (lo, hi) in bounds.items():
    if c not in df.columns:
        continue

    # Build one out-of-range mask per column. Comparisons against NaN are
    # False, so values that are already missing stay untouched.
    _out_of_range = pd.Series(False, index=df.index)
    if lo is not None:
        _out_of_range = _out_of_range | (df[c] < lo)
    if hi is not None:
        _out_of_range = _out_of_range | (df[c] > hi)

    if _out_of_range.any():
        # Series.mask upcasts an integer column to float as needed. Writing
        # np.nan through df.loc[...] into an int64 column is a setitem upcast,
        # which pandas has deprecated and newer versions reject.
        df[c] = df[c].mask(_out_of_range)

In [17]:
# Recompute the numeric / non-numeric split on the current frame.
num_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(exclude="number").columns)

In [18]:
# Median of every numeric feature (the label column is excluded).
_feature_medians = {
    name: df[name].median() for name in num_cols if name != target_col
}
# Fill each feature's gaps with that feature's own median.
for _feature, med in _feature_medians.items():
    df[_feature] = df[_feature].fillna(med)

In [19]:
# Missing values in non-numeric columns get the placeholder label "unknown".
for _cat_name in cat_cols:
    _filled = df[_cat_name].fillna("unknown")
    df[_cat_name] = _filled


In [20]:
# One-hot encode every non-numeric column except the label; the first level of
# each column is dropped so the indicator columns are not redundant.
cols_to_encode = list(filter(lambda name: name != target_col, cat_cols))
df = pd.get_dummies(data=df, columns=cols_to_encode, drop_first=True)


In [21]:
# Number of distinct values per column.
nunique = df.nunique()
# Columns with at most one distinct value carry no information; drop them.
const_cols = [name for name, n_distinct in nunique.items() if n_distinct <= 1]
if const_cols:
    df = df.drop(columns=const_cols)


In [23]:
# Absolute pairwise correlations between the numeric columns.
corr = df.corr(numeric_only=True).abs()
# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation) is ignored.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# A column is flagged when it correlates above 0.98 with any earlier column.
_is_redundant = (upper > 0.98).any(axis=0)
to_drop_corr = [name for name, hit in _is_redundant.items() if hit]
if to_drop_corr:
    # The label column is never removed, even if it was flagged.
    to_drop_corr = [name for name in to_drop_corr if name != target_col]
    df = df.drop(columns=to_drop_corr, errors="ignore")

In [24]:
# Loan amount relative to income, limited to the range [0, 2].
if "loan_amnt" in df.columns and "person_income" in df.columns:
    # The tiny epsilon keeps the division defined when income is 0.
    _amount_to_income = df["loan_amnt"].div(df["person_income"] + 1e-9)
    df["feat_dti_approx"] = _amount_to_income.clip(lower=0, upper=2)

In [25]:
# Interaction feature: interest rate multiplied by credit score.
if "loan_int_rate" in df.columns and "credit_score" in df.columns:
    df["feat_rate_x_score"] = df["loan_int_rate"].mul(df["credit_score"])

In [26]:
# Put the label column first; the other columns keep their relative order.
cols = list(df.columns)
if target_col in cols:
    cols = [target_col, *(name for name in cols if name != target_col)]
    df = df[cols]


In [27]:
# Write the cleaned frame to disk without the row index.
clean_path = "/content/loan_data_clean.csv"
df.to_csv(path_or_buf=clean_path, index=False)

# Short report: final shape, output location, and the first ten rows.
for _label, _value in (("Clean data shape:", df.shape), ("Saved to:", clean_path)):
    print(_label, _value)
print(df.head(10))



Clean data shape: (45000, 25)
Saved to: /content/loan_data_clean.csv
   loan_status  person_age  person_income  person_emp_exp  loan_amnt  \
0            1        22.0        71948.0             0.0    35000.0   
1            0        21.0        12282.0             0.0     1000.0   
2            1        25.0        12438.0             3.0     5500.0   
3            1        23.0        79753.0             0.0    35000.0   
4            1        24.0        66135.0             1.0    35000.0   
5            1        21.0        12951.0             0.0     2500.0   
6            1        26.0        93471.0             1.0    35000.0   
7            1        24.0        95550.0             5.0    35000.0   
8            1        24.0       100684.0             3.0    35000.0   
9            1        21.0        12739.0             0.0     1600.0   

   loan_int_rate  loan_percent_income  cb_person_cred_hist_length  \
0          16.02                 0.49                         3.0   
