In [16]:
import pandas as pd
import numpy as np

np.random.seed(42)


rows = 9000

data = {
    "income": np.random.normal(50000, 20000, rows),
    "debts": np.random.normal(15000, 10000, rows),
    "credit_score": np.random.randint(300, 850, rows),
    "employment_years": np.random.randint(0, 35, rows),
    "loan_amount": np.random.normal(20000, 15000, rows),
    "payment_history": np.random.randint(0, 6, rows),
    "age": np.random.randint(18, 70, rows),
    "loan_term": np.random.choice([12, 24, 36, 48, 60], rows)
}

df = pd.DataFrame(data)

# ---------- Target Variable ----------
df["creditworthy"] = np.where(
    (df["credit_score"] > 650) &
    (df["payment_history"] <= 1) &
    (df["income"] > df["loan_amount"] / 2),
    1,
    0
)

# ---------- Introduce Missing Values ----------
for col in df.columns:
    df.loc[df.sample(frac=0.08).index, col] = np.nan

# ---------- Messy Data ----------
df.loc[df.sample(frac=0.03).index, "income"] *= -1   # negative income
df.loc[df.sample(frac=0.03).index, "debts"] *= 5     # extreme debts

df.sample(2)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
3135,58860.042609,19340.684872,469.0,11.0,16726.394368,3.0,32.0,60.0,0.0
2758,,19137.542029,379.0,17.0,26065.495292,4.0,57.0,12.0,0.0


HANDLING_INCORRECT_DATA

In [17]:
df['income']  = df['income'].abs()

In [18]:
df[df['income'] < 0]

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy


In [19]:
cols = [i for i in df.columns]
for i in range(len(df.columns)):
    df[cols[i]] = df[cols[i]].abs()
    

In [20]:
df.sample(2)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
8580,53478.944168,29022.095895,361.0,23.0,32204.782041,1.0,51.0,24.0,0.0
668,3019.396168,5750.741843,725.0,4.0,,1.0,41.0,36.0,0.0


In [21]:
print('income' ,df['income'].mean(),"debths",df['debts'].mean(),"credit_score",df['credit_score'].mean(),"employment_years",df['employment_years'].median(),"loan_amount",df['loan_amount'].median())

income 49984.73253228217 debths 17611.375167424583 credit_score 574.4972222222223 employment_years 17.0 loan_amount 19995.07859304758


In [22]:
df.columns[0]

'income'

HANDLING MISSING DATA

In [23]:
(((df['income'].isnull().sum()/ len(df))*100) *9) 

np.float64(72.0)

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline



In [25]:
df['debts'].mean(),df['debts'].median()
df['loan_amount'].mean(),df['loan_amount'].median()

(np.float64(21235.532936896278), 19995.07859304758)

In [26]:
df.head(1)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
0,59934.28306,22654.02156,813.0,8.0,15537.525878,3.0,20.0,12.0,


In [27]:
col_tra = ColumnTransformer(
    transformers=[
        ('tr_1',SimpleImputer(strategy='median'),['income','loan_amount']),
        ('tr_2',SimpleImputer(strategy='mean'),['debts','employment_years']),
        ('tr_3',IterativeImputer(),['credit_score','payment_history']),
        ('tr_4',SimpleImputer(strategy='most_frequent'),['age','loan_term']),
    ]
    ,remainder='passthrough'
)

In [28]:
df = pd.DataFrame(col_tra.fit_transform(df),columns=df.columns)

In [29]:
df['creditworthy'].value_counts()

Unnamed: 0_level_0,count
creditworthy,Unnamed: 1_level_1
0.0,7265
1.0,1015


In [None]:
df.isnull().sum()

Unnamed: 0,0
income,0
debts,0
credit_score,0
employment_years,0
loan_amount,0
payment_history,0
age,0
loan_term,0
creditworthy,720


In [34]:
df['employment_years'].mean()

np.float64(17.0493961352657)

In [None]:
pro = df[df['employment_years'] > df['age']][['employment_years','age']]
df['employment_years'] = pro['employment_years'] - 18
    

In [42]:
df[df['employment_years'] > df['age']]

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
