In [148]:
import pandas as pd
import numpy as np

np.random.seed(42)


rows = 9000

data = {
    "income": np.random.normal(50000, 20000, rows),
    "debts": np.random.normal(15000, 10000, rows),
    "credit_score": np.random.randint(300, 850, rows),
    "employment_years": np.random.randint(0, 35, rows),
    "loan_amount": np.random.normal(20000, 15000, rows),
    "payment_history": np.random.randint(0, 6, rows),
    "age": np.random.randint(18, 70, rows),
    "loan_term": np.random.choice([12, 24, 36, 48, 60], rows)
}

df = pd.DataFrame(data)

# ---------- Target Variable ----------
df["creditworthy"] = np.where(
    (df["credit_score"] > 650) &
    (df["payment_history"] <= 1) &
    (df["income"] > df["loan_amount"] / 2),
    1,
    0
)

# ---------- Introduce Missing Values ----------
for col in df.columns:
    df.loc[df.sample(frac=0.08).index, col] = np.nan

# ---------- Messy Data ----------
df.loc[df.sample(frac=0.03).index, "income"] *= -1   # negative income
df.loc[df.sample(frac=0.03).index, "debts"] *= 5     # extreme debts

df.sample(2)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
3135,58860.042609,19340.684872,469.0,11.0,16726.394368,3.0,32.0,60.0,0.0
2758,,19137.542029,379.0,17.0,26065.495292,4.0,57.0,12.0,0.0


HANDLING_INCORRECT_DATA

In [149]:
df['income']  = df['income'].abs()

In [150]:
df[df['income'] < 0]

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy


In [151]:
cols = [i for i in df.columns]
for i in range(len(df.columns)):
    df[cols[i]] = df[cols[i]].abs()
    

In [152]:
df.sample(2)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
8580,53478.944168,29022.095895,361.0,23.0,32204.782041,1.0,51.0,24.0,0.0
668,3019.396168,5750.741843,725.0,4.0,,1.0,41.0,36.0,0.0


In [153]:
print('income' ,df['income'].mean(),"debths",df['debts'].mean(),"credit_score",df['credit_score'].mean(),"employment_years",df['employment_years'].median(),"loan_amount",df['loan_amount'].median())

income 49984.73253228217 debths 17611.375167424583 credit_score 574.4972222222223 employment_years 17.0 loan_amount 19995.07859304758


In [154]:
df.columns[0]

'income'

HANDLING MISSING DATA

In [155]:
(((df['income'].isnull().sum()/ len(df))*100) *9) 

np.float64(72.0)

In [156]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline



In [157]:
df['debts'].mean(),df['debts'].median()
df['loan_amount'].mean(),df['loan_amount'].median()

(np.float64(21235.532936896278), 19995.07859304758)

In [158]:
df.head(1)

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy
0,59934.28306,22654.02156,813.0,8.0,15537.525878,3.0,20.0,12.0,


In [159]:
col_tra = ColumnTransformer(
    transformers=[
        ('tr_1',SimpleImputer(strategy='median'),['income','loan_amount']),
        ('tr_2',SimpleImputer(strategy='mean'),['debts','employment_years']),
        ('tr_3',IterativeImputer(),['credit_score','payment_history']),
        ('tr_4',SimpleImputer(strategy='most_frequent'),['age','loan_term']),
    ]
    ,remainder='passthrough'
)

In [160]:
df = pd.DataFrame(col_tra.fit_transform(df),columns=df.columns)

In [161]:
df['creditworthy'].value_counts()

Unnamed: 0_level_0,count
creditworthy,Unnamed: 1_level_1
0.0,7265
1.0,1015


In [181]:
df['creditworthy'].fillna(1,inplace=True)

In [162]:
df.isnull().sum()

Unnamed: 0,0
income,0
debts,0
credit_score,0
employment_years,0
loan_amount,0
payment_history,0
age,0
loan_term,0
creditworthy,720


In [163]:
df['employment_years'].mean()

np.float64(17.0493961352657)

In [164]:
# pro = df[df['employment_years'] > df['age']][['employment_years','age']]
# df['age'] = pro['age'] + 18
for i in range(len(df)):
    if df['employment_years'][i] > df['age'][i]:
        df['age'][i] = df['age'][i] + 18
    else:
        pass

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['age'][i] = df['age'][i] + 18


In [165]:
df[df['employment_years'] > df['age']]

Unnamed: 0,income,debts,credit_score,employment_years,loan_amount,payment_history,age,loan_term,creditworthy


CORRECTING DATA TYPE OF COLUMNS

In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   income            9000 non-null   float64
 1   debts             9000 non-null   float64
 2   credit_score      9000 non-null   float64
 3   employment_years  9000 non-null   float64
 4   loan_amount       9000 non-null   float64
 5   payment_history   9000 non-null   float64
 6   age               9000 non-null   float64
 7   loan_term         9000 non-null   float64
 8   creditworthy      8280 non-null   float64
dtypes: float64(9)
memory usage: 632.9 KB


In [167]:
df['income'].astype('float32').head(1)



Unnamed: 0,income
0,59934.28125


In [168]:
df['debts'].astype('float32').head(1)


Unnamed: 0,debts
0,15537.525391


In [169]:
df['credit_score'].astype('float32').head(1)


Unnamed: 0,credit_score
0,22654.021484


In [None]:
df['employment_years'].astype('float64')
#

Unnamed: 0,employment_years
0,8.000000
1,17.000000
2,34.000000
3,30.000000
4,23.000000
...,...
8995,17.049396
8996,17.049396
8997,14.000000
8998,6.000000


In [None]:
 df['loan_amount'].astype('float32')


Unnamed: 0,loan_amount
0,813.0
1,848.0
2,396.0
3,669.0
4,716.0
...,...
8995,693.0
8996,426.0
8997,413.0
8998,824.0


In [None]:
df['age'].astype('int8')
#

Unnamed: 0,age
0,20
1,65
2,39
3,64
4,66
...,...
8995,22
8996,31
8997,51
8998,27


In [None]:
 df['loan_term'].astype('int8')


Unnamed: 0,loan_term
0,12
1,12
2,12
3,12
4,36
...,...
8995,24
8996,36
8997,36
8998,36


In [182]:
df['creditworthy'].astype('int8')

Unnamed: 0,creditworthy
0,1
1,0
2,0
3,0
4,0
...,...
8995,0
8996,1
8997,0
8998,1


In [183]:
df.isnull().sum()

Unnamed: 0,0
income,0
debts,0
credit_score,0
employment_years,0
loan_amount,0
payment_history,0
age,0
loan_term,0
creditworthy,0


TREATING OUTLIERS

In [187]:
median = df['income'].median()

In [184]:
df['income'].describe()

Unnamed: 0,income
count,9000.0
mean,49975.880148
std,19131.880423
min,11.885703
25%,37678.21448
50%,49874.077732
75%,62073.252287
max,128524.754129
