In [46]:
import pandas as pd
import numpy as np

In [47]:
df=pd.read_csv("credit_risk.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [48]:
print(df.isnull().sum())

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [49]:
df["MonthlyIncome"]=df["MonthlyIncome"].fillna(df["MonthlyIncome"].median())
df["NumberOfDependents"]=df["NumberOfDependents"].fillna(0)

In [50]:
df["BalanceToIncome"]=df["DebtRatio"]*df["MonthlyIncome"]
df["CreditLinePerDependent"]=df["NumberOfOpenCreditLinesAndLoans"]/(df["NumberOfDependents"]+1)

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [52]:
features=df.drop(columns=['Unnamed: 0','SeriousDlqin2yrs'])
scalar=StandardScaler()
scaled=scalar.fit_transform(features)

pca=PCA(n_components=5)
pca_feature=pca.fit_transform(scaled)

for i in range(5):
    df[f"pc{i+1}"]=pca_feature[:,1]
    
final_features=features.copy()
final_features[['pc1','pc2','pc3','pc4','pc5']]=pca_feature

In [53]:
final_features.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,BalanceToIncome,CreditLinePerDependent,pc1,pc2,pc3,pc4,pc5
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,7323.197016,4.333333,-0.219577,1.227387,-0.793296,3.405489,-0.126634
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,316.878123,2.0,0.266105,-1.491601,0.796718,-0.133168,-0.087324
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,258.914887,2.0,0.578437,-1.467499,0.822266,-0.7916,-0.029126
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,118.963951,5.0,0.197326,-1.063966,0.484465,-0.552589,-0.096179
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1584.975094,7.0,-0.0428,0.127395,-0.566309,0.82987,0.805368


In [54]:

y=df['SeriousDlqin2yrs']

In [55]:
X_train,X_test,y_train,y_test=train_test_split(final_features,y,test_size=0.2,random_state=42)

In [56]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
X_res,y_res=smote.fit_resample(X_train,y_train)

In [57]:
model=RandomForestClassifier(class_weight='balanced',n_estimators=100)
model.fit(X_res,y_res)

In [61]:
y_proba=model.predict_proba(X_test)[:,1]
y_pred=(y_proba>0.46).astype(int)
print("ROS AUC:",roc_auc_score(y_test,y_proba))
print("Classification report:",classification_report(y_pred,y_test))
print("Confusion matrix:",confusion_matrix(y_pred,y_test))

ROS AUC: 0.838843955116981
Classification report:               precision    recall  f1-score   support

           0       0.94      0.96      0.95     27409
           1       0.47      0.35      0.40      2591

    accuracy                           0.91     30000
   macro avg       0.70      0.66      0.68     30000
weighted avg       0.90      0.91      0.90     30000

Confusion matrix: [[26363  1046]
 [ 1681   910]]


In [59]:
import joblib
joblib.dump(model,'credit_risk_model.pkl')

['credit_risk_model.pkl']

In [60]:
joblib.dump(pca,'pca_model.pkl')
joblib.dump(scalar,'scaler_model.pkl')

['scaler_model.pkl']