In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
credit_risk = pd.read_csv("credit_risk_dataset.csv")
credit_risk.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
credit_risk.rename(columns={'cb_person_default_on_file':'defaulter',
                   'cb_person_cred_hist_length':'credit_hist_length (years)'},
                  inplace=True)

In [4]:
#checking null values in all the columns
credit_risk.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
defaulter                        0
credit_hist_length (years)       0
dtype: int64

In [2]:
#Dropping unwanted columns
credit_risk.drop(["person_emp_length","loan_grade","loan_percent_income"],axis=1,inplace

SyntaxError: incomplete input (2981925314.py, line 2)

In [6]:
#Dropping the column with no lable
credit_risk.dropna(subset=["defaulter"],inplace=True)
credit_risk.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
loan_intent                      0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
defaulter                        0
credit_hist_length (years)       0
dtype: int64

## SPLIT THE DATA INTO FEATURES AND LABELS (X & Y)

In [7]:
X=credit_risk.drop("defaulter",axis=1) #includes the whole df except the defaulter column
y=credit_risk["defaulter"]

## FILLING UP THE MISSING VALUES

In [8]:
#heatmap gives a representation of the null values present in the datasaet
# sns.heatmap(credit_risk.isnull(),yticklabels=False,cbar=False,cmap='viridis');

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Filling up the categorical value with "missing" & numerical value with mean
cat_imputer = SimpleImputer(strategy="constant",fill_value="missing")
num_imputer = SimpleImputer(strategy="mean")
#define column
cat_features = ["person_age","person_income","person_home_ownership",
                "loan_intent","loan_amnt","loan_status",
                "credit_hist_length (years)"]
num_features = ["loan_int_rate"]

#Creating imputer to fill up the missing values
imputer = ColumnTransformer([("cat_imputer",cat_imputer,cat_features),
                             ("num_imputer",num_imputer,num_features)])

#Transfrom the data
filled_X = imputer.fit_transform(X)
filled_X

array([[22, 59000, 'RENT', ..., 1, 3, 16.02],
       [21, 9600, 'OWN', ..., 0, 2, 11.14],
       [25, 9600, 'MORTGAGE', ..., 1, 3, 12.87],
       ...,
       [65, 76000, 'RENT', ..., 1, 28, 10.99],
       [56, 150000, 'MORTGAGE', ..., 0, 26, 11.48],
       [66, 42000, 'RENT', ..., 0, 30, 9.99]], dtype=object)

In [10]:
credit_risk_filled = pd.DataFrame(filled_X,
                                 columns=["person_age","person_income","person_home_ownership",
                                          "loan_intent","loan_amnt","loan_status","loan_int_rate",
                                          "credit_hist_length(years)",])
credit_risk_filled.head()

Unnamed: 0,person_age,person_income,person_home_ownership,loan_intent,loan_amnt,loan_status,loan_int_rate,credit_hist_length(years)
0,22,59000,RENT,PERSONAL,35000,1,3,16.02
1,21,9600,OWN,EDUCATION,1000,0,2,11.14
2,25,9600,MORTGAGE,MEDICAL,5500,1,3,12.87
3,23,65500,RENT,MEDICAL,35000,1,2,15.23
4,24,54400,RENT,MEDICAL,35000,1,4,14.27


In [11]:
credit_risk_filled.isna().sum()

person_age                   0
person_income                0
person_home_ownership        0
loan_intent                  0
loan_amnt                    0
loan_status                  0
loan_int_rate                0
credit_hist_length(years)    0
dtype: int64

## CONVERTING THE STRING INTO NUMBERS

In [13]:
credit_risk_filled.dtypes

person_age                   object
person_income                object
person_home_ownership        object
loan_intent                  object
loan_amnt                    object
loan_status                  object
loan_int_rate                object
credit_hist_length(years)    object
dtype: object

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["person_age","person_income","person_home_ownership",
                        "loan_intent","loan_amnt","loan_status","loan_int_rate",
                        "credit_hist_length(years)"]
one_hot = OneHotEncoder()
transfromer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                remainder ='passthrough')
transformed_X = transfromer.fit_transform(credit_risk_filled)
transformed_X

<32581x5496 sparse matrix of type '<class 'numpy.float64'>'
	with 260648 stored elements in Compressed Sparse Row format>