# Model Selection


Basic Setup and Loading Data

In [11]:
import pandas as pd
import numpy as np

data = pd.read_csv('credit_risk_dataset (2).csv')

data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4



Data Preprocessing

In [12]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values
data['person_emp_length'].fillna(data['person_emp_length'].median(), inplace=True)

# Encode categorical variables
encoder = LabelEncoder()
data['person_home_ownership'] = encoder.fit_transform(data['person_home_ownership'])
data['loan_intent'] = encoder.fit_transform(data['loan_intent'])
data['loan_grade'] = encoder.fit_transform(data['loan_grade'])
data['cb_person_default_on_file'] = encoder.fit_transform(data['cb_person_default_on_file'])


Split Data

In [13]:
from sklearn.model_selection import train_test_split

# Features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Baseline Model

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)  # X is your feature set

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Baseline model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7983734847322388


Compare Algorithms

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Decision Tree Accuracy: 0.8833819241982507
Random Forest Accuracy: 0.9281878164799755
