In [473]:
# pip install kaggle

In [474]:
import kaggle

# Example command to download a dataset
kaggle.api.dataset_download_files('altruistdelhite04/loan-prediction-problem-dataset', path='/Users/raiffhazanow/Desktop/impeliaMachineLearning', unzip=True)

In [475]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [476]:
# Load the datasets
data = pd.read_csv('dataset/train.csv')

In [477]:
data = data.drop('Loan_ID', axis=1) 

In [478]:
# Preprocess the data
# Fill missing values or drop them
data.fillna(method='ffill', inplace=True)

In [479]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [480]:
data.isnull().sum().sort_values(ascending=False)

LoanAmount           1
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [481]:
data.shape

(614, 12)

In [482]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         613 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [483]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,613.0,614.0,614.0
mean,5403.459283,1621.245798,147.381729,341.628664,0.835505
std,6109.041673,2926.248369,87.512302,65.656819,0.371027
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [484]:
data['Credit_History'] = data['Credit_History'].astype('O')

In [485]:
# Encode categorical variables
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [486]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,,360.0,1,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1,2,1


In [487]:
# Split the data into features and target
X = data.drop('Loan_Status', axis=1)  # assuming 'Loan_Status' is the target column
y = data['Loan_Status']

In [488]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=788)

In [489]:
# Initialize the Decision Tree Classifier
classifier = DecisionTreeClassifier()

In [490]:
# Train the model
classifier.fit(X_train, y_train)

In [491]:
# Make predictions
predictions = classifier.predict(X_test)

In [492]:
# Evaluate the model
# Evaluate the model with various metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")


# Accuracy (0.735 or 73.5%): This is the proportion of the total number of predictions that were correct. 
# It is a measure of how many instances were classified correctly. An accuracy of 73.5% means that, on average, 
# your model made the correct prediction for approximately 73.5 out of every 100 instances.

# Precision (0.858 or 85.8%): This measures the proportion of positive identifications that were actually correct. 
# A precision of 85.8% means that when your model predicted an instance was positive, it was correct 85.8% of the time. 
# In the context of loan prediction, it means that when the model predicts a loan will be approved, it is correct about 85.8% 
# of the time.

# Recall (0.763 or 76.3%): Also known as sensitivity, it measures the proportion of actual positives that were identified 
# correctly. A recall of 76.3% indicates that the model correctly identified 76.3% of all the actual positive instances in 
# the data. For a loan prediction model, it reflects how many of the loans that should have been approved were correctly 
# predicted by the model.

# F1 Score (0.808 or 80.8%): The F1 score is the harmonic mean of precision and recall, and as such, it takes both false 
# positives and false negatives into account. It is a good measure when you need to balance precision and recall. An F1 
# score of 80.8% is quite strong, suggesting that the model has a good balance between precision and recall.

# AUC-ROC (0.743 or 74.3%): The Area Under the Receiver Operating Characteristic (ROC) curve is a measure of the model's 
# ability to distinguish between the classes. An AUC-ROC of 74.3% means that if you randomly choose one positive and one 
# negative instance, there's a 74.3% chance that the model will be able to tell which one is positive and which is negative.

Accuracy: 0.7351351351351352
Precision: 0.8583333333333333
Recall: 0.762962962962963
F1 Score: 0.807843137254902
AUC-ROC: 0.7434814814814815
