In [2]:
import pandas as pd

In [3]:
# KNN
dataset = pd.read_csv("dataset/train.csv")
dataset.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:

# Clean the data
def clean_dataset(dataset, test_data = False):
    dataset = dataset.drop("Loan_ID", axis=1)
    gender_mapping = {"nan": 0, "Male": 1, "Female": 2}
    dataset["Gender"] = dataset["Gender"].map(gender_mapping)
    # dataset.head()
    education_mapping = {"Not Graduate": 0, "Graduate": 1}
    dataset["Education"] = dataset["Education"].map(education_mapping)

    # dataset.Property_Area.unique()
    property_mapping = {"Urban": 0, "Rural": 1, "Semiurban": 2}
    dataset["Property_Area"] = dataset["Property_Area"].map(property_mapping)

    married_mapping = {"No": 0, "Yes": 1}
    dataset["Married"] = dataset["Married"].map(married_mapping)

    self_employed_mapping = {"No": 0, "Yes": 1}
    dataset["Self_Employed"] = dataset["Self_Employed"].map(self_employed_mapping)

    dependents_mapping = {"0": 0, "nan": 0, "1": 1, "2": 2, "3+": 3}
    dataset["Dependents"] = dataset["Dependents"].map(dependents_mapping)

    # Assume 0 since it seems like the most logical thing to do.
    # dataset['LoanAmount'].fillna(0, inplace=True)
    # dataset['CoapplicantIncome'].fillna(0, inplace=True)
    # dataset['Loan_Amount_Term'].fillna(0, inplace=True)
    # dataset['ApplicantIncome'].fillna(0, inplace=True)
    dataset.fillna(0, inplace=True)

    if not test_data:
        loan_mapping = {"N": 0, "Y": 1}
        dataset["Loan_Status"] = dataset["Loan_Status"].map(loan_mapping)
    return dataset

# dataset.head(50)


In [5]:
# dataset = clean_dataset(dataset)

In [6]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
dataset = dataset.dropna()
dataset = dataset.drop("Loan_ID", axis=1)
X = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [9]:
# dataset splitting

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
# X_train

In [10]:
# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # Scaling for numeric features
        ('cat', OneHotEncoder(), categorical_features)  # One-hot encoding for categorical features
    ])

# Create a pipeline with preprocessor and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])


In [11]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [12]:
accuracy = pipeline.score(X_test, y_test)
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.83


In [17]:
predictions = pipeline.predict(X_test)
# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label='Y')
recall = recall_score(y_test, predictions, pos_label='Y')
f1 = f1_score(y_test, predictions, pos_label='Y')

label_encoder = LabelEncoder()
y_true_numeric = label_encoder.fit_transform(y_test)
predictions_numeric = label_encoder.transform(predictions)

roc_auc = roc_auc_score(y_true_numeric, predictions_numeric)

# Print metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print(f'AUC-ROC: {roc_auc:.4f}')

# Classification Report
print('\nClassification Report:')
print(classification_report(y_test, predictions))

Accuracy: 0.8333
Precision: 0.8347
Recall: 0.9619
F1-Score: 0.8938
AUC-ROC: 0.7245

Classification Report:
              precision    recall  f1-score   support

           N       0.83      0.49      0.61        39
           Y       0.83      0.96      0.89       105

    accuracy                           0.83       144
   macro avg       0.83      0.72      0.75       144
weighted avg       0.83      0.83      0.82       144



In [None]:
dataset_test = pd.read_csv("dataset/test.csv")
dataset_test = dataset_test.drop("Loan_ID", axis=1)
dataset_test = dataset_test.dropna()

In [None]:
predictions = pipeline.predict(dataset_test)
print(predictions)

['Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'N'
 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'N' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y'
 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'N' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'N' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y

The whole process onwards is incorrect and offers an accuracy of 0.99 lol

In [None]:
exit()

In [None]:
# dataset scaling

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fitting and prediction with testing data
classifier = KNeighborsClassifier(n_neighbors=8)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


In [None]:
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[159   0]
 [  1  25]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       159
         1.0       1.00      0.96      0.98        26

    accuracy                           0.99       185
   macro avg       1.00      0.98      0.99       185
weighted avg       0.99      0.99      0.99       185

Accuracy: 0.9945945945945946


In [None]:
# Getting the test dataset provided

dataset_test = pd.read_csv("dataset/test.csv")
dataset_test.head()
dataset_test = clean_dataset(dataset_test, test_data=True)
y_pred = classifier.predict(dataset_test)

