In [19]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('../../datasets/loan_data.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [3]:
df.drop(columns="Loan_ID", inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             381 non-null    object 
 1   Married            381 non-null    object 
 2   Dependents         381 non-null    object 
 3   Education          381 non-null    object 
 4   Self_Employed      381 non-null    object 
 5   ApplicantIncome    381 non-null    float64
 6   CoapplicantIncome  381 non-null    float64
 7   LoanAmount         381 non-null    float64
 8   Loan_Amount_Term   381 non-null    float64
 9   Credit_History     381 non-null    float64
 10  Property_Area      381 non-null    object 
 11  Loan_Status        381 non-null    object 
dtypes: float64(5), object(7)
memory usage: 35.8+ KB


# Missing Values

In [5]:
columns_num = list(df.select_dtypes(exclude="object").columns)
columns_cat = list(df.select_dtypes(include="object").columns)

In [6]:
from sklearn.impute import SimpleImputer
imputer_num = SimpleImputer(strategy="mean")
imputer_num.fit(df[columns_num])
df[columns_num] = imputer_num.transform(df[columns_num])

imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_cat.fit(df[columns_cat])
df[columns_cat]=imputer_cat.transform(df[columns_cat])

In [8]:
col = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

X = df[col]
y = df['Loan_Status'].map({'Y':1, 'N':0})

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [11]:
X_train.shape ,X_test.shape,y_train.shape,y_test.shape

((285, 11), (96, 11), (285,), (96,))

In [12]:
columns_cat.remove('Loan_Status')

In [13]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

scaler = MinMaxScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_num = scaler.fit_transform(X_train[columns_num])
X_train_cat = encoder.fit_transform(X_train[columns_cat])

In [14]:
X_test_num = scaler.fit_transform(X_test[columns_num])
X_test_cat = encoder.fit_transform(X_test[columns_cat])

In [15]:
columns_enc = encoder.get_feature_names_out(columns_cat)

X_train_final = pd.concat(
    [
        pd.DataFrame(X_train_num, columns=columns_num),
        pd.DataFrame(X_train_cat, columns=columns_enc)
    ],
    axis=1
)

In [16]:
X_test_final = pd.concat(
    [
        pd.DataFrame(X_test_num, columns=columns_num),
        pd.DataFrame(X_test_cat, columns=columns_enc)
    ],
    axis=1
)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [20]:
models = {'Log' : LogisticRegression(),
          'KNN3' : KNeighborsClassifier(n_neighbors=3),
          'KNN5' : KNeighborsClassifier(n_neighbors=5),
          'KNN7' : KNeighborsClassifier(n_neighbors=7),
          'SVMp2' : SVC(kernel='poly', degree=2, probability=True),
          'SVMp3' : SVC(kernel='poly', degree=3, probability=True),
          'DT' : DecisionTreeClassifier(max_depth = 2, max_features = 5, min_samples_leaf = 3,min_samples_split = 2),
          'RF' : RandomForestClassifier(max_depth = 5, min_samples_leaf = 1, n_estimators = 10, n_jobs=-1),
          'AdaB' : AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth = 2),n_estimators=10,random_state=42),}

for name, model in models.items():
    model.fit(X_train_final, y_train)

    train_acc = model.score(X_train_final, y_train)
    test_acc  = model.score(X_test_final, y_test)

    print(f"{name} | Train: {train_acc:.3f} | Test: {test_acc:.3f}")

Log | Train: 0.853 | Test: 0.823
KNN3 | Train: 0.856 | Test: 0.760
KNN5 | Train: 0.853 | Test: 0.698
KNN7 | Train: 0.807 | Test: 0.656
SVMp2 | Train: 0.853 | Test: 0.823
SVMp3 | Train: 0.877 | Test: 0.812
DT | Train: 0.853 | Test: 0.823
RF | Train: 0.888 | Test: 0.812
AdaB | Train: 0.870 | Test: 0.812


In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
params = {'max_depth':range(2,6), 'min_samples_leaf':range(1,6), 'max_features':range(2,8), 'min_samples_split':range(1,6)}
grid_cv = GridSearchCV(DecisionTreeClassifier(), param_grid=params, cv=5, verbose=1)
grid_cv.fit(X_train_final,y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


In [33]:
grid_cv.best_params_

{'max_depth': 2,
 'max_features': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [34]:
params1 = {'max_depth':range(2,10), 'min_samples_leaf':range(1,10), 'n_estimators':range(10,340,50)}
grid_cv1 = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=params1, cv=5, verbose=1, n_jobs=-1)
grid_cv1.fit(X_train_final,y_train)

Fitting 5 folds for each of 504 candidates, totalling 2520 fits


In [35]:
grid_cv1.best_params_

{'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 10}

In [46]:
#print("Train NaN count:\n", X_train_final.isna().sum())
#print("Test NaN count:\n", X_test_final.isna().sum())

In [21]:
#X_test_final = X_test_final.fillna(0)

In [25]:
X_num = scaler.transform(X[columns_num])
X_cat = encoder.transform(X[columns_cat])

In [26]:
columns_enc = encoder.get_feature_names_out(columns_cat)

X_final = pd.concat(
    [
        pd.DataFrame(X_num, columns=columns_num),
        pd.DataFrame(X_cat, columns=columns_enc)
    ],
    axis=1
)

In [28]:
final_model = LogisticRegression()
final_model.fit(X_final,y)

In [30]:
def preprocess_input(test_df):
    test_num = scaler.transform(test_df[columns_num])
    test_cat = encoder.transform(test_df[columns_cat])

    test_final = pd.concat(
        [
            pd.DataFrame(test_num, columns=columns_num),
            pd.DataFrame(test_cat, columns=columns_enc)
        ],
        axis=1
    )

    return test_final

In [31]:
single_input = {
    "Gender": "Male",
    "Married": "Yes",
    "Dependents": "1",
    "Education": "Graduate",
    "Self_Employed": "No",
    "ApplicantIncome": 5000,
    "CoapplicantIncome": 2000,
    "LoanAmount": 150,
    "Loan_Amount_Term": 360,
    "Credit_History": 1,
    "Property_Area": "Urban"
}
single_df = pd.DataFrame([single_input])

In [33]:
test_ready = preprocess_input(single_df)
prediction = final_model.predict(test_ready)

print('Prediction','Loan Approved' if prediction[0]== 1 else 'Loan Rejected')

Prediction Loan Approved


In [34]:
import pickle
pickle.dump(final_model,open('final_model.pkl','wb'))
pickle.dump(scaler,open('scaler.pkl','wb'))
pickle.dump(encoder,open('encoder.pkl','wb'))