In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the test split from disk, then show the per-column count of
# missing values as the cell output.
test_data = pd.read_csv(filepath_or_buffer='loan_test.csv')
test_data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [3]:
# Read the training split from disk, then show the per-column count of
# missing values as the cell output.
train_data = pd.read_csv(filepath_or_buffer='loan_train.csv')
train_data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [4]:
# Split each dataset into its target (y) and its feature frame (x).
# The feature frames omit both the target column and the Loan_ID column.

y_train = train_data['Loan_Status']
x_train = train_data.drop(['Loan_Status', 'Loan_ID'], axis=1)

y_test = test_data['Loan_Status']
x_test = test_data.drop(['Loan_Status', 'Loan_ID'], axis=1)

In [5]:
# Names of the feature columns that are encoded as categorical variables.
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

In [6]:
# Encode the categorical feature columns of the train and test frames as
# integer codes.
#
# For every column the encoder is fitted on the TRAINING data only and the
# learned mapping is then applied to both frames, so a given category always
# receives the same integer in train and test. Re-fitting on the test column
# would silently produce a different mapping whenever the two splits do not
# contain exactly the same set of categories. A category that appears only in
# the test split now raises a ValueError from `transform` instead of being
# mis-encoded.

label_encoder = LabelEncoder()

# Positional indices of the categorical columns; used when the
# ColumnTransformer is built.
categorical_column_indices = [
    x_train.columns.get_loc(column) for column in categorical_columns
]

for column in categorical_columns:
    label_encoder.fit(x_train[column])
    x_train[column] = label_encoder.transform(x_train[column])
    x_test[column] = label_encoder.transform(x_test[column])

In [7]:
# One-hot encode the categorical columns (selected by position); all other
# columns are passed through unchanged.
ct = ColumnTransformer(
    [
        ('encoder',
        OneHotEncoder(),
        categorical_column_indices)
    ],
    remainder='passthrough')

# Learn the one-hot layout from the training data only, then apply that same
# fitted transformer to the test data. Calling fit_transform on the test set
# would re-learn the categories from the test data, which can change the
# number and meaning of the output columns relative to what the model is
# trained on.
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

In [8]:
# Encode the target labels as integers. The mapping is learned from the
# training labels and reused for the test labels so that both use identical
# codes; re-fitting on y_test would assign different codes if the test split
# were missing one of the classes.
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [9]:
# Build a logistic-regression classifier with default settings and fit it on
# the encoded training data (fit returns the estimator itself). The trailing
# expression keeps the fitted model as the cell's displayed output.
logistic_regression = LogisticRegression().fit(x_train, y_train)
logistic_regression

In [10]:
# Predicted class label for every row of the encoded test set.
y_prediction = logistic_regression.predict(X=x_test)

In [11]:
# Accuracy = fraction of test rows whose predicted label equals the true
# label, i.e. the mean of the element-wise match mask.
accuracy = np.mean(y_prediction == y_test)
accuracy

0.9203539823008849

In [12]:
# Per-class probabilities for every test row.
probability = logistic_regression.predict_proba(x_test)

# Pair each loan id with the probability in column index 1 of the output
# (presumably the "approved" class -- confirm against label_encoder.classes_).
result = pd.DataFrame({'Loan_Id': test_data['Loan_ID']}).assign(
    **{'Approval Rate': probability[:, 1]}
)
result

Unnamed: 0,Loan_Id,Approval Rate
0,LP001008,0.645778
1,LP001011,0.744091
2,LP001013,0.698599
3,LP001024,0.861378
4,LP001028,0.742021
...,...,...
108,LP002958,0.591161
109,LP002974,0.798978
110,LP002978,0.749128
111,LP002979,0.921683
