In [1]:
import pandas as pd

# Importing

In [2]:
data = pd.read_csv("WorkshopData.csv")

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,4000-<=6000,,360.0,1.0,Urban,1.0
1,LP001003,Male,Yes,1.0,Graduate,No,4000-<=6000,128.0,360.0,1.0,Rural,0.0
2,LP001005,Male,Yes,0.0,Graduate,Yes,2000-<=3000,66.0,360.0,1.0,Urban,1.0
3,LP001006,Male,Yes,0.0,Not Graduate,No,2000-<=3000,120.0,360.0,1.0,Urban,1.0
4,LP001008,Male,No,0.0,Graduate,No,4000-<=6000,141.0,360.0,1.0,Urban,1.0


# Cleaning

In [4]:
data.isna().sum()

Loan_ID              0
Gender              13
Married              3
Dependents          15
Education            0
Self_Employed       32
ApplicantIncome      0
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
Property_Area        0
Loan_Status          0
dtype: int64

In [5]:
data.dropna(inplace=True)
# data.dropna(subset=['Gender', 'Dependents', 'Loan_Amount_Term'], inplace=True)
# inplace=True means to change the "data" variable rather than generating a new dataframe

# Ordinal Encoding

In [6]:
# normally, place all import statements at the top of the file
from sklearn.preprocessing import OrdinalEncoder

In [7]:
data['ApplicantIncome'].unique()

array(['4000-<=6000', '2000-<=3000', '3000-<=4000', '10000-<=20000',
       '1000-<=2000', '6000-<=8000', '8000-<=10000', '0-<=1000',
       '20000-<=100000'], dtype=object)

In [8]:
income_sort = ['0-<=1000','1000-<=2000', '2000-<=3000', '3000-<=4000','4000-<=6000','6000-<=8000','8000-<=10000','10000-<=20000','20000-<=100000']
income_enc = OrdinalEncoder(categories=[income_sort])

In [9]:
data['ApplicantIncome'] = income_enc.fit_transform(data['ApplicantIncome'].values.reshape(-1, 1))
# "values" converts the column from a pd.Series object to numpy.ndarray object
# "reshape(-1, 1)" converts the dimension of the object from (length,) to (length, 1); in other words, [a,b,c,...] --> [[a,b,c,...]]

In [10]:
data['ApplicantIncome'].value_counts()

3.0    122
4.0    111
2.0    106
5.0     42
7.0     35
6.0     29
1.0     26
8.0      6
0.0      3
Name: ApplicantIncome, dtype: int64

In [13]:
def encode_binary_variable(df, col, val_pos, val_neg):
    df[col] = df[col].replace({val_pos:1, val_neg:0})

In [14]:
encode_binary_variable(data, "Married", "Yes", "No")
encode_binary_variable(data, "Education", "Graduate", "Not Graduate")
encode_binary_variable(data, "Self_Employed", "Yes", "No")
encode_binary_variable(data, "Gender", "Female", "Male")

# One-Hot Encoding

In [11]:
dummies = pd.get_dummies(data['Property_Area'])
dummies.head()

Unnamed: 0,Rural,Semiurban,Urban
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
5,0,0,1


In [12]:
data = pd.concat([data, dummies], axis=1)
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rural,Semiurban,Urban
1,LP001003,Male,Yes,1.0,Graduate,No,4.0,128.0,360.0,1.0,Rural,0.0,1,0,0
2,LP001005,Male,Yes,0.0,Graduate,Yes,2.0,66.0,360.0,1.0,Urban,1.0,0,0,1
3,LP001006,Male,Yes,0.0,Not Graduate,No,2.0,120.0,360.0,1.0,Urban,1.0,0,0,1
4,LP001008,Male,No,0.0,Graduate,No,4.0,141.0,360.0,1.0,Urban,1.0,0,0,1
5,LP001011,Male,Yes,2.0,Graduate,Yes,4.0,267.0,360.0,1.0,Urban,1.0,0,0,1


In [15]:
data.drop(labels=["Property_Area"], axis=1, inplace=True)

In [16]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Rural,Semiurban,Urban
1,LP001003,0,1,1.0,1,0,4.0,128.0,360.0,1.0,0.0,1,0,0
2,LP001005,0,1,0.0,1,1,2.0,66.0,360.0,1.0,1.0,0,0,1
3,LP001006,0,1,0.0,0,0,2.0,120.0,360.0,1.0,1.0,0,0,1
4,LP001008,0,0,0.0,1,0,4.0,141.0,360.0,1.0,1.0,0,0,1
5,LP001011,0,1,2.0,1,1,4.0,267.0,360.0,1.0,1.0,0,0,1


# Creating the Training, Validation, and Testing Data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = data.drop(labels=["Loan_ID", "Loan_Status"], axis=1)
y = data["Loan_Status"]

In [19]:
# create testing dataset
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=0)

# Logistic Regression Model

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
logreg = LogisticRegression(random_state=1, max_iter=1000)

In [25]:
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=1)

In [26]:
y_pred = logreg.predict(X_val)

# Model Performance

In [27]:
data['Loan_Status'].value_counts()
print(str(data['Loan_Status'].value_counts()[1]/(data['Loan_Status'].value_counts()[1]+data['Loan_Status'].value_counts()[0])*100)+"% of entries correspond to accepted loans")

69.16666666666667% of entries correspond to accepted loans


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [29]:
print("Accuracy: ", accuracy_score(y_val, y_pred))

Accuracy:  0.8571428571428571


In [30]:
print("Precision for Accepted Applications: ", precision_score(y_val, y_pred))
print("Recall for Accepted Applications: ", recall_score(y_val, y_pred))

Precision for Accepted Applications:  0.8461538461538461
Recall for Accepted Applications:  0.9821428571428571


In [31]:
print("Precision for Rejected Applications: ", precision_score(y_val, y_pred, pos_label=0))
print("Recall for Rejected Applications: ", recall_score(y_val, y_pred, pos_label=0))

Precision for Rejected Applications:  0.9166666666666666
Recall for Rejected Applications:  0.5238095238095238


# Utilizing SMOTE

In [32]:
from imblearn.over_sampling import SMOTE

In [33]:
smote = SMOTE(sampling_strategy='minority')
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [34]:
y_train_res.value_counts()

1.0    215
0.0    215
Name: Loan_Status, dtype: int64

In [35]:
logreg_sm = LogisticRegression(random_state=0, max_iter=1000)
logreg_sm.fit(X_train_res, y_train_res)
y_pred_sm = logreg_sm.predict(X_val)

In [36]:
print("Accuracy: ", accuracy_score(y_val, y_pred_sm))

Accuracy:  0.8571428571428571


In [37]:
print("Precision for Accepted Applications: ", precision_score(y_val, y_pred_sm))
print("Recall for Accepted Applications: ", recall_score(y_val, y_pred_sm))

Precision for Accepted Applications:  0.8571428571428571
Recall for Accepted Applications:  0.9642857142857143


In [38]:
print("Precision for Rejected Applications: ", precision_score(y_val, y_pred_sm, pos_label=0))
print("Recall for Rejected Applications: ", recall_score(y_val, y_pred_sm, pos_label=0))

Precision for Rejected Applications:  0.8571428571428571
Recall for Rejected Applications:  0.5714285714285714


# Measuring the Tradeoff

In [39]:
from sklearn.metrics import f1_score

In [40]:
print("F1 Score for the Baseline Model (Accepted): ", f1_score(y_val, y_pred))
print("F1 Score for the SMOTE Model (Accepted): ", f1_score(y_val, y_pred_sm))

F1 Score for the Baseline Model (Accepted):  0.9090909090909091
F1 Score for the SMOTE Model (Accepted):  0.9075630252100839


In [41]:
print("F1 Score for the Baseline Model (Rejected): ", f1_score(y_val, y_pred, pos_label=0))
print("F1 Score for the SMOTE Model (Rejected): ", f1_score(y_val, y_pred_sm, pos_label=0))

F1 Score for the Baseline Model (Rejected):  0.6666666666666667
F1 Score for the SMOTE Model (Rejected):  0.6857142857142857
