In [91]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [92]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('train_loanprediction.csv')

# Review the DataFrame
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [93]:
# Identify missing values in each column
missing_values = df.isnull().sum()
missing_values

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [103]:
# Handle Missing Values (MEDIAN!!)
# Define imputers for categorical and numerical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='median')

# Apply imputers
df[['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']] = \
    categorical_imputer.fit_transform(df[['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']])

df[['LoanAmount', 'Loan_Amount_Term']] = \
    numerical_imputer.fit_transform(df[['LoanAmount', 'Loan_Amount_Term']])

# Apply imputers
df[['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']] = \
    categorical_imputer.fit_transform(df[['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']])

df[['LoanAmount', 'Loan_Amount_Term']] = \
    numerical_imputer.fit_transform(df[['LoanAmount', 'Loan_Amount_Term']])

# Verify that there are no more NaNs
print(df.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [104]:
# Split the features and target data
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [105]:
# Encode the features dataset's categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features DataFrame
X.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Loan_ID_LP001002,Loan_ID_LP001003,Loan_ID_LP001005,Loan_ID_LP001006,Loan_ID_LP001008,Loan_ID_LP001011,...,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Credit_History_0.0,Credit_History_1.0,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,146.412162,360.0,True,False,False,False,False,False,...,False,True,False,True,False,False,True,False,False,True
1,4583,1508.0,128.0,360.0,False,True,False,False,False,False,...,False,True,False,True,False,False,True,True,False,False
2,3000,0.0,66.0,360.0,False,False,True,False,False,False,...,False,True,False,False,True,False,True,False,False,True
3,2583,2358.0,120.0,360.0,False,False,False,True,False,False,...,False,False,True,True,False,False,True,False,False,True
4,6000,0.0,141.0,360.0,False,False,False,False,True,False,...,False,True,False,True,False,False,True,False,False,True


In [106]:
# Split data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [98]:
# Review the distinct values from y
y_train.value_counts()

Loan_Status
Y    301
N    159
Name: count, dtype: int64

In [99]:
# Find the percentage of null values in each column
X_train.isna().sum()/len(X_train)

ApplicantIncome            0.0
CoapplicantIncome          0.0
LoanAmount                 0.0
Loan_Amount_Term           0.0
Loan_ID_LP001002           0.0
                          ... 
Credit_History_0.0         0.0
Credit_History_1.0         0.0
Property_Area_Rural        0.0
Property_Area_Semiurban    0.0
Property_Area_Urban        0.0
Length: 635, dtype: float64

In [100]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [101]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

In [107]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [108]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

In [109]:
# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train_scaled, y_train)

In [110]:
# Count distinct values for the resampled target data
y_undersampled.value_counts()

Loan_Status
N    159
Y    159
Name: count, dtype: int64

In [111]:
# Instantiate a new RandomForestClassier model
model_undersampled = RandomForestClassifier()

# Fit the undersampled data the new model
model_undersampled.fit(X_undersampled, y_undersampled)

In [113]:
# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test_scaled)

In [116]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))

Classification Report - Original Data
              precision    recall  f1-score   support

           N       0.65      0.39      0.49        33
           Y       0.85      0.94      0.89       121

    accuracy                           0.82       154
   macro avg       0.75      0.67      0.69       154
weighted avg       0.81      0.82      0.81       154

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

           N       0.42      0.39      0.41        33
           Y       0.84      0.85      0.84       121

    accuracy                           0.75       154
   macro avg       0.63      0.62      0.63       154
weighted avg       0.75      0.75      0.75       154



In [117]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOversampler instance
ros = RandomOverSampler(random_state=1)

In [118]:
# Fit the training data to the `RandomOverSampler` model
X_oversampled, y_oversampled = ros.fit_resample(X_train_scaled, y_train)

In [119]:
# Count distinct values
y_oversampled.value_counts()

Loan_Status
Y    301
N    301
Name: count, dtype: int64

In [120]:
# Instantiate a new RandomForestClassier model
model_oversampled = RandomForestClassifier()

# Fit the oversampled data the new model
model_oversampled.fit(X_oversampled, y_oversampled)

In [121]:
# Predict labels for oversampled testing features
y_pred_oversampled = model_oversampled.predict(X_test_scaled)

In [122]:
#Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))
print("---------")
print(f"Classification Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled))

Classification Report - Original Data
              precision    recall  f1-score   support

           N       0.65      0.39      0.49        33
           Y       0.85      0.94      0.89       121

    accuracy                           0.82       154
   macro avg       0.75      0.67      0.69       154
weighted avg       0.81      0.82      0.81       154

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

           N       0.42      0.39      0.41        33
           Y       0.84      0.85      0.84       121

    accuracy                           0.75       154
   macro avg       0.63      0.62      0.63       154
weighted avg       0.75      0.75      0.75       154

---------
Classification Report - Oversampled Data
              precision    recall  f1-score   support

           N       0.59      0.39      0.47        33
           Y       0.85      0.93      0.89       121

    accuracy                           0.