In [None]:
import pandas as pd
import re

# Load the loan applications; the relative path is resolved against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Loan_Application.csv")

def header_text(string , x = 75):
    """Return ``string`` shifted right by ``x`` spaces.

    Used to roughly centre section headings above wide DataFrame output.

    Parameters
    ----------
    string : str
        The heading text.
    x : int, default 75
        Number of leading spaces; zero or a negative value adds none.

    Returns
    -------
    str
        ``x`` spaces followed by ``string``.
    """
    return (" " * x) + string
                                                                      

                                                                        # Reviewing our data

# First look at the data: the leading rows (pandas' default count) and the last ten.
display(df.head())
display(df.tail(10))

# Horizontal rule, padded by blank lines, so the preview and the missing-value
# report do not run together in the cell output.
print()
print("-" * 168)
print()

# Every row that has at least one missing value. The heading goes through
# header_text (default indent of 75 spaces) like the other headings in this notebook.
print(header_text("All 'NaN' Values"))
display(df[df.isna().any(axis=1)])



                                                                        # Data Cleaning 

# Missing credit history is imputed with 0.
df["Credit_History"] = df["Credit_History"].fillna(0)

# Dependents is a text column (the validation and clean-up further down deal with the
# value "3+"), so its gaps are filled with the string "0" rather than the integer 0.
# This keeps the column from mixing str and int values.
df["Dependents"] = df["Dependents"].fillna("0")

# Fill the gaps in the loan amount and loan term with their respective column means.
# Reassigning (instead of inplace=True) keeps the lineage of `df` explicit.
df = df.fillna({
    "LoanAmount": df["LoanAmount"].mean(),
    "Loan_Amount_Term": df["Loan_Amount_Term"].mean(),
})

# Re-check: show (at most 20 of) the rows that still contain a NaN. Self_Employed and
# Gender have not been treated yet, so rows may still be listed here.
print(header_text("Check NaN Values"))
display(df[df.isna().any(axis=1)].head(20))

# Self_Employed: a missing answer is assumed to mean "No".
df["Self_Employed"] = df["Self_Employed"].fillna("No")
print()
print(header_text("After filling with 'No'"))
display(df.head(10))


# Gender is a defining characteristic that we do not want skewed by an imputed value,
# so rows without it are dropped rather than filled. Count the affected rows first.
print(header_text("Counts for Missing Gender" , 10))
display(df["Gender"].isna().value_counts())  # True = Gender is missing (the original author observed 11 such rows)

# `subset` is given as a list because older pandas releases reject a bare string here.
# Note that dropna keeps the surviving rows' original index labels, so the index has gaps afterwards.
df = df.dropna(subset=["Gender"])

# Final check that all NaN values have been handled: if only the column headers are
# displayed, no row contains a missing value any more.
print(header_text("All Rows are ready for Analysis" , 68))
display(df[df.isna().any(axis=1)])

In [None]:
                                                                            # Clean/Adjust Data

# Each entry is [column, pattern]. A value is valid only when its text form matches the
# pattern in full (Series.str.fullmatch), so no ^/$ anchors are needed and trailing junk
# such as "128abc" or "LP001015xyz" is rejected.
checks = [
    ["Loan_ID", r"LP\d{6}"],
    ["Gender", r"(Male|Female)"],
    ["Married", r"(Yes|No)"],
    ["Dependents", r"\d+"],  # Flags "3+" on purpose: the column should be parseable as an integer
    ["Education", r"(Graduate|Not Graduate)"],
    ["Self_Employed", r"(Yes|No)"],
    ["ApplicantIncome", r"\d+"],
    ["CoapplicantIncome", r"\d+"],
    ["LoanAmount", r"\d+(\.\d+)?"],        # non-negative number, optional decimal part (mean-imputed values are fractional)
    ["Loan_Amount_Term", r"\d+(\.\d+)?"],  # same shape as LoanAmount
    ["Credit_History", r"[01]\.0"],        # exactly "0.0" or "1.0"; the dot is escaped so "0x0" no longer passes
    ["Property_Area", r"(Urban|Semiurban|Rural)"]
]

print("Total Rows:", len(df))
print()
print("---- Column Validation Results ----\n")

for i, (col, pattern) in enumerate(checks, start=1):
    # Compare on the text form so numeric and text columns go through the same check.
    # A missing value becomes the string "nan" and is therefore counted as invalid.
    as_text = df[col].astype(str)
    invalid_mask = ~as_text.str.fullmatch(pattern, na=False)
    invalid_count = invalid_mask.sum()

    print(f"{i}. {col}: {invalid_count} invalid rows")

# Remove the "+" from the open-ended "3+" bucket so every value can be parsed as an integer
# (the values stay strings). Before doing so, remember which rows were in that bucket: only
# those applicants may actually have more than three dependents.
dependents_text = df["Dependents"].astype(str)
was_open_ended = dependents_text.str.contains("+", regex=False).to_numpy()
df["Dependents"] = dependents_text.str.replace("+", "", regex=False)

# Synthetic variety for the former "3+" rows only: those whose index label leaves remainder 3
# when divided by 4 become "4", and those leaving remainder 14 when divided by 15 become "5"
# (the second rule wins where both apply). Rows that reported 0, 1 or 2 dependents are real
# values and are left untouched. Re-running this cell is harmless: once the "+" is gone no
# row is selected any more.
# In the real world we should have concrete data instead of inventing these numbers.
df.loc[was_open_ended & (df.index % 4 == 3), "Dependents"] = "4"

df.loc[was_open_ended & (df.index % 15 == 14), "Dependents"] = "5"

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
                                                                        # Data Analysis + Visualizations

# Basic descriptive statistics of the data set.
print(header_text("General Information of the Data\n" , 30))
display(df.describe())

# Mean / median / max / min of both income columns, split by gender.
print(header_text("Group by Gender on Applicant and Coapplicant's Income" , 10))
income_columns = ["ApplicantIncome" , "CoapplicantIncome"]
display(df.groupby("Gender")[income_columns].agg(["mean", "median", "max", "min"]))

print()


# Head count per gender and each gender's share of all Male + Female applications.
males = len(df[df["Gender"] == "Male"])
females = len(df[df["Gender"] == "Female"])
total_applicants = males + females

for label, head_count in (("Male", males), ("Female", females)):
    share = round((head_count / total_applicants * 100), 2)
    print(f"{label} Applicants\n\tTotal: {head_count}\n\tShare of Applications: {share}%")


print()


# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(8, 6))
numeric_corr = df.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr, annot=True, cmap="Spectral", linewidths=0, linecolor='white', ax=ax)
ax.grid(False)
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()


# Histogram of LoanAmount in 30 bins, with an x tick every 50 units starting at the minimum.
fig, ax = plt.subplots(figsize=(8, 5))
df["LoanAmount"].hist(bins=30, ax=ax)

ax.set_title("Distribution of Loan Amount")
ax.set_xlabel("Loan Amount ($)")
ax.set_ylabel("Number of Loans")

min_x = int(df["LoanAmount"].min())
max_x = int(df["LoanAmount"].max())
ax.set_xticks(np.arange(min_x, max_x + 50, 50))

ax.grid(False)  # Series.hist switches the grid on by default
fig.tight_layout()
plt.show()


# Mean loan amount per property area, sorted ascending so the bars grow left to right.
mean_loans = df.groupby("Property_Area")["LoanAmount"].mean().sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
# NOTE(review): recent seaborn releases may warn when `palette` is given without `hue` —
# confirm against the installed version before changing this call.
sns.barplot(x=mean_loans.index, y=mean_loans.values, palette="coolwarm", ax=ax)

ax.set_title("Average Loan Amount by Property Area")
ax.set_xlabel("Property Area")
ax.set_ylabel("Average Loan Amount ($)")

# Zoom into the 130-140 range so the small differences between the areas are visible.
# That window only works while every mean lies strictly inside it; if the data changes and
# a mean reaches or leaves the window, a bar would be clipped or disappear. In that case
# the limits are derived from the data instead (never going below zero).
y_low, y_high = 130, 140
lowest, highest = mean_loans.min(), mean_loans.max()
if lowest <= y_low or highest >= y_high:  # both comparisons are False when there is no data (NaN)
    pad = max((highest - lowest) * 0.25, 1.0)
    y_low, y_high = max(0.0, lowest - pad), highest + pad
ax.set_ylim(y_low, y_high)

ax.grid(False)
fig.tight_layout()
plt.show()