In [1]:
# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
# Load the datasets into DataFrames
train_df = pd.read_csv(r"D:\Rahulkumar Shiyani\WLU\Data Analysis\Project\Risk Scoring for Loan Approval Prediction\Datasets\train.csv")
test_df = pd.read_csv(r"D:\Rahulkumar Shiyani\WLU\Data Analysis\Project\Risk Scoring for Loan Approval Prediction\Datasets\test.csv")

# Show the first few rows of the training data
print("Training Dataset (First 5 Rows):")
train_df.head()

Training Dataset (First 5 Rows):


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
test_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [4]:
# Get dataset info (e.g., column names, non-null values, data types)
print("Dataset Information:")
train_df.info()

# List all the column names in the dataset
print("\nColumn Names:")
print(train_df.columns)

# Check for missing values
missing_values = train_df.isnull().sum()
print("\nMissing Values (Columns with Missing Data):")
print(missing_values[missing_values > 0])

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB

Column Names:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Se

In [5]:
# Impute missing values in numerical columns with the mean value
numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
train_df[numerical_cols] = train_df[numerical_cols].fillna(train_df[numerical_cols].mean())

# Impute missing values for categorical columns with the mode (most frequent value)
categorical_cols = train_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

# Verify that missing values are handled
print("Missing Values After Imputation:\n")
print(train_df.isnull().sum())

Missing Values After Imputation:

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [6]:
# Find unique values for each categorical column
unique_values = {col: train_df[col].unique().tolist() for col in categorical_cols}

# Display unique values for each column
for col, values in unique_values.items():
    print(f"Unique values in '{col}': {values}")

Unique values in 'Loan_ID': ['LP001002', 'LP001003', 'LP001005', 'LP001006', 'LP001008', 'LP001011', 'LP001013', 'LP001014', 'LP001018', 'LP001020', 'LP001024', 'LP001027', 'LP001028', 'LP001029', 'LP001030', 'LP001032', 'LP001034', 'LP001036', 'LP001038', 'LP001041', 'LP001043', 'LP001046', 'LP001047', 'LP001050', 'LP001052', 'LP001066', 'LP001068', 'LP001073', 'LP001086', 'LP001087', 'LP001091', 'LP001095', 'LP001097', 'LP001098', 'LP001100', 'LP001106', 'LP001109', 'LP001112', 'LP001114', 'LP001116', 'LP001119', 'LP001120', 'LP001123', 'LP001131', 'LP001136', 'LP001137', 'LP001138', 'LP001144', 'LP001146', 'LP001151', 'LP001155', 'LP001157', 'LP001164', 'LP001179', 'LP001186', 'LP001194', 'LP001195', 'LP001197', 'LP001198', 'LP001199', 'LP001205', 'LP001206', 'LP001207', 'LP001213', 'LP001222', 'LP001225', 'LP001228', 'LP001233', 'LP001238', 'LP001241', 'LP001243', 'LP001245', 'LP001248', 'LP001250', 'LP001253', 'LP001255', 'LP001256', 'LP001259', 'LP001263', 'LP001264', 'LP001265',

In [7]:
# Exclude identifier column(s) like Loan_ID
train_df_for_corr = train_df.drop(columns=['Loan_ID'])

# Calculate the correlation matrix
correlation = train_df_for_corr.corr()

# Plotting the heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix (All Columns Converted to Numeric)')
plt.show()

ValueError: could not convert string to float: 'Male'