In [33]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


file_path = "/Users/raghavsharma/desktop/loan_default_predication_kaggle.csv"

df = pd.read_csv(file_path)

In [34]:
#Names of the columns
print(df.columns)

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')


In [35]:
# Shape of the dataset
print(df.shape)

# First 5 rows
print(df.head(5))



(255347, 18)
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes           Yes       Other   
1      

In [36]:
# Data types and missing values
print(df.info())

# Summary statistics
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSigner     255347 non-null  object 
 17  Default   

In [37]:
print(df.isnull().sum)

<bound method NDFrame._add_numeric_operations.<locals>.sum of         LoanID    Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0        False  False   False       False        False           False   
1        False  False   False       False        False           False   
2        False  False   False       False        False           False   
3        False  False   False       False        False           False   
4        False  False   False       False        False           False   
...        ...    ...     ...         ...          ...             ...   
255342   False  False   False       False        False           False   
255343   False  False   False       False        False           False   
255344   False  False   False       False        False           False   
255345   False  False   False       False        False           False   
255346   False  False   False       False        False           False   

        NumCreditLines  InterestRate  LoanTerm  D

In [38]:
missing_percent = df.isnull().sum() / len(df) * 100
print(missing_percent)


LoanID            0.0
Age               0.0
Income            0.0
LoanAmount        0.0
CreditScore       0.0
MonthsEmployed    0.0
NumCreditLines    0.0
InterestRate      0.0
LoanTerm          0.0
DTIRatio          0.0
Education         0.0
EmploymentType    0.0
MaritalStatus     0.0
HasMortgage       0.0
HasDependents     0.0
LoanPurpose       0.0
HasCoSigner       0.0
Default           0.0
dtype: float64


In [39]:
#In the dataset, the column LoanID is a unique identifier for each loan application and does not carry any meaningful information that could help the model 
# predict whether a loan will default or not.

#The correct course of action is to drop the column before the processing of the data.

df.drop(columns=['LoanID'], inplace=True)

print(df.head(5))

   Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
0   56   85994       50587          520              80               4   
1   69   50432      124440          458              15               1   
2   46   84208      129188          451              26               3   
3   32   31713       44799          743               0               3   
4   60   20437        9139          633               8               4   

   InterestRate  LoanTerm  DTIRatio    Education EmploymentType MaritalStatus  \
0         15.23        36      0.44   Bachelor's      Full-time      Divorced   
1          4.81        60      0.68     Master's      Full-time       Married   
2         21.17        24      0.31     Master's     Unemployed      Divorced   
3          7.07        24      0.23  High School      Full-time       Married   
4          6.51        48      0.73   Bachelor's     Unemployed      Divorced   

  HasMortgage HasDependents LoanPurpose HasCoSigner  Default  

In [41]:
#A few of the columns (Education, EmploymentType, MaritalStatus, HasMortgage, HasDependents, LoanPurpose, HasCosigner) have categorical values. 
# Before we proceed, we need to convert them into numerical values.

label_encoder = LabelEncoder()

# Apply Label Encoding to the 'purpose' column since it is the only categorical value column
df['EmploymentType_encoded'] = label_encoder.fit_transform(df['EmploymentType'])
df['MarritalStatus_encoded'] = label_encoder.fit_transform(df['MaritalStatus'])
df['Education_encoded'] = label_encoder.fit_transform(df['Education'])
df['HasMortgage_encoded'] = label_encoder.fit_transform(df['HasMortgage'])
df['HasDependents_encoded'] = label_encoder.fit_transform(df['HasDependents'])
df['LoanPurpose_encoded'] = label_encoder.fit_transform(df['LoanPurpose'])
df['HasCoSigner_encoded'] = label_encoder.fit_transform(df['HasCoSigner'])

print(df.columns)

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner', 'Default', 'EmploymentType_encoded',
       'MarritalStatus_encoded', 'Education_encoded', 'HasMortgage_encoded',
       'HasDependents_encoded', 'LoanPurpose_encoded', 'HasCoSigner_encoded'],
      dtype='object')


In [42]:
print(df.head(4))

   Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
0   56   85994       50587          520              80               4   
1   69   50432      124440          458              15               1   
2   46   84208      129188          451              26               3   
3   32   31713       44799          743               0               3   

   InterestRate  LoanTerm  DTIRatio    Education  ... LoanPurpose HasCoSigner  \
0         15.23        36      0.44   Bachelor's  ...       Other         Yes   
1          4.81        60      0.68     Master's  ...       Other         Yes   
2         21.17        24      0.31     Master's  ...        Auto          No   
3          7.07        24      0.23  High School  ...    Business          No   

  Default EmploymentType_encoded MarritalStatus_encoded Education_encoded  \
0       0                      0                      0                 0   
1       0                      0                      1         

In [44]:
#Now, we drop the columns which have categorical values only since the we have encoded them into numerical values hence creating new columns with _encoded name

columns_to_drop = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Drop the specified columns
df.drop(columns=columns_to_drop, inplace=True)

# Verify that the columns have been dropped
print("Remaining columns:", df.columns)

Remaining columns: Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Default',
       'EmploymentType_encoded', 'MarritalStatus_encoded', 'Education_encoded',
       'HasMortgage_encoded', 'HasDependents_encoded', 'LoanPurpose_encoded',
       'HasCoSigner_encoded'],
      dtype='object')
