In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load dataset from the specified path
path = "infosys spring board internship/kaggle/input/dataset-loan_default/Loan_Default.csv"
df = pd.read_csv(path)

# Drop irrelevant columns
drop_cols = [
    'ID', 'year', 'Region', 'loan_limit', 
    'co-applicant_credit_type', 'submission_of_application',
    'construction_type', 'total_units'
]
df = df.drop(columns=drop_cols, errors='ignore')

# Define important features
important_features = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
    'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1',
    'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'interest_only', 'lump_sum_payment',
    'occupancy_type', 'Security_Type', 'Status'
]
df = df[important_features]

# Handle missing numeric values
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Handle missing categorical values
cat_cols = df.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Encode categorical columns using one-hot encoding
df_encoded = pd.get_dummies(df, drop_first=True)

# Save cleaned dataset
output_path = "infosys spring board internship/kaggle/input/dataset-loan_default/Loan_Default_Cleaned.csv"
df_encoded.to_csv(output_path, index=False)

output_path


'infosys spring board internship/kaggle/input/dataset-loan_default/Loan_Default_Cleaned.csv'

In [2]:
loan=pd.read_csv('datasets/evaluated_dataset.csv')

In [3]:
loan.isnull().sum()


loan_type                     0
loan_purpose                134
Credit_Worthiness             0
open_credit                   0
business_or_commercial        0
loan_amount                   0
rate_of_interest          36438
Interest_rate_spread      36638
Upfront_charges           39641
term                         41
interest_only                 0
lump_sum_payment              0
property_value            15097
occupancy_type                0
income                     9150
Credit_Score                  0
age                         200
LTV                       15097
Security_Type                 0
Status                        0
dtir1                     24120
dtype: int64

In [5]:
loan.columns



Index(['loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'interest_only',
       'lump_sum_payment', 'property_value', 'occupancy_type', 'income',
       'Credit_Score', 'age', 'LTV', 'Security_Type', 'Status', 'dtir1'],
      dtype='object')

In [8]:
import pandas as pd

loan = pd.read_csv('datasets/evaluated_dataset.csv')

num_cols = [
    'open_credit', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income', 
    'Credit_Score', 'age', 'LTV', 'dtir1'
]

# Step 1: Convert to numeric (force non-numeric to NaN)
for col in num_cols:
    loan[col] = pd.to_numeric(loan[col], errors='coerce')

# Step 2: Fill NaN with median
for col in num_cols:
    loan[col].fillna(loan[col].median(), inplace=True)

# Step 3: Fill categorical columns with mode
cat_cols = [c for c in loan.columns if c not in num_cols]
for col in cat_cols:
    loan[col].fillna(loan[col].mode()[0], inplace=True)

# Step 4: Check remaining nulls
print("Remaining nulls:\n", loan.isnull().sum().sort_values(ascending=False))

# Step 5: Save cleaned dataset
loan.to_csv('datasets/cleaned_dataset.csv', index=False)
print("\n✅ Cleaned dataset saved as 'datasets/cleaned_dataset.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan[col].fillna(loan[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan[col].fillna(loan[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

Remaining nulls:
 open_credit               148669
age                       148669
loan_type                      0
Credit_Worthiness              0
loan_purpose                   0
business_or_commercial         0
loan_amount                    0
Interest_rate_spread           0
rate_of_interest               0
term                           0
interest_only                  0
lump_sum_payment               0
Upfront_charges                0
property_value                 0
occupancy_type                 0
income                         0
Credit_Score                   0
LTV                            0
Security_Type                  0
Status                         0
dtir1                          0
dtype: int64

✅ Cleaned dataset saved as 'datasets/cleaned_dataset.csv'


In [9]:
loan = pd.read_csv('datasets/cleaned_dataset.csv')

In [10]:

# Step 2: Separate categorical & numeric columns
num_cols = [
    'open_credit', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'age', 'LTV', 'dtir1'
]
cat_cols = [c for c in loan.columns if c not in num_cols + ['Status']]



In [11]:

# Step 3: One-hot encode categorical columns
loan_encoded = pd.get_dummies(loan, columns=cat_cols, drop_first=True)


In [12]:
# Step 4: Save one-hot encoded dataset
loan_encoded.to_csv('datasets/encoded_dataset.csv', index=False)
print("✅ One-hot encoded dataset saved as 'datasets/encoded_dataset.csv'")


✅ One-hot encoded dataset saved as 'datasets/encoded_dataset.csv'
