<a href="https://colab.research.google.com/github/shubhammane7777/Comparing-Algorithms-for-Credit-Risk-Prediction-/blob/main/Comparison_of_algorithms_of_credit_risk_for_prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to this Colab runtime so files stored in Drive can be
# read through the local filesystem path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the loan CSV from the mounted Drive folder into `df`.
# low_memory=False is forwarded to pandas unchanged (see the read_csv docs:
# it relates to chunked parsing and dtype inference).
file_path = '/content/drive/MyDrive/lending_club_small.csv'
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Report the (rows, columns) dimensions, then preview the first five rows.
print("Shape of data:", df.shape)
df.head(n=5)


Shape of data: (40000, 144)


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,7000.0,7000.0,7000.0,36 months,13.35,237.04,C,C2,9 years,RENT,...,,,Cash,N,,,,,,
1,6000.0,6000.0,6000.0,36 months,15.31,208.91,C,C2,,OWN,...,,,Cash,N,,,,,,
2,14400.0,14400.0,14400.0,36 months,16.99,513.33,D,D1,1 year,RENT,...,,,Cash,N,,,,,,
3,12775.0,12775.0,12775.0,60 months,14.33,299.45,C,C2,2 years,OWN,...,,,Cash,N,,,,,,
4,9100.0,9100.0,9100.0,36 months,12.12,302.78,B,B3,3 years,RENT,...,,,Cash,N,,,,,,


In [None]:
# Prune columns in two passes. Each pass rebinds `df` to a new frame instead of
# mutating it with inplace=True, so any other reference to the loaded frame is
# left untouched:
#   1. drop every column whose share of missing values exceeds the threshold;
#   2. drop identifier / free-text columns that are not used as features.

MAX_NULL_PERCENT = 50  # a column with a higher percentage of NaN is discarded

# Percentage of missing values per column (mean of the boolean null mask * 100).
null_percent = df.isnull().mean() * 100
cols_to_drop = null_percent[null_percent > MAX_NULL_PERCENT].index
df = df.drop(columns=cols_to_drop)

print(f"Dropped columns with >{MAX_NULL_PERCENT}% missing values:", list(cols_to_drop))

# errors='ignore' skips any listed name that is not present in the frame,
# so this works whether or not an earlier step already removed it.
irrelevant = ['id', 'member_id', 'url', 'desc', 'emp_title', 'title', 'zip_code']
df = df.drop(columns=irrelevant, errors='ignore')

# NOTE(review): fields such as `debt_settlement_flag` and `hardship_flag` are
# not in the `irrelevant` list; presumably they are recorded after the loan is
# issued and could leak the outcome into the features -- confirm before modelling.

print("Shape of data:", df.shape)

Dropped columns with >50% missing values: ['mths_since_last_record', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardshi

In [None]:
# Re-inspect the frame: dimensions first, then a five-row preview.
print("Shape of data:", df.shape)
df.head(n=5)

Shape of data: (40000, 89)


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
0,7000.0,7000.0,7000.0,36 months,13.35,237.04,C,C2,9 years,RENT,...,33.3,0.0,0.0,78425.0,57438.0,21100.0,41625.0,N,Cash,N
1,6000.0,6000.0,6000.0,36 months,15.31,208.91,C,C2,,OWN,...,63.6,0.0,0.0,17700.0,14709.0,16900.0,0.0,N,Cash,N
2,14400.0,14400.0,14400.0,36 months,16.99,513.33,D,D1,1 year,RENT,...,0.0,0.0,0.0,54012.0,45503.0,2300.0,44912.0,N,Cash,N
3,12775.0,12775.0,12775.0,60 months,14.33,299.45,C,C2,2 years,OWN,...,50.0,0.0,0.0,173310.0,19845.0,20800.0,11271.0,N,Cash,N
4,9100.0,9100.0,9100.0,36 months,12.12,302.78,B,B3,3 years,RENT,...,0.0,0.0,0.0,83168.0,53257.0,15800.0,58668.0,N,Cash,N


In [None]:
# Build the binary target: keep only rows whose loan_status is one of the two
# labels below and encode them as 0 = 'Fully Paid', 1 = 'Charged Off'.
#
# The cell is safe to re-run. Once `loan_status` already holds the 0/1 codes,
# filtering on the text labels again would discard every row and hand an empty
# frame to the following cells, so the filter/encode step is skipped then.
status_codes = {'Fully Paid': 0, 'Charged Off': 1}

already_encoded = df['loan_status'].isin(list(status_codes.values())).all()
if not already_encoded:
    # .copy() detaches the filtered rows from the source frame so the column
    # assignment below writes to an independent frame rather than to a slice
    # (which is what triggers pandas' SettingWithCopyWarning).
    df = df[df['loan_status'].isin(list(status_codes))].copy()
    df['loan_status'] = df['loan_status'].map(status_codes)

# Fail here with a clear message instead of letting a later cell crash on an
# empty frame.
if df.empty:
    raise ValueError(
        "No rows left after filtering loan_status to 'Fully Paid' / 'Charged Off'; "
        "reload the data and run the cells in order."
    )

# Class balance of the encoded target (proportions sum to 1).
df['loan_status'].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
loan_status,Unnamed: 1_level_1


In [None]:
# Impute missing values: median for float64/int64 columns, most frequent value
# for object (string) columns.

# An empty frame has no medians or modes to impute with; stop with an explicit
# message rather than the IndexError that `.mode().iloc[0]` would raise.
if df.empty:
    raise ValueError(
        "Cannot impute missing values: `df` has no rows. "
        "Reload the data and run the cells in order."
    )

# Fill missing numerical features with median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical features with mode
cat_cols = df.select_dtypes(include='object').columns
if len(cat_cols) > 0:
    # mode() returns one row per tied mode; row 0 holds the first mode of each
    # column. It has no rows at all when every selected column is entirely NaN,
    # in which case there is nothing to fill with.
    cat_modes = df[cat_cols].mode()
    if not cat_modes.empty:
        df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])


IndexError: single positional indexer is out-of-bounds

In [None]:
# One-hot encode the frame with pandas' default column selection;
# drop_first=True omits the first level of each encoded column.
# NOTE(review): the recorded output reports 1358 columns after this step --
# presumably some high-cardinality text/date columns are being expanded; confirm.
df = pd.get_dummies(data=df, drop_first=True)
print("Shape after one-hot encoding:", df.shape)


Shape after one-hot encoding: (1345310, 1358)


In [None]:
# Separate the label from the predictors, taken from the prepared frame `df`:
#   y -- the `loan_status` target column
#   X -- every other column
y = df['loan_status']
X = df.drop(columns=['loan_status'])


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns of X with StandardScaler.
# Only float64/int64 columns are selected; columns of any other dtype are left
# exactly as they are.
# NOTE(review): the scaler is fitted on every row of X and no train/test split
# is visible in this notebook -- if one is added, fit on the training part only.
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


Checking commit changes!

code ends here!