In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
!pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')



In [2]:
loan_train =pd.read_csv('train_csv.csv' )
print(loan_train.shape) # (614, 13)
loan_train.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
total_null=loan_train.isnull().sum().sort_values(ascending=False)
total_null.head(10)

Credit_History      50
Self_Employed       32
LoanAmount          22
Dependents          15
Loan_Amount_Term    14
Gender              13
Married              3
Loan_ID              0
Education            0
ApplicantIncome      0
dtype: int64

In [4]:
loan_train['Gender'] = loan_train ['Gender'].fillna( loan_train['Gender'].dropna ().mode().values[0])
loan_train['Married'] = loan_train['Married'].fillna( loan_train['Married'].dropna ().mode().values[0])
loan_train['Dependents'] = loan_train['Dependents'].fillna(loan_train['Dependents'].dropna ().mode().values[0])
loan_train['Self Employed'] = loan_train[ 'Self_Employed'].fillna(loan_train['Self_Employed'].dropna().mode().values [0] )
loan_train['LoanAmount'] = loan_train['LoanAmount'].fillna(loan_train ['LoanAmount'].dropna().mean())
loan_train['Loan_Amount_Term'] = loan_train[ 'Loan_Amount_Term'].fillna(loan_train['Loan_Amount_Term'].dropna ().mode().values [0])
loan_train['Credit_History'] = loan_train [ 'Credit_History'].fillna(loan_train['Credit_History'].dropna().mode().values [0])

In [5]:
loan_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
 13  Self Employed      614 non-null    object 
dtypes: float64(4), int64(1), object(9)
memory usage: 67.3+ KB


In [6]:
print(set(loan_train['Gender'].values.tolist()))
print(set(loan_train['Dependents'].values.tolist()))
print(set(loan_train['Married'].values.tolist()))
print(set(loan_train['Education'].values.tolist()))
print(set(loan_train['Self_Employed'].values.tolist()))
print(set(loan_train['Loan_Status'].values.tolist()))
print(set(loan_train['Property_Area'].values.tolist()))

{'Female', 'Male'}
{'0', '2', '1', '3+'}
{'Yes', 'No'}
{'Not Graduate', 'Graduate'}
{'Yes', 'No', nan}
{'N', 'Y'}
{'Urban', 'Rural', 'Semiurban'}


In [7]:
loan_train['Loan_Status'] = loan_train['Loan_Status'].map({'N': 0, 'Y': 1}).fillna(0).astype(int)
loan_train = pd.get_dummies(loan_train, columns=['Gender', 'Dependents', 'Married', 'Education', 'Self Employed', 'Property_Area'])
standardScaler = StandardScaler()
loan_train['Loan_Term'] = loan_train['Loan_Amount_Term']
del loan_train['Loan_Amount_Term']
loan_train[['CoapplicantIncome', 'LoanAmount', 'Loan_Term']] = standardScaler.fit_transform(loan_train[['CoapplicantIncome', 'LoanAmount', 'Loan_Term']])

In [8]:
y = loan_train['Loan_Status']  # Select target variable (Loan_Status)
X = loan_train.drop(['Loan_Status', 'Loan_ID'], axis=1)  # Drop target and ID columns

# Split data using the defined variables
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
gbm_param_grid = {
    'n_estimators': range(1, 1000, 10),
    'max_depth': range(1, 20),
    'learning_rate': [.1, .4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1.0, 1.1]
}
X_train = pd.get_dummies(X_train, columns=['Self_Employed'])  # One-hot encode Self_Employed
X_test = pd.get_dummies(X_test, columns=['Self_Employed'])
xgb_classifier = XGBClassifier(enable_categorical=True)

xgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid,
                                 estimator=xgb_classifier, scoring="accuracy",
                                 verbose=0, n_iter=100, cv=4)
error_score='raise'


xgb_random.fit(X_train,y_train)


print(f'Best parameters: {xgb_random.best_params_}')

y_pred = xgb_random.predict(X_test)
print(f'Accuracy: {np.sum(y_pred == y_test) / len(y_test)}')

Best parameters: {'n_estimators': 41, 'max_depth': 1, 'learning_rate': 0.55, 'colsample_bytree': 0.7}
Accuracy: 0.7886178861788617


In [10]:
param_grid = {

'max_depth': range(4,25),
'min_samples_leaf': range(10, 100, 10),
'min_samples_split' : range(10, 100, 10),
'criterion': ['gini', 'entropy']
}
n_folds = 5
dt = DecisionTreeClassifier (random_state=np.random.randint(0, 100))
dt_grid = GridSearchCV(dt, param_grid, cv = n_folds, return_train_score=True, verbose=0)
dt_grid.fit(X_train,y_train)
print(dt_grid.best_params_)
# {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 10}
y_pred_best=dt_grid.predict(X_test)
acc = metrics.accuracy_score (y_test, y_pred_best)
print(acc)


{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 10}
0.7804878048780488


In [11]:
rf_param_grid = {
'n_estimators': range (1,1000, 100),
}
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(param_distributions=rf_param_grid,estimator = rf, scoring = "accuracy",verbose = 0, n_iter = 100, cv = 4)
rf_random.fit(X_train, y_train)

best_params = rf_random.best_params_
print(f'Best parameters: {best_params}')
# Best parameters: {'n_estimators': 101}
y_pred1 = rf_random.predict(X_test)
print(f'Accuracy: {np.sum(y_pred1==y_test)/len(y_test)}')



Best parameters: {'n_estimators': 701}
Accuracy: 0.7723577235772358


In [None]:
svm_param_grid = {
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'C' :range(1,11)
}
svm = SVC( )
svm_random = RandomizedSearchCV(param_distributions=svm_param_grid,estimator = svm, scoring = "accuracy",verbose = 0, n_iter = 100, cv = 4)
svm_random.fit(X_train, y_train)
best_params = svm_random.best_params_
print(f'Best parameters: {best_params}')
# Best parameters: {'kernel': 'linear' 'C': 1}
y_pred_best=svm_random.predict(X_test)
acc = metrics.accuracy_score (y_test, y_pred_best)
print(acc)
# 0.788638980231

In [None]:
def feature_imp(df,model):
    feat = pd.DataFrame(columns=['feature', 'importance'])
    feat["feature"] = df.columns
    feat["importance"] = model.best_estimator_.feature_importances_
    return feat.sort_values(by="importance", ascending=False)

In [None]:
feature_imp(X_train,dt_grid).plot('feature', 'importance', 'barh',figsize=(10,7), legend=False)