In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style='darkgrid')
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv('data/train_u6lujuX_CVtuZ9i.csv')

# Handle missing values
df['Gender'] = df['Gender'].fillna(df['Gender'].dropna().mode().values[0])
df['Married'] = df['Married'].fillna(df['Married'].dropna().mode().values[0])
df['Dependents'] = df['Dependents'].fillna('0').str.rstrip('+')  # Change 1
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].dropna().mode().values[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].dropna().mean())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].dropna().mode().values[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].dropna().mode().values[0])

# Convert 'Dependents' to numeric safely
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce').fillna(0).astype(int)  # Change 2

# Categorical variable encoding
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
df['Married'] = df['Married'].map({'No': 0, 'Yes': 1}).astype(int)
df['Education'] = df['Education'].map({'Not Graduate': 0, 'Graduate': 1}).astype(int)
df['Self_Employed'] = df['Self_Employed'].map({'No': 0, 'Yes': 1}).astype(int)
df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y': 1}).astype(int)

# Verify data types
df.dtypes

# Split into features and target
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

# One-hot encoding for categorical variables
X = pd.get_dummies(X)

I defined a function which will help us to create xgboost models and perform cross validation

In [3]:
dtrain= pd.get_dummies(df)
import xgboost as xgb
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV


train = dtrain
target = 'Loan_Status'
IDcol = 'Loan_ID'

In [12]:
predictors = [x for x in train.columns if x not in [target, IDcol]]

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds
        )
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors ], dtrain[target])

    # Predict training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target], dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))


In [8]:
from xgboost import XGBClassifier

In [14]:
predictors = [x for x in train.columns if x not in [target, IDcol]]

# Initialize the model
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,  # Updated from nthread
    scale_pos_weight=1,
    random_state=27  # Updated from seed
)

# Fit the model
modelfit(xgb2, train, predictors)


Model Report
Accuracy : 0.9153
AUC Score (Train): 0.985733


Parameter Tuning For XGBoost model

In [15]:
param_test1 = {
    'max_depth': list(range(3, 10, 2)),
    'min_child_weight': list(range(1, 6, 2))
}

# Initialize GridSearchCV without 'iid' parameter
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27  # Updated from 'seed'
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5
)
gsearch1.fit(train[predictors], train[target])
print("Best parameters after test 1:", gsearch1.best_params_)
print("Best AUC score after test 1:", gsearch1.best_score_)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Best parameters after test 1: {'max_depth': 5, 'min_child_weight': 5}
Best AUC score after test 1: 0.7650016821843446
