In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import joblib

In [2]:
# 1. Load data
df = pd.read_csv('new_file.csv')
df.head()

Unnamed: 0,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_closed_L6M,pct_closed_tl,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Consumer_TL,Gold_TL,...,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2,Approved_Flag
0,0,0,0.0,0.8,0.0,0.0,0,0,0,1,...,0.0,0.0,1,0,Married,12TH,M,PL,PL,P2
1,0,0,0.0,0.0,1.0,0.0,0,0,1,0,...,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan,P2
2,1,0,0.0,0.0,0.25,0.0,1,0,6,1,...,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others,P2
3,1,0,0.0,0.0,1.0,0.0,1,0,0,0,...,0.0,0.0,0,0,Married,SSC,M,others,others,P2
4,0,0,0.0,0.667,0.0,0.0,0,0,0,0,...,0.0,0.0,0,0,Married,POST-GRADUATE,M,AL,AL,P1


In [3]:
# 2. Handle missing values
# Replace known missing indicators with np.nan
df.replace([-99999, 'None', 'none', 'NaN', 'nan', ''], np.nan, inplace=True)

In [4]:
# Drop rows where target is missing
df.dropna(subset=['Approved_Flag'], inplace=True)

In [5]:
# Fill or drop remaining nulls (optional strategy: drop rows with >50% missing)
df = df[df.isnull().mean(axis=1) < 0.5]

In [6]:
# 3. Define features and target
X = df.drop(columns='Approved_Flag')
y = df['Approved_Flag']

In [7]:
# 4. Define categorical columns
onehot_cols = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
ordinal_col = ['EDUCATION']
all_cat_cols = onehot_cols + ordinal_col

In [8]:
df['EDUCATION'].value_counts()

EDUCATION
GRADUATE          16655
12TH              14458
SSC                9268
UNDER GRADUATE     5491
OTHERS             2915
POST-GRADUATE      2240
PROFESSIONAL        269
Name: count, dtype: int64

In [21]:
# Order for ordinal encoding
education_order = [['SSC', '12TH', 'UNDER GRADUATE', 'GRADUATE', 'POST-GRADUATE', 'PROFESSIONAL', 'OTHERS']]

In [22]:
# 5. Define numerical columns
numerical_cols = [col for col in X.columns if col not in all_cat_cols]

In [23]:
# 6. Preprocessing pipelines
onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

In [24]:
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(categories=education_order))
])

In [25]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

In [26]:
# Combine all preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_pipeline, onehot_cols),
        ('ordinal', ordinal_pipeline, ordinal_col),
        ('num', numerical_pipeline, numerical_cols)
    ],
    remainder='drop'
)

In [27]:
# Fit OneHotEncoder to get SMOTENC categorical indices
X_onehot_imputed = SimpleImputer(strategy='constant', fill_value='missing').fit_transform(X[onehot_cols])
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
ohe.fit(X_onehot_imputed)
num_ohe_features = sum(len(cats) - 1 for cats in ohe.categories_)
categorical_indices = list(range(num_ohe_features)) + [num_ohe_features]

In [28]:
# Encode target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [29]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42)

In [30]:
# Define XGBoost model
xgb_model = XGBClassifier(
    n_estimators=287,
    max_depth=4,
    learning_rate=0.185,
    reg_alpha=0.787,
    reg_lambda=0.884,
    objective='multi:softprob',
    num_class=len(np.unique(y_encoded)),
    eval_metric='mlogloss',
    random_state=42,
    verbosity=0
)

In [31]:
# Full pipeline
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=42)),
    ('selectk', SelectKBest(score_func=mutual_info_classif, k=33)),
    ('model', xgb_model)
])

In [32]:
# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_encoded, cv=skf, scoring='accuracy')
print(f"Stratified CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

Stratified CV Accuracy: 0.7561 ± 0.0045


In [33]:
# Train final model
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Test Accuracy: 0.7571150097465887

Classification Report:
               precision    recall  f1-score   support

          P1       0.77      0.80      0.78      1161
          P2       0.82      0.90      0.86      6433
          P3       0.41      0.20      0.27      1490
          P4       0.61      0.64      0.62      1176

    accuracy                           0.76     10260
   macro avg       0.65      0.63      0.63     10260
weighted avg       0.73      0.76      0.74     10260



In [34]:
# Save model
joblib.dump(pipeline, 'best_pipeline.joblib')

['best_pipeline.joblib']

In [35]:
# Get selected feature names
def get_feature_names(preprocessor):
    feature_names = []
    for name, transformer, columns in preprocessor.transformers_:
        if hasattr(transformer, 'named_steps'):
            if 'onehot' in transformer.named_steps:
                names = transformer.named_steps['onehot'].get_feature_names_out(columns)
            else:
                names = columns
        else:
            names = columns
        feature_names.extend(names)
    return feature_names

selectk = pipeline.named_steps['selectk']
selected_mask = selectk.get_support()
feature_names = get_feature_names(pipeline.named_steps['preprocessor'])

In [36]:
selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]

In [37]:
print("\nSelected Features:", selected_features)
print("\nNumber of Selected Features:", len(selected_features))


Selected Features: ['MARITALSTATUS_Single', 'last_prod_enq2_ConsumerLoan', 'last_prod_enq2_others', 'Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_closed_L6M', 'pct_closed_tl', 'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Consumer_TL', 'Gold_TL', 'Home_TL', 'PL_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment', 'num_times_delinquent', 'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_std', 'num_std_6mts', 'recent_level_of_deliq', 'time_since_recent_enq', 'Time_With_Curr_Empr', 'pct_opened_TLs_L6m_of_L12m', 'PL_Flag', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag']

Number of Selected Features: 33
