# [Combined] Credit Card Default Prediction AI

This notebook combines the best strategies from the Private 1st place and Private 3rd place solutions:
1. **Private 1st place (0.6581)**: Powerful `ID` feature identification, KMeans clustering, and 15-fold Stratified CatBoost.
2. **Private 3rd place (0.65913)**: Feature interactions like Income-Age and Income-Employed ratios.

## Key features of this version:
- **Outlier removal**: family_size > 7
- **Feature Engineering**: Rich `ID` concat string, `ability`, `income_mean`, and interaction terms.
- **Clustering**: 36 segments using KMeans.
- **Model**: CatBoost with 15-fold cross-validation.

In [1]:
import numpy as np
import pandas as pd
import warnings, random
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
# from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

warnings.filterwarnings(action='ignore')

# Seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)

# 1. Load Data
import os
base_path = r'c:\Users\alsld\github\data science\pandas\신용카드 사용자 예측'
train = pd.read_csv(os.path.join(base_path, 'train.csv'))
test = pd.read_csv(os.path.join(base_path, 'test.csv'))
submission = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))

In [2]:
# 2. Data Preprocessing
train.fillna('NaN', inplace=True)
test.fillna('NaN', inplace=True)

# Outlier Removal
train = train[train['family_size'] <= 7].reset_index(drop=True)

# Remove constant/redundant columns
train.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)

# Normalize time features
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)

feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat] = np.abs(train[feat])
    test[feat] = np.abs(test[feat])

In [3]:
# 3. Feature Engineering
for df in [train, test]:
    # Professional & Personal ratios (Notebook 1)
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / (df['before_EMPLOYED'] + 1)
    
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'] + 1)
    df['income_mean'] = df['income_total'] / df['family_size']
    
    # Time cycles (Notebook 1)
    df['before_EMPLOYED_m'] = np.floor(df['before_EMPLOYED'] / 30) % 12
    df['before_EMPLOYED_w'] = np.floor(df['before_EMPLOYED'] / 7) % 4
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH'] / 30) % 12
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH'] / 7) % 4
    df['DAYS_EMPLOYED_m'] = np.floor(df['DAYS_EMPLOYED'] / 30) % 12
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED'] / 7) % 4
    
    # Interactions (Notebook 2)
    df['income_age'] = df['income_total'] * df['Age']
    df['income_emp'] = df['income_total'] * df['EMPLOYED']
    
    # ID Generation (Notebook 1)
    df['ID'] = \
    df['child_num'].astype(str) + '_' + df['income_total'].astype(str) + '_' + \
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' + \
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' + \
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' + \
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' + \
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' + \
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) + '_' + \
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str)

# Drop original columns that are summarized in others
cols_to_drop = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
train.drop(cols_to_drop, axis=1, inplace=True)
test.drop(cols_to_drop, axis=1, inplace=True)

In [7]:
# 4. Encoding, Clustering & Scaling
# numerical_feats = train.select_dtypes(exclude='object').columns.tolist()
# if 'credit' in numerical_feats: numerical_feats.remove('credit')
categorical_feats = train.select_dtypes(include='object').columns.tolist()

train['income_total'] = np.log1p(1 + train['income_total'])
test['income_total'] = np.log1p(1 + test['income_total'])

# encoder = OrdinalEncoder(cols=categorical_feats)
# train[categorical_feats] = encoder.fit_transform(train[categorical_feats])
# test[categorical_feats] = encoder.transform(test[categorical_feats])

# train['ID'] = train['ID'].astype('int64')
# test['ID'] = test['ID'].astype('int64')

# Clustering
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=seed).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
test['cluster'] = kmeans.predict(test)

# Scale other numeric features
scale_feats = [f for f in numerical_feats if f != 'income_total']
scaler = StandardScaler()
train[scale_feats] = scaler.fit_transform(train[scale_feats])
test[scale_feats] = scaler.transform(test[scale_feats])

ValueError: could not convert string to float: 'F'

In [15]:
# 5. Modeling
n_fold = 15
target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
cat_pred_test = np.zeros((X_test.shape[0], 3))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']

for fold, (train_idx, valid_idx) in enumerate(skfold.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)
    
    model = CatBoostClassifier(iterations=2000, random_seed=seed, early_stopping_rounds=100, verbose=100)
    model.fit(train_data, eval_set=valid_data, use_best_model=True)
    
    cat_pred_test += model.predict_proba(X_test) / n_fold
    print(f"Fold {fold} finished.\n")

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="F": Cannot convert 'F' to float

In [4]:
# 6. Submission
submission.iloc[:, 1:] = cat_pred_test
submission.to_csv('combined_submission.csv', index=False)
print("Submission file 'combined_submission.csv' generated successfully!")

NameError: name 'cat_pred_test' is not defined