# [Hybrid] Rank 1 Features + Rank 3 Weighted Blending Ensemble
이 노트북은 1위의 우수한 특징 추출(Feature Engineering) 기법과 3위의 모델 앙상블 전략을 결합한 통합 솔루션입니다.

## 핵심 포인트
- **Rank 1 전략**: 고유 ID 생성, KMeans 클러스터링, 정교한 시간 파생 변수
- **Rank 3 전략**: CatBoost + LGBM + XGBoost 가중치 블렌딩 앙상블

In [5]:
!pip install catboost category_encoders lightgbm xgboost



In [6]:
import numpy as np
import pandas as pd
import warnings
import random
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Suppress every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')

# One seed shared by the stdlib and NumPy global generators, and reused
# below as the random_state of the splitter, KMeans and the models.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

In [7]:
def _add_derived_features(df):
    """Return a new frame with the engineered columns added to ``df``.

    The same steps are applied to the train and the test frame, so they live
    in one helper instead of being duplicated. ``df`` itself is not modified:
    the first ``drop`` returns a new frame and all later writes go to it.
    """
    # 'index' and 'FLAG_MOBIL' are removed only when present.
    unused_cols = [c for c in ('index', 'FLAG_MOBIL') if c in df.columns]
    df = df.drop(columns=unused_cols)

    # Positive DAYS_EMPLOYED values are clamped to 0, then the day/month
    # counters are turned into non-negative magnitudes.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].clip(upper=0)
    for col in ('DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED'):
        df[col] = df[col].abs()

    # Rank 1 features
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / (df['before_EMPLOYED'] + 1)

    df['Age'] = df['DAYS_BIRTH'] // 365
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365

    # Approximate month-of-year / week-of-month positions (30-day months,
    # 4-week cycles) derived from the day counters.
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH'] / 30) % 12
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH'] / 7) % 4
    df['DAYS_EMPLOYED_m'] = np.floor(df['DAYS_EMPLOYED'] / 30) % 12
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED'] / 7) % 4

    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'] + 1)
    df['income_mean'] = df['income_total'] / df['family_size']

    # Rank 3 features
    df['income_age'] = df['income_total'] * df['Age']
    df['income_emp'] = df['income_total'] * df['EMPLOYED']

    # ID (Rank 1): rows sharing all of these attributes get the same string
    # key. Built before income_total is log-transformed and before the raw
    # day columns are dropped.
    id_cols = [
        'child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
        'work_phone', 'phone', 'email', 'family_size', 'gender', 'car',
        'reality', 'income_type', 'edu_type', 'family_type', 'house_type',
        'occyp_type',
    ]
    df['ID'] = df[id_cols].astype(str).agg('_'.join, axis=1)

    return df.drop(columns=['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED'])


def preprocess_data(train_df, test_df):
    """Build the model-ready train and test frames.

    Steps, in order: fill missing values with the string 'NaN', drop train
    rows with ``family_size`` > 7, add engineered features, log-transform
    ``income_total``, ordinal-encode object columns (encoder fitted on train),
    add a KMeans ``cluster`` column (fitted on train without ``credit``), and
    standard-scale every numeric column except ``credit`` and
    ``income_total`` (scaler fitted on train).

    Args:
        train_df: Raw training frame; must contain ``credit`` and the
            columns referenced by the feature code.
        test_df: Raw test frame with the same feature columns.

    Returns:
        Tuple ``(train_df, test_df)`` of new, processed DataFrames. The
        frames passed in are left unmodified.
    """
    print("Preprosessing...")
    # Non-inplace fillna returns new frames, so the caller's raw data is
    # never mutated by this function.
    train_df = train_df.fillna('NaN')
    test_df = test_df.fillna('NaN')

    # Outlier (Rank 1): applied to the training rows only.
    train_df = train_df[train_df['family_size'] <= 7].reset_index(drop=True)

    train_df = _add_derived_features(train_df)
    test_df = _add_derived_features(test_df)

    # Captured before any encoding so it lists the string-typed columns
    # (including the freshly built 'ID').
    categorical_feats = train_df.select_dtypes(include=['object']).columns.tolist()

    # NOTE(review): log1p already adds 1, so this evaluates log(2 + income).
    # Kept as-is so the produced feature values do not change.
    train_df['income_total'] = np.log1p(1 + train_df['income_total'])
    test_df['income_total'] = np.log1p(1 + test_df['income_total'])

    encoder = OrdinalEncoder(cols=categorical_feats)
    train_df[categorical_feats] = encoder.fit_transform(train_df[categorical_feats])
    test_df[categorical_feats] = encoder.transform(test_df[categorical_feats])

    train_df['ID'] = train_df['ID'].astype('int64')
    test_df['ID'] = test_df['ID'].astype('int64')

    print("Clustering...")
    kmeans_train = train_df.drop(['credit'], axis=1)
    kmeans = KMeans(n_clusters=36, random_state=SEED).fit(kmeans_train)
    train_df['cluster'] = kmeans.predict(kmeans_train)
    # Select the test columns explicitly in the fitted order instead of
    # relying on the two files happening to share a column layout.
    test_df['cluster'] = kmeans.predict(test_df[kmeans_train.columns])

    numerical_feats = train_df.select_dtypes(exclude=['object', 'category']).columns.tolist()
    # The target and the already log-scaled income are not standardised.
    scaler_feats = [c for c in numerical_feats if c not in ('credit', 'income_total')]
    scaler = StandardScaler()
    train_df[scaler_feats] = scaler.fit_transform(train_df[scaler_feats])
    test_df[scaler_feats] = scaler.transform(test_df[scaler_feats])

    return train_df, test_df

In [8]:
import os

# Load the raw data and run the shared preprocessing pipeline.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
train_df, test_df = preprocess_data(train_df, test_df)

# Create the output folder before training: a missing directory would
# otherwise only surface at to_csv time, after every fold has been fitted.
output_dir = '신용카드 사용자 예측/'
os.makedirs(output_dir, exist_ok=True)

n_fold = 15
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=SEED)

X = train_df.drop('credit', axis=1)
y = train_df['credit']
# Index the test frame by the training columns so every model sees the
# features in the order it was fitted on.
X_test = test_df[X.columns]

# Per-model test probabilities (3 classes), averaged over the folds.
cat_preds = np.zeros((X_test.shape[0], 3))
lgb_preds = np.zeros((X_test.shape[0], 3))
xgb_preds = np.zeros((X_test.shape[0], 3))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold}...")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Each model uses the held-out fold only as its early-stopping set; its
    # test prediction contributes 1/n_fold to the running average.
    cat = CatBoostClassifier(iterations=2000, random_state=SEED, verbose=0, early_stopping_rounds=100)
    cat.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    cat_preds += cat.predict_proba(X_test) / n_fold

    lgbm = LGBMClassifier(n_estimators=1000, learning_rate=0.01, random_state=SEED, verbosity=-1)
    lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=50)])
    lgb_preds += lgbm.predict_proba(X_test) / n_fold

    xgboost = XGBClassifier(n_estimators=1000, learning_rate=0.01, random_state=SEED, verbosity=0, early_stopping_rounds=50)
    xgboost.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_preds += xgboost.predict_proba(X_test) / n_fold

# Fixed-weight blend of the three fold-averaged probability matrices.
final_preds = 0.5 * cat_preds + 0.3 * lgb_preds + 0.2 * xgb_preds

submission = pd.read_csv('data/sample_submission.csv')
# Replace the probability columns wholesale rather than writing floats into
# the existing columns through iloc, which depends on their current dtype.
submission[submission.columns[1:]] = final_preds
submission.to_csv(output_dir + 'submission.csv', index=False)
test_df.to_csv(output_dir + 'test.csv', index=False)
print(f"Submission and Test files saved in {output_dir}!")

Preprosessing...
Clustering...
Fold 0...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.718165
Fold 1...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.733991
Fold 2...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.726591
Fold 3...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.735904
Fold 4...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.732314
Fold 5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.730892
Fold 6...