In [3]:
import pandas as pd
import numpy as np
import pickle
import shap
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin

# 1. Load data and check data
def load_data(path):
    return pd.read_csv(path)

def check_missing_values(df, name="Dataset"):
    missing = df.isnull().sum()
    total_missing = missing.sum()
    if total_missing > 0:
        print(f"[{name}] Ada data yang kosong:")
        print(missing[missing > 0])
    else:
        print(f"[{name}] Semua data lengkap, gak ada yang kosong.")

# 2. Feature engineering with original column names
class BusinessFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # 1. Fitur kapasitas finansial
        if all(col in X.columns for col in ['balance', 'income']):
            X['savings_ratio'] = X['balance'] / (X['income'] + 1)
            X['balance_income_interaction'] = X['balance'] * X['income']
            X['financial_stability'] = np.log1p(X['balance']) - np.log1p(X['income'])
            X['income_to_age_ratio'] = X['income'] / (X['age'] + 1)

        # 2. Fitur kelompok umur
        if 'age' in X.columns:
            bins = [18, 25, 35, 45, 55, 65, 100]
            labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
            X['age_group'] = pd.cut(X['age'], bins=bins, labels=labels)
            X['age_squared'] = X['age'] ** 2
            X['is_senior'] = (X['age'] >= 60).astype(int)

        # 3. Fitur engagement kampanye
        if all(col in X.columns for col in ['previous_campaign_contacts', 'previous_campaign_success']):
            X['engagement_intensity'] = np.where(
                X['previous_campaign_contacts'] > 0,
                X['previous_campaign_success'] / X['previous_campaign_contacts'],
                0
            )
            X['high_engagement'] = (X['previous_campaign_contacts'] > np.percentile(
                X['previous_campaign_contacts'], 75)).astype(int)
            X['responsive_client'] = (X['engagement_intensity'] > 0.5).astype(int)
            X['campaign_frequency'] = np.log1p(X['previous_campaign_contacts'])

        # 4. Fitur durasi kontak
        if 'last_contact_duration' in X.columns:
            X['contact_efficiency'] = np.log1p(X['last_contact_duration']) / (
                np.log1p(X['previous_campaign_contacts']) + 1)
            X['duration_bins'] = pd.qcut(X['last_contact_duration'], 5, labels=False, duplicates='drop')
            X['duration_per_contact'] = X['last_contact_duration'] / (X['previous_campaign_contacts'] + 1)

        # 5. Fitur interaksi
        if all(col in X.columns for col in ['age', 'balance']):
            X['age_balance_interaction'] = X['age'] * X['balance']
            X['age_income_interaction'] = X['age'] * X['income']

        # 6. Fitur balance
        if 'balance' in X.columns:
            X['balance_squared'] = X['balance'] ** 2
            X['balance_log'] = np.log1p(X['balance'])
            X['balance_per_age'] = X['balance'] / (X['age'] + 1)

        # 7. Fitur customer tenure
        if 'customer_since' in X.columns:
            X['customer_since'] = pd.to_datetime(X['customer_since'], errors='coerce')
            X['customer_tenure'] = 2023 - X['customer_since'].dt.year
            X['tenure_group'] = pd.cut(X['customer_tenure'], bins=[0, 2, 5, 10, 20, 100],
                                      labels=['0-2', '3-5', '6-10', '11-20', '20+'])

        return X

# 3. Model interpretation
def plot_feature_importance(model, X_val):
    try:
        if hasattr(model.named_steps['preprocessor'], 'get_feature_names_out'):
            feature_names = model.named_steps['preprocessor'].get_feature_names_out()
        else:
            feature_names = X_val.columns.tolist()

        plt.figure(figsize=(12, 8))
        importances = model.named_steps['classifier'].feature_importances_
        indices = np.argsort(importances)[-25:]
        plt.title('Fitur Paling Penting')
        plt.barh(range(len(indices)), importances[indices], color='b', align='center')
        plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
        plt.xlabel('Pentingnya Fitur (Relatif)')
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
        print("\nPlot fitur penting udah disimpan di 'feature_importance.png'")
    except Exception as e:
        print(f"Gagal bikin plot fitur penting: {str(e)}")

def explain_model(model, X_val, sample_idx=0):
    try:
        business_features = model.named_steps['business_features'].transform(X_val)
        preprocessed_data = model.named_steps['preprocessor'].transform(business_features)

        explainer = shap.TreeExplainer(model.named_steps['classifier'])
        shap_values = explainer.shap_values(preprocessed_data)

        plt.figure()
        shap.summary_plot(shap_values, preprocessed_data,
                         feature_names=model.named_steps['preprocessor'].get_feature_names_out(),
                         plot_type="bar", show=False)
        plt.tight_layout()
        plt.savefig('shap_summary.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("Plot SHAP summary udah disimpan di 'shap_summary.png'")

        shap.force_plot(explainer.expected_value, shap_values[0][sample_idx, :],
                        preprocessed_data[sample_idx, :],
                        feature_names=model.named_steps['preprocessor'].get_feature_names_out(),
                        matplotlib=True, show=False)
        plt.tight_layout()
        plt.savefig('shap_force_plot.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("Plot SHAP force udah disimpen di 'shap_force_plot.png'")

        plt.figure()
        shap.decision_plot(explainer.expected_value, shap_values[0][:100, :],
                          feature_names=model.named_steps['preprocessor'].get_feature_names_out(),
                          show=False)
        plt.tight_layout()
        plt.savefig('shap_decision_plot.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("Plot SHAP decision udah disimpen di 'shap_decision_plot.png'")

    except Exception as e:
        print(f"Gagal generate penjelasan SHAP: {str(e)}")

# 4. Generate submission file
def generate_submission_file(model, X_val, original_val_data):
    try:
        y_proba = model.predict_proba(X_val)[:, 1]
        
        submission_df = pd.DataFrame({
            'customer_number': original_val_data['customer_number'].values,
            'berlangganan_deposito': y_proba
        })
        
        submission_df = submission_df[['customer_number', 'berlangganan_deposito']]
        submission_df.to_csv('submission.csv', index=False)
        print("\nFile submission berhasil dibuat di 'submission.csv'")
        print("\nPreview hasil submission:")
        print(submission_df.head())
        
    except Exception as e:
        print(f"Gagal membuat file submission: {str(e)}")

# 5. Train and evaluate multiple models
def train_and_evaluate():
    # Load data
    train_df = load_data("../training_dataset.csv")
    check_missing_values(train_df, "Data Training")

    # Split features and target
    X = train_df.drop(columns=['berlangganan_deposito', 'customer_number'])
    y = train_df['berlangganan_deposito']

    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=y
    )
    original_val_data = train_df.iloc[X_val.index].copy()

    # Define numeric and categorical columns
    num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X_train.select_dtypes(include='object').columns.tolist()

    # Preprocessor
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ])

    # Initialize models
    models = {
        'LGBM': LGBMClassifier(
            objective='binary',
            metric='auc',
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ),
        'RandomForest': RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            random_state=42,
            n_jobs=-1,
            class_weight='balanced'
        ),
        'XGBoost': XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            random_state=42,
            n_jobs=-1,
            verbosity=0
        ),
        'CatBoost': CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            eval_metric='AUC',
            random_state=42,
            verbose=0
        )
    }

    # SMOTE for imbalance
    smote = SMOTE(random_state=42)

    # Evaluate each model individually
    print("\n=== Evaluasi Model Individu ===")
    best_model = None
    best_auc = 0
    best_model_name = ""
    
    for name, model in models.items():
        pipeline = ImbPipeline(steps=[
            ('business_features', BusinessFeatureTransformer()),
            ('preprocessor', preprocessor),
            ('smote', smote),
            ('classifier', model)
        ])
        
        print(f"\nTraining {name}...")
        pipeline.fit(X_train, y_train)
        
        y_pred = pipeline.predict(X_val)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_proba)
        
        print(f"\nHasil evaluasi {name}:")
        print(classification_report(y_val, y_pred))
        print(f"AUC: {auc:.4f}")
        
        if auc > best_auc:
            best_auc = auc
            best_model = pipeline
            best_model_name = name

    # Stacking model
    print("\n=== Membuat Stacking Model ===")
    stacking_model = StackingClassifier(
        estimators=list(models.items()),
        final_estimator=LogisticRegression(max_iter=1000),
        cv=5,
        n_jobs=-1,
        passthrough=True
    )

    stacking_pipeline = ImbPipeline(steps=[
        ('business_features', BusinessFeatureTransformer()),
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('classifier', stacking_model)
    ])

    print("Training stacking model...")
    stacking_pipeline.fit(X_train, y_train)
    
    y_pred_stack = stacking_pipeline.predict(X_val)
    y_proba_stack = stacking_pipeline.predict_proba(X_val)[:, 1]
    auc_stack = roc_auc_score(y_val, y_proba_stack)
    
    print("\nHasil evaluasi Stacking Model:")
    print(classification_report(y_val, y_pred_stack))
    print(f"AUC: {auc_stack:.4f}")

    # Voting classifier
    print("\n=== Membuat Voting Model ===")
    voting_model = VotingClassifier(
        estimators=list(models.items()),
        voting='soft',
        n_jobs=-1
    )

    voting_pipeline = ImbPipeline(steps=[
        ('business_features', BusinessFeatureTransformer()),
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('classifier', voting_model)
    ])

    print("Training voting model...")
    voting_pipeline.fit(X_train, y_train)
    
    y_pred_vote = voting_pipeline.predict(X_val)
    y_proba_vote = voting_pipeline.predict_proba(X_val)[:, 1]
    auc_vote = roc_auc_score(y_val, y_proba_vote)
    
    print("\nHasil evaluasi Voting Model:")
    print(classification_report(y_val, y_pred_vote))
    print(f"AUC: {auc_vote:.4f}")

    # Select best model
    final_model = None
    if auc_stack >= 0.80:
        final_model = stacking_pipeline
        print("\nMenggunakan Stacking Model (AUC >= 0.80)")
    elif auc_vote >= 0.80:
        final_model = voting_pipeline
        print("\nMenggunakan Voting Model (AUC >= 0.80)")
    elif best_auc >= 0.80:
        print(f"\nMenggunakan {best_model_name} Model (AUC >= 0.80)")
        final_model = best_model
    else:
        final_model = stacking_pipeline
        print("\nTidak ada model yang mencapai AUC >= 0.80, menggunakan Stacking Model terbaik")

    # Save model
    with open("deposito_predictor.pkl", "wb") as f:
        pickle.dump(final_model, f)
    print("\nModel terbaik disimpan di 'deposito_predictor.pkl'")

    # Feature importance and SHAP
    plot_feature_importance(final_model, X_val)
    explain_model(final_model, X_val)
    
    # Generate submission
    generate_submission_file(final_model, X_val, original_val_data)

if __name__ == "__main__":
    train_and_evaluate()

[Data Training] Semua data lengkap, gak ada yang kosong.

=== Evaluasi Model Individu ===

Training LGBM...





Hasil evaluasi LGBM:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      3046
           1       0.58      0.28      0.38       392

    accuracy                           0.90      3438
   macro avg       0.75      0.63      0.66      3438
weighted avg       0.88      0.90      0.88      3438

AUC: 0.7951

Training RandomForest...

Hasil evaluasi RandomForest:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3046
           1       0.47      0.59      0.52       392

    accuracy                           0.88      3438
   macro avg       0.71      0.75      0.73      3438
weighted avg       0.89      0.88      0.88      3438

AUC: 0.7929

Training XGBoost...





Hasil evaluasi XGBoost:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      3046
           1       0.58      0.31      0.40       392

    accuracy                           0.90      3438
   macro avg       0.75      0.64      0.67      3438
weighted avg       0.88      0.90      0.88      3438

AUC: 0.7793

Training CatBoost...





Hasil evaluasi CatBoost:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      3046
           1       0.63      0.29      0.40       392

    accuracy                           0.90      3438
   macro avg       0.77      0.63      0.67      3438
weighted avg       0.88      0.90      0.88      3438

AUC: 0.7863

=== Membuat Stacking Model ===
Training stacking model...





Hasil evaluasi Stacking Model:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3046
           1       0.50      0.44      0.47       392

    accuracy                           0.89      3438
   macro avg       0.72      0.69      0.70      3438
weighted avg       0.88      0.89      0.88      3438

AUC: 0.7708

=== Membuat Voting Model ===
Training voting model...





Hasil evaluasi Voting Model:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3046
           1       0.56      0.35      0.43       392

    accuracy                           0.89      3438
   macro avg       0.74      0.66      0.68      3438
weighted avg       0.88      0.89      0.88      3438

AUC: 0.7953

Tidak ada model yang mencapai AUC >= 0.80, menggunakan Stacking Model terbaik

Model terbaik disimpan di 'deposito_predictor.pkl'
Gagal bikin plot fitur penting: 'StackingClassifier' object has no attribute 'feature_importances_'
Gagal generate penjelasan SHAP: Model type not yet supported by TreeExplainer: <class 'sklearn.ensemble._stacking.StackingClassifier'>

File submission berhasil dibuat di 'submission.csv'

Preview hasil submission:
   customer_number  berlangganan_deposito
0           907098               0.043122
1           699895               0.070934
2           440407               0.099262
3           78741



<Figure size 1200x800 with 0 Axes>