In [None]:
import pandas as pd

In [None]:
# Load the training and test splits from CSV files in the working directory.
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# Print the training frame's column dtypes and non-null counts.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visualise missing data in the training frame: every heatmap cell is one
# (row, column) entry of the boolean mask returned by df.isnull().
missing_mask = df.isnull()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Count rows of the training frame that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Display the number of unique values for each column of the training frame.
print("Number of unique values per column:")
display(df.nunique())

In [None]:
# Display the number of unique values for each column of the TEST frame
# (the printed caption is the same as for the training frame above).
print("Number of unique values per column:")
display(df_test.nunique())

In [None]:
# Remove the same three columns from both splits so their schemas stay aligned.
cols_to_drop = ['EmployeeCount', 'Over18', 'StandardHours']
df = df.drop(columns=cols_to_drop)
df_test = df_test.drop(columns=cols_to_drop)

# Preview the first rows of each split after the drop.
print("DataFrames after dropping specified columns:")
for caption, frame in (
    ("\nTraining DataFrame (df):", df),
    ("\nTest DataFrame (df_test):", df_test),
):
    print(caption)
    display(frame.head())

In [None]:
# Hand-picked column groups used by the exploration cells below.
# numerical_cols: columns treated as continuous / count-like measurements.
numerical_cols = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'EmployeeNumber',
    'HourlyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# ordinal_cols: columns treated as ordered levels/ratings.
ordinal_cols = [
    'Education',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance',
]

In [None]:
# Display summary statistics for the numerical columns of the training frame,
# transposed so that each row is one column of the data.
print("Summary statistics for numerical columns:")
display(df[numerical_cols].describe().T)

In [None]:
# Per-class means: group the training rows by 'Attrition', average every
# numerical column, then transpose so each row is one feature.
print("Mean of numerical columns grouped by Attrition:")
mean_by_attrition = df.groupby('Attrition')[numerical_cols].mean()
display(mean_by_attrition.T)

In [None]:
import math

# Grid layout: a fixed number of columns and as many rows as the features need.
n_cols = 4  # You can adjust the number of columns
n_rows = math.ceil(len(numerical_cols) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten()  # 1-D view so axes can be paired with columns

# One vertical boxplot per numerical column; zip stops after the last column.
for ax, col in zip(axes, numerical_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_ylabel('')  # the title already names the column

# Remove the grid cells that did not receive a plot.
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# One standalone 30-bin histogram figure per numerical column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(df[col], bins=30, edgecolor='black')
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Calculate the skewness of every numerical column.
# Despite its name, `skewed_cols` holds the skewness VALUE per column
# (a Series indexed by column name), not a list of skewed columns.
skewed_cols = df[numerical_cols].skew()

print("Skewness of numerical columns:")
display(skewed_cols)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Columns that receive the Yeo-Johnson power transformation.
yeo_johnson_cols = ['MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsSinceLastPromotion']

yeo_johnson_transformer = PowerTransformer(method='yeo-johnson')

# Fit the transformer on the training split only, then apply the SAME fitted
# transformation to the test split. Transforming only `df` would leave these
# columns on their raw scale in `df_test`, while every later step (scaler,
# models) is fitted on the transformed training values.
# NOTE: this cell overwrites the columns in place, so re-running it without
# reloading the data applies the transformation a second time.
df[yeo_johnson_cols] = yeo_johnson_transformer.fit_transform(df[yeo_johnson_cols])
df_test[yeo_johnson_cols] = yeo_johnson_transformer.transform(df_test[yeo_johnson_cols])

print("DataFrames after applying Yeo-Johnson transformation to specified columns:")
print("\nTraining DataFrame (df):")
display(df.head())
print("\nTest DataFrame (df_test):")
display(df_test.head())

In [None]:
# Recompute the skewness of the numerical columns on the current state of `df`.
print("Skewness of numerical columns:")
display(df[numerical_cols].skew())

In [None]:
# Split the training frame into the target column and everything else.
y_train = df['Attrition']
x_train = df.drop(columns='Attrition')

# Show both pieces, each under its own caption.
for caption, obj in (
    ("X_train (features):", x_train),
    ("\ny_train (target):", y_train),
):
    print(caption)
    display(obj)

In [None]:
!pip install category_encoders

In [None]:
import category_encoders as ce

# Every object-dtype feature is encoded; 'id' is excluded if it happens to be
# an object column so that the identifier is never encoded.
nominal_cols = x_train.select_dtypes(include='object').columns.tolist()
if 'id' in nominal_cols:
    nominal_cols.remove('id')

# Use an OrdinalEncoder configured to tolerate categories unseen during fit.
encoder = ce.OrdinalEncoder(
    cols=nominal_cols,
    handle_unknown='value',  # categories not seen during fit are encoded as -1 (per category_encoders docs)
    handle_missing='value'   # missing values get their own code; -2 at transform time if none were seen during fit (per docs)
)

# TRAIN: learn the category -> integer mapping and apply it.
x_train_encoded = encoder.fit_transform(x_train)

# TEST: apply the mapping learned on the training data (no re-fitting).
x_test_encoded = encoder.transform(df_test)

print("DataFrames after Ordinal Encoding:")
print("\nTraining DataFrame:")
display(x_train_encoded)
print("\nTest DataFrame:")
display(x_test_encoded)

In [None]:
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()

# Scale every encoded column except the identifier.
cols_to_scale = [c for c in x_train_encoded.columns if c != 'id']

# Only columns present in BOTH frames can be scaled; warn about the rest and
# leave them out.
missing_cols = set(cols_to_scale).difference(x_test_encoded.columns)
if missing_cols:
    print(f"Warning: Kolom {missing_cols} tidak ada di df_test")
    cols_to_scale = [c for c in cols_to_scale if c not in missing_cols]

# Fit on the training split, then reuse the fitted statistics for the test split.
robust_scaler.fit(x_train_encoded[cols_to_scale])
x_train_encoded[cols_to_scale] = robust_scaler.transform(x_train_encoded[cols_to_scale])
x_test_encoded[cols_to_scale] = robust_scaler.transform(x_test_encoded[cols_to_scale])

print("DataFrames after scale:")
for caption, frame in (
    ("\nTraining DataFrame:", x_train_encoded),
    ("\nTest DataFrame:", x_test_encoded),
):
    print(caption)
    display(frame)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between all encoded training features; the 'id'
# column is left out of the matrix.
x_train_correlation = x_train_encoded.drop(columns='id').corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(x_train_correlation, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Variables (Excluding ID)')
plt.show()

In [None]:
# Find pairs of variables whose absolute correlation is >= 0.8.
# unstack() flattens the square matrix into a Series indexed by
# (variable_a, variable_b), which contains every pair twice plus the diagonal.
correlation_pairs = x_train_correlation.unstack()

# Keep each unordered pair exactly once by requiring variable_a < variable_b
# (string comparison of the column names). This drops the self-correlations
# on the diagonal AND the mirrored (b, a) duplicates, without discarding a
# genuinely perfect correlation between two different columns.
first_var = correlation_pairs.index.get_level_values(0)
second_var = correlation_pairs.index.get_level_values(1)
unique_pairs = correlation_pairs[first_var < second_var]

high_correlation_pairs = (
    unique_pairs[unique_pairs.abs() >= 0.8]
    .sort_values(ascending=False)
)

print("Pairs of variables with correlation >= 0.8:")
display(high_correlation_pairs)

In [None]:
# Columns removed from both feature frames before modelling.
cols_reduced = ['JobLevel', 'YearsInCurrentRole', 'YearsWithCurrManager']

# Apply the identical drop to train and test so their columns keep matching.
x_train_reduced, x_test_reduced = (
    frame.drop(columns=cols_reduced)
    for frame in (x_train_encoded, x_test_encoded)
)

# Correlation matrix of the remaining training features ('id' excluded).
x_train_correlation_reduced = x_train_reduced.drop(columns='id').corr()

# Annotated heatmap for the reduced feature set.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(x_train_correlation_reduced, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Variables (Excluding ID)')
plt.show()

In [None]:
# .info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each
# summary.
print("Data x Train")
x_train_reduced.info()
print("\nData y Train")
y_train.info()
print("\nData Test")
x_test_reduced.info()

In [None]:
# Render the reduced training features, the training target and the reduced
# test features, each under its own caption.
print("Data x Train")
display(x_train_reduced)
print("\nData Train")
display(y_train)
print("\nData Test")
display(x_test_reduced)

# Code coba coba

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import pandas as pd

# Drop the 'id' column from the training and test features.
x_train_final = x_train_reduced.drop('id', axis=1)
x_test_final = x_test_reduced.drop('id', axis=1)

# Candidate models to compare.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}

# No labelled test set is available, so each candidate is scored with
# stratified cross-validation on the training data. Scoring a model on the
# very rows it was fitted on rewards memorisation (e.g. a fully grown random
# forest reaches a near-perfect training AUC) and would pick the most
# overfit model rather than the one that generalises best.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_scores = {}

for name, model in models.items():
    print(f"Training {name}...")
    # cross_val_score fits a fresh clone per fold; `model` itself stays unfitted.
    fold_scores = cross_val_score(model, x_train_final, y_train, cv=cv, scoring='roc_auc')
    roc_auc = fold_scores.mean()
    roc_auc_scores[name] = roc_auc
    print(f"{name} ROC AUC (CV mean): {roc_auc:.4f}")

# Show the cross-validated ROC AUC of every candidate.
print("\nROC AUC Scores:")
for model_name, score in roc_auc_scores.items():
    print(f"{model_name}: {score:.4f}")

# Best model according to the cross-validated ROC AUC.
best_model_name = max(roc_auc_scores, key=roc_auc_scores.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} (ROC AUC = {roc_auc_scores[best_model_name]:.4f})")

# Refit the winner on the full training data, then predict the probability of
# the positive class for the test rows.
best_model.fit(x_train_final, y_train)
y_test_pred = best_model.predict_proba(x_test_final)[:, 1]

# Revisi disini - gajadi direvisi, pakenya yg coba coba

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# Model inputs: the reduced feature frames without the identifier column.
x_train_final = x_train_reduced.drop('id', axis=1)
x_test_final = x_test_reduced.drop('id', axis=1)

# Hold out 20% of the labelled rows for validation, stratified on the target.
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
    x_train_final,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

# Candidate models, all with a fixed seed.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    # probability=True makes SVC expose predict_proba
    "Support Vector Machine": SVC(probability=True, random_state=42),
}

# Fit each candidate on the training part and score it on the held-out part.
roc_auc_scores = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x_train_split, y_train_split)

    # Prefer class-1 probabilities; fall back to the raw decision scores.
    if not hasattr(model, "predict_proba"):
        y_pred_proba = model.decision_function(x_val_split)
    else:
        y_pred_proba = model.predict_proba(x_val_split)[:, 1]

    # A ValueError from the metric is reported and recorded as None.
    try:
        roc_auc = roc_auc_score(y_val_split, y_pred_proba)
    except ValueError as e:
        print(f"Could not calculate ROC AUC for {name}: {e}")
        roc_auc_scores[name] = None
    else:
        roc_auc_scores[name] = roc_auc
        print(f"{name} ROC AUC: {roc_auc:.4f}")

print("\nROC AUC Scores:")
display(roc_auc_scores)

# Pick the first model with the highest available score; the defaults
# (None, -1) remain when no model could be scored.
best_model_name, best_roc_auc = None, -1
scored = {n: s for n, s in roc_auc_scores.items() if s is not None}
if scored:
    top_name = max(scored, key=scored.get)
    if scored[top_name] > best_roc_auc:
        best_model_name, best_roc_auc = top_name, scored[top_name]

print(f"\nBest performing model based on ROC AUC: {best_model_name} with ROC AUC = {best_roc_auc:.4f}")

# Coba Random Forest

In [None]:
# Random Forest settings: 200 trees, unrestricted depth, fixed seed, and all
# CPU cores (n_jobs=-1).
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_params)

# Train on the full training features.
rf_model.fit(x_train_final, y_train)

# Probability of the positive class for every test row.
y_test_pred = rf_model.predict_proba(x_test_final)[:, 1]

# Pair each test id with its predicted probability.
submission_df = pd.DataFrame({
    "id": x_test_reduced["id"],
    "prediction": y_test_pred,
})

# Write the submission file without the index column.
submission_df.to_csv("submission_random_forest.csv", index=False)

In [None]:
# Fit a Logistic Regression on the entire training data.
# NOTE(review): the model is hard-coded here rather than taken from the
# comparison results above; confirm Logistic Regression really was the best
# performer before relying on this cell.
best_model = LogisticRegression(random_state=42)
best_model.fit(x_train_final, y_train)

# Predict the probability of the positive class on the test data.
test_predictions_proba = best_model.predict_proba(x_test_final)[:, 1]

# Build the submission frame.
# NOTE(review): this frame names the score column 'Attrition' and takes ids
# from df_test, whereas the other submission cells use 'prediction' and
# x_test_reduced["id"]; confirm which column name the competition expects.
# The frame is only displayed here, not written to disk.
submission_df = pd.DataFrame({'id': df_test['id'], 'Attrition': test_predictions_proba})

print("Submission DataFrame:")
display(submission_df.head())

# Task
Perform hyperparameter tuning on the previously evaluated models to optimize their performance, focusing on improving the ROC AUC score. Compare the performance of the tuned models and select the best one. Finally, train the best model on the entire training data and make predictions on the test data.

## Select model for tuning

### Subtask:
Choose one or more models for hyperparameter tuning: any of the previously evaluated models (Logistic Regression, Random Forest, Gradient Boosting, SVM) or a new candidate such as XGBoost.


**Reasoning**:
Review the ROC AUC scores and identify the best performing model to select for hyperparameter tuning.



# 1st Tune - Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

# Model inputs: reduced feature frames without the identifier column.
x_train_final = x_train_reduced.drop('id', axis=1)
x_test_final = x_test_reduced.drop('id', axis=1)

# Search space sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}

rf = RandomForestClassifier(random_state=42)

# 30 sampled configurations x 3 CV folds, ranked by ROC AUC, run on all cores.
search_settings = dict(
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    **search_settings,
)
rf_random_search.fit(x_train_final, y_train)

print("Best Parameters:", rf_random_search.best_params_)
print("Best ROC AUC (CV):", rf_random_search.best_score_)

# best_estimator_ is the winning configuration exposed by the fitted search.
best_rf_model = rf_random_search.best_estimator_
y_test_pred = best_rf_model.predict_proba(x_test_final)[:, 1]

# Pair each test id with its predicted probability and write the file.
submission_df = pd.DataFrame({
    "id": x_test_reduced["id"],
    "prediction": y_test_pred,
})
submission_df.to_csv("submission_rf_tuned.csv", index=False)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 

# 2nd Tune - GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

# Exhaustive grid: 3 * 4 * 3 * 3 * 2 * 2 = 432 combinations, each evaluated
# with 3-fold cross-validation.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

rf = RandomForestClassifier(random_state=42)

# Rank every combination by ROC AUC, using all CPU cores.
grid_settings = dict(scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)
rf_grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **grid_settings)
rf_grid_search.fit(x_train_final, y_train)

print("Best Parameters:", rf_grid_search.best_params_)
print("Best ROC AUC (CV):", rf_grid_search.best_score_)

# Overwrites the best_rf_model produced by the randomized search, if any.
best_rf_model = rf_grid_search.best_estimator_

# Score the test rows with the winning configuration.
x_test_final = x_test_reduced.drop('id', axis=1)
y_test_pred = best_rf_model.predict_proba(x_test_final)[:, 1]

submission_df = pd.DataFrame({
    "id": x_test_reduced["id"],
    "prediction": y_test_pred,
})
submission_df.to_csv("submission_rf_gridsearch.csv", index=False)


# 3rd Tune - XGBoost + RandomizedSearchCV

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import pandas as pd


# Base XGBoost classifier. The `use_label_encoder` argument is intentionally
# not passed: it is deprecated in current XGBoost releases, which ignore it
# and emit a warning on every fit (i.e. once per CV fit during the search).
xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss'
)

# Search space sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_lambda': [0.5, 1, 1.5, 2],
    'reg_alpha': [0, 0.1, 0.2, 0.3]
}

# 30 sampled configurations x 3 CV folds, ranked by ROC AUC, run on all cores.
xgb_random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

xgb_random_search.fit(x_train_final, y_train)

print("Best Parameters:", xgb_random_search.best_params_)
print("Best ROC AUC (CV):", xgb_random_search.best_score_)

# best_estimator_ is the winning configuration exposed by the fitted search.
best_xgb_model = xgb_random_search.best_estimator_

# Probability of the positive class for every test row.
y_test_pred = best_xgb_model.predict_proba(x_test_final)[:, 1]

submission_df = pd.DataFrame({
    "id": x_test_reduced["id"],
    "prediction": y_test_pred
})

submission_df.to_csv("submission_xgb_randomsearch.csv", index=False)


# Terbaik 1


In [None]:
# =====================================================
# ATTRITION STACKING ENSEMBLE
# Self-contained pipeline: reloads the raw CSVs, target-encodes the
# categorical columns, robust-scales the numerical ones, oversamples with
# SMOTE, cross-validates a stacking classifier (XGBoost + LightGBM + CatBoost
# with a Logistic Regression meta-learner) and writes a submission file.
# NOTE(review): catboost is imported here, but the `!pip install catboost`
# cell appears later in the notebook; lightgbm and imblearn are not installed
# by any visible cell. Confirm the environment before a fresh run.
# =====================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# === LOAD DATA ===
# The raw files are read again, so none of the earlier preprocessing
# (column drops, Yeo-Johnson, ordinal encoding) applies to this pipeline.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

id_col = "id"
target_col = "Attrition"

X = train.drop([id_col, target_col], axis=1)
y = train[target_col]
X_test = test.drop(id_col, axis=1)

# === SPLIT CATEGORICAL & NUMERICAL COLUMNS (by dtype) ===
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

# === TARGET ENCODING for the categorical columns ===
# NOTE(review): the encoder is fitted on the full training set using y, before
# any cross-validation split below, so rows that later serve as validation
# rows have already contributed their labels to the encoding (target leakage
# into the CV estimate).
te = TargetEncoder(cols=cat_cols)
X[cat_cols] = te.fit_transform(X[cat_cols], y)
X_test[cat_cols] = te.transform(X_test[cat_cols])

# === SCALING for the numerical columns ===
# Fitted on the training data, then applied unchanged to the test data.
scaler = RobustScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# === HANDLE CLASS IMBALANCE with SMOTE ===
# NOTE(review): resampling happens BEFORE the cross-validation split, so the
# validation folds below contain synthetic rows and the training folds may
# contain synthetic rows derived from validation rows. The reported AUC is
# therefore measured on the resampled data, not on the original class
# distribution, and is likely optimistic.
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_res, y_res = smote.fit_resample(X, y)

# === BASE MODELS ===
# NOTE(review): scale_pos_weight / class_weight='balanced' are applied on top
# of the SMOTE oversampling; confirm this double imbalance correction is
# intended.
xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    scale_pos_weight=2,
    reg_lambda=1.2,
    reg_alpha=0.2,
    eval_metric="auc",
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=40,
    subsample=0.85,
    colsample_bytree=0.85,
    class_weight='balanced',
    random_state=42,
    metric='auc'
)

cat = CatBoostClassifier(
    iterations=800,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=2,
    eval_metric='AUC',
    verbose=0,
    random_seed=42
)

# === STACKING ENSEMBLE ===
# The three boosters are the base estimators; a Logistic Regression combines
# their outputs. cv=5 is the internal split StackingClassifier uses to build
# the meta-learner's training inputs.
stack_model = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('cat', cat)
    ],
    final_estimator=LogisticRegression(max_iter=200),
    cv=5,
    n_jobs=-1
)

# === CROSS-VALIDATION ===
# Outer 5-fold stratified CV over the RESAMPLED data; oof_preds collects the
# out-of-fold probability for every resampled row (same length as X_res).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_res))
fold = 1

for train_idx, val_idx in skf.split(X_res, y_res):
    print(f"\n=== Fold {fold} ===")
    X_tr, X_val = X_res.iloc[train_idx], X_res.iloc[val_idx]
    y_tr, y_val = y_res.iloc[train_idx], y_res.iloc[val_idx]

    # The whole stack is refitted from scratch on each fold's training part.
    stack_model.fit(X_tr, y_tr)
    val_pred = stack_model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    print(f"Fold {fold} AUC: {auc:.4f}")
    fold += 1

# === Overall AUC across all out-of-fold predictions ===
overall_auc = roc_auc_score(y_res, oof_preds)
print(f"\n✅ Overall ROC-AUC: {overall_auc:.4f}")

# === FIT FULL MODEL & PREDICT TEST ===
# Final refit on all resampled rows; this fitted stack_model is what later
# cells inspect.
stack_model.fit(X_res, y_res)
final_pred = stack_model.predict_proba(X_test)[:, 1]

# === SUBMISSION ===
submission = pd.DataFrame({
    id_col: test[id_col],
    "prediction": final_pred
})
submission.to_csv("submission_attrition_final.csv", index=False)

print("\n🎯 File 'submission_attrition_final.csv' berhasil dibuat tanpa error!")


In [None]:
!pip install catboost

In [None]:
!pip freeze > requirements.txt


In [None]:
!cat requirements.txt


In [None]:
# Colab-only helper: hand requirements.txt to google.colab's files.download.
# The import fails outside a Google Colab runtime.
from google.colab import files
files.download("requirements.txt")


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score


def _thresholded_metrics(y_true, positive_prob):
    """Return (labels, roc_auc, accuracy, recall) for probabilities cut at 0.5."""
    labels = (positive_prob >= 0.5).astype(int)
    return (
        labels,
        roc_auc_score(y_true, positive_prob),
        accuracy_score(y_true, labels),
        recall_score(y_true, labels),
    )


# NOTE(review): the two tuned models are scored with 3-fold cross_val_predict
# on (x_train_final, y_train), while the stacking ensemble is scored on the
# SMOTE-resampled labels y_res; the rows of the table are therefore not
# measured on the same data.

# 1. Random Forest: whichever tuning cell last assigned best_rf_model.
rf_model = best_rf_model
rf_val_pred_prob = cross_val_predict(rf_model, x_train_final, y_train, cv=3, method='predict_proba')[:, 1]
rf_val_pred, rf_roc_auc, rf_accuracy, rf_recall = _thresholded_metrics(y_train, rf_val_pred_prob)

# 2. XGBoost from the randomized search.
xgb_model = best_xgb_model
xgb_val_pred_prob = cross_val_predict(xgb_model, x_train_final, y_train, cv=3, method='predict_proba')[:, 1]
xgb_val_pred, xgb_roc_auc, xgb_accuracy, xgb_recall = _thresholded_metrics(y_train, xgb_val_pred_prob)

# 3. Stacking ensemble: reuse the out-of-fold predictions produced during its
#    StratifiedKFold loop.
stack_val_pred_prob = oof_preds
stack_val_pred, stack_roc_auc, stack_accuracy, stack_recall = _thresholded_metrics(y_res, stack_val_pred_prob)

# Side-by-side comparison, one row per model.
comparison_rows = [
    ("RandomForest (GridSearchCV)", rf_roc_auc, rf_accuracy, rf_recall),
    ("XGBoost (RandomizedSearchCV)", xgb_roc_auc, xgb_accuracy, xgb_recall),
    ("Stacking Ensemble", stack_roc_auc, stack_accuracy, stack_recall),
]
comparison_df = pd.DataFrame(comparison_rows, columns=["Model", "ROC AUC", "Accuracy", "Recall"])

print(comparison_df)


In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# Convert the out-of-fold probabilities to hard labels with a 0.5 cut-off.
oof_pred_labels = (oof_preds >= 0.5).astype(int)

# Per-class report as a dict, with readable class names for labels 0 and 1.
class_names = ['No Attrition', 'Attrition']
report_dict = classification_report(
    y_res,
    oof_pred_labels,
    target_names=class_names,
    output_dict=True,
)

# One row per class / aggregate, one column per metric.
report_df = pd.DataFrame(report_dict).T

# Round every column except the last one to four decimals.
report_df.iloc[:, :-1] = report_df.iloc[:, :-1].round(4)

print(report_df)


In [None]:
import pandas as pd
from sklearn.metrics import brier_score_loss, roc_auc_score
from scipy import stats

# Brier score of the out-of-fold probabilities.
brier = brier_score_loss(y_res, oof_preds)

# Gini coefficient derived from the ROC AUC as 2 * AUC - 1.
roc_auc = roc_auc_score(y_res, oof_preds)
gini = 2 * roc_auc - 1

# Two-sample KS statistic between the scores of the two classes.
preds_positive = oof_preds[y_res == 1]  # Attrition
preds_negative = oof_preds[y_res == 0]  # No Attrition
ks_stat = stats.ks_2samp(preds_positive, preds_negative).statistic

# One row per metric, values rounded to six decimals.
metrics_df = pd.DataFrame({
    "Metrik": ["Brier Score", "Gini Coefficient", "KS Statistic"],
    "Stacking Ensemble": [brier, gini, ks_stat],
})
metrics_df["Stacking Ensemble"] = metrics_df["Stacking Ensemble"].round(6)

print(metrics_df)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Hard labels from the out-of-fold probabilities (0.5 cut-off).
oof_pred_labels = (oof_preds >= 0.5).astype(int)

# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_res, oof_pred_labels)
print("Confusion Matrix:\n", cm)

# Annotated heatmap of the same matrix.
class_names = ['No Attrition', 'Attrition']
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Stacking Ensemble')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC points and area under the curve for the out-of-fold probabilities.
fpr, tpr, thresholds = roc_curve(y_res, oof_preds)
roc_auc = auc(fpr, tpr)

# Model curve plus the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color='red', lw=1, linestyle='--', label='Random Guess')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Stacking Ensemble')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC points and area under the curve for the out-of-fold probabilities.
fpr, tpr, thresholds = roc_curve(y_res, oof_preds)
roc_auc = auc(fpr, tpr)

# Model curve plus the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Stacking Ensemble (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Random Guess')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC-AUC Curve - Stacking Ensemble')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

# Report the AUC numerically as well.
print(f"ROC-AUC Stacking Ensemble: {roc_auc:.6f}")


In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Scores under evaluation: out-of-fold probabilities of the stacking ensemble.
y_scores = oof_preds

# Curve data for both panels.
fpr, tpr, _ = roc_curve(y_res, y_scores)
roc_auc = auc(fpr, tpr)
precision, recall, _ = precision_recall_curve(y_res, y_scores)
pr_auc = average_precision_score(y_res, y_scores)

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve with the random-guess diagonal.
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Random Guess')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve - Stacking Ensemble')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)

# Right panel: precision-recall curve, labelled with the average precision.
ax_pr.plot(recall, precision, color='green', lw=2, label=f'PR curve (AUC = {pr_auc:.4f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve - Stacking Ensemble')
ax_pr.legend(loc='lower left')
ax_pr.grid(True)

plt.tight_layout()
plt.show()

# Report both areas numerically.
print(f"ROC-AUC Stacking Ensemble: {roc_auc:.6f}")
print(f"PR-AUC Stacking Ensemble: {pr_auc:.6f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every out-of-fold probability with its true label.
df_probs = pd.DataFrame({
    "prob_attrition": oof_preds,  # predicted probability of Attrition
    "actual": y_res               # true label (0=No Attrition, 1=Attrition)
})

# Overlay one density histogram (with KDE) per class on shared axes.
fig, ax = plt.subplots(figsize=(10, 6))
for class_value, color, label in (
    (0, 'blue', 'No Attrition'),
    (1, 'red', 'Attrition'),
):
    class_probs = df_probs.loc[df_probs["actual"] == class_value, "prob_attrition"]
    sns.histplot(class_probs, color=color, label=label, kde=True, bins=30,
                 stat='density', alpha=0.6, ax=ax)

ax.set_title('Distribusi Probabilitas Prediksi Attrition - Stacking Ensemble')
ax.set_xlabel('Probabilitas Prediksi Attrition')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every out-of-fold probability with its true label.
df_probs = pd.DataFrame({
    "prob_attrition": oof_preds,  # predicted probability of Attrition
    "actual": y_res               # true label (0=No Attrition, 1=Attrition)
})

# KDE of the predicted probabilities, one filled curve per class.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
plt.figure(figsize=(10,6))
sns.kdeplot(df_probs[df_probs["actual"]==0]["prob_attrition"],
            fill=True, color='steelblue', label='No Attrition', alpha=0.6)
sns.kdeplot(df_probs[df_probs["actual"]==1]["prob_attrition"],
            fill=True, color='orange', label='Attrition', alpha=0.6)

plt.title('Distribusi Probabilitas Prediksi Attrition - Stacking Ensemble')
plt.xlabel('Predicted probabilities')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Pair every out-of-fold probability with its true label.
df_probs = pd.DataFrame({
    "prob_attrition": oof_preds,  # predicted probability of Attrition
    "actual": y_res               # true label (0=No Attrition, 1=Attrition)
})

# Split the rows into ten equal-sized score bins; labels=False yields 0..9,
# so +1 gives deciles numbered 1 (lowest scores) to 10 (highest scores).
df_probs['decile'] = pd.qcut(df_probs['prob_attrition'], 10, labels=False) + 1

# Attrition rate per decile = mean of the 0/1 labels in each bin.
# Built as a new frame instead of renaming in place.
decile_summary = (
    df_probs.groupby('decile')['actual']
    .mean()
    .rename('attrition_rate')
    .reset_index()
)

# Bar chart of the attrition rate per score decile.
plt.figure(figsize=(10,6))
plt.bar(decile_summary['decile'], decile_summary['attrition_rate'], color='navy', alpha=0.8)
plt.title('Probability Scores Ordering - attrition rate per Decile')
plt.xlabel('Decile (1 = Lowest Prob, 10 = Highest Prob)')
plt.ylabel('Attrition Rate')  # fixed the misspelled axis label ("atrrition Rate")
plt.xticks(decile_summary['decile'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# === TAMBAHKAN LIBRARY UNTUK INTERPRETASI ===
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
import shap

# The stacking model is expected to be fitted already in an earlier cell:
# stack_model.fit(X_res, y_res)
print("\n=== INTERPRETASI MODEL FINAL ===")

# -----------------------------------------------------------------
# INTERPRETATION LEVEL 1: weights of the final (meta) estimator
# -----------------------------------------------------------------
print("\n--- Level 1: Bobot Base Model ---")

try:
    # Fitted meta-learner that sits on top of the base models
    final_estimator = stack_model.final_estimator_

    # First (only) coefficient row - the task is binary classification
    weights = final_estimator.coef_[0]

    # Names of the base models, in the order they were registered
    model_names = [entry[0] for entry in stack_model.estimators]

    print("Bobot yang diberikan Logistic Regression ke tiap base model:")
    for name, weight in zip(model_names, weights):
        print("-> {}: {:.4f}".format(name, weight))

    # Simple bar chart of the weights
    plt.figure(figsize=(10, 5))
    weight_ax = sns.barplot(x=model_names, y=weights, palette="viridis")
    weight_ax.set_title("Bobot Base Model di Final Estimator (Logistic Regression)")
    weight_ax.set_ylabel("Koefisien (Bobot)")
    weight_ax.set_xlabel("Base Model")
    plt.show()

except Exception as err:
    # Best effort: report and carry on with the rest of the cell
    print(f"Tidak bisa mengambil bobot final estimator: {err}")


# -----------------------------------------------------------------
# INTERPRETATION LEVEL 2: Permutation importance (global feature ranking)
# -----------------------------------------------------------------
# NOTE: ideally this is measured on a separate validation set. Per the notes in
# this notebook the model was fit on X_res, so scoring on X_res can be
# optimistically biased.
print("\n--- Level 2: Permutation Importance (Fitur Global) ---")
print("Menghitung Permutation Importance... (Mungkin perlu beberapa saat)")

# Importance is the drop in ROC AUC when one column of X_res is shuffled
perm_importance = permutation_importance(
    stack_model,
    X_res,
    y_res,
    n_repeats=10,
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1
)

# Take the labels from X_res itself - the frame that was actually permuted -
# so a label can never be attached to the wrong importance value.
feature_names = X_res.columns.tolist()
sorted_idx = perm_importance.importances_mean.argsort()  # ascending order
ranked_idx = sorted_idx[::-1]                            # most important first

# Ranked table; the index doubles as the rank (0 = most important)
perm_df = pd.DataFrame(
    data={
        'feature': np.array(feature_names)[ranked_idx],
        'importance_mean': perm_importance.importances_mean[ranked_idx],
    }
)

print("\nTop 15 Fitur Paling Berpengaruh (Permutation Importance):")
print(perm_df.head(15))

# Bar chart of the 15 most important features
plt.figure(figsize=(12, 8))
sns.barplot(
    data=perm_df.head(15),
    x='importance_mean',
    y='feature',
    palette='rocket'
)
plt.title("Top 15 Feature Importance (Permutation) untuk Stacked Model")
plt.xlabel("Penurunan Rata-rata Skor AUC")
plt.ylabel("Fitur")
plt.tight_layout()
plt.show()


# -----------------------------------------------------------------
# INTERPRETATION LEVEL 2 (alternative): SHAP (proxy method)
# -----------------------------------------------------------------
# "Proxy" method: instead of explaining the stack itself, look at the SHAP
# values of each fitted base model inside the stack.
print("\n--- Level 2 (Alternatif): SHAP Summary per Base Model ---")
shap.initjs()  # load the JavaScript needed for notebook plots

# Small background sample taken from X_res to keep SHAP fast
background_data = shap.utils.sample(X_res, 100)

# Draw the rows to explain ONCE, capped at the rows available:
# X_res.sample(1000) raises ValueError when X_res has fewer than 1000 rows.
# Reusing the same frame also guarantees values and features line up.
shap_sample = X_res.sample(min(1000, len(X_res)), random_state=1)

# Loop over the base models fitted inside the stack
for model_name, _ in stack_model.estimators:
    try:
        print(f"\nMenganalisis SHAP untuk: {model_name}")

        # Fitted copy of the base model held by the stack
        model_in_stack = stack_model.named_estimators_[model_name]

        # TreeExplainer only supports tree models; anything else lands in `except`
        explainer = shap.TreeExplainer(model_in_stack, background_data)

        shap_values = explainer(shap_sample)

        # Some models return one slice per class (3-D values), others a single
        # 2-D block. Slicing [:, :, 1] unconditionally raised IndexError on the
        # 2-D case, so those models were silently skipped.
        class1_values = shap_values.values
        if class1_values.ndim == 3:
            class1_values = class1_values[:, :, 1]  # positive class (Attrition=1)

        shap.summary_plot(class1_values, shap_sample,
                          show=False, plot_size=(10, 6))
        plt.title(f"SHAP Summary Plot untuk {model_name} (Class 1)")
        plt.show()

    except Exception as e:
        print(f"Error saat membuat SHAP untuk {model_name}: {e}")
        print("Catatan: Beberapa model mungkin butuh perlakuan khusus untuk SHAP.")


print("\n✅ Interpretasi model selesai.")
# Continue with the test-set prediction
# final_pred = stack_model.predict_proba(X_test)[:, 1]
# ...

In [None]:
# === TAMBAHKAN LIBRARY UNTUK INTERPRETASI ===
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
import shap
import pandas as pd # Pastikan pandas diimpor jika belum

# The stacking model is expected to be fitted already in an earlier cell:
# stack_model.fit(X_res, y_res)
print("\n=== INTERPRETASI MODEL FINAL ===")

# -----------------------------------------------------------------
# INTERPRETATION LEVEL 1: weights of the final (meta) estimator
# -----------------------------------------------------------------
print("\n--- Level 1: Bobot Base Model ---")

try:
    # Fitted meta-learner that sits on top of the base models
    final_estimator = stack_model.final_estimator_

    # First (only) coefficient row - the task is binary classification
    weights = final_estimator.coef_[0]

    # Names of the base models, in the order they were registered
    model_names = [entry[0] for entry in stack_model.estimators]

    print("Bobot yang diberikan Logistic Regression ke tiap base model:")
    for name, weight in zip(model_names, weights):
        print("-> {}: {:.4f}".format(name, weight))

    # Simple bar chart of the weights
    plt.figure(figsize=(10, 5))
    weight_ax = sns.barplot(x=model_names, y=weights, palette="viridis")
    weight_ax.set_title("Bobot Base Model di Final Estimator (Logistic Regression)")
    weight_ax.set_ylabel("Koefisien (Bobot)")
    weight_ax.set_xlabel("Base Model")
    plt.show()

except Exception as err:
    # Best effort: report and carry on with the rest of the cell
    print(f"Tidak bisa mengambil bobot final estimator: {err}")


# -----------------------------------------------------------------
# INTERPRETATION LEVEL 2: Permutation importance (global feature ranking)
# -----------------------------------------------------------------
# NOTE: ideally this is measured on a separate validation set. Per the notes in
# this notebook the model was fit on X_res, so scoring on X_res can be
# optimistically biased.
print("\n--- Level 2: Permutation Importance (Fitur Global) ---")
print("Menghitung Permutation Importance... (Mungkin perlu beberapa saat)")

# Importance is the drop in ROC AUC when one column of X_res is shuffled
perm_importance = permutation_importance(
    stack_model,
    X_res,
    y_res,
    n_repeats=10,
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1
)

# Take the labels from X_res itself - the frame that was actually permuted -
# so a label can never be attached to the wrong importance value.
feature_names = X_res.columns.tolist()
sorted_idx = perm_importance.importances_mean.argsort()  # ascending order
ranked_idx = sorted_idx[::-1]                            # most important first

# Ranked table; the index doubles as the rank (0 = most important)
perm_df = pd.DataFrame(
    data={
        'feature': np.array(feature_names)[ranked_idx],
        'importance_mean': perm_importance.importances_mean[ranked_idx],
    }
)

# === Show every feature, not only the top 15 ===
print("\nSemua Fitur Paling Berpengaruh (Permutation Importance):")
# option_context lifts the row limit only for this print and then restores
# whatever value was set before (reset_option would force the pandas default).
with pd.option_context('display.max_rows', None):
    print(perm_df)

# === Plot every feature ===
# Scale the figure height with the number of features:
# at least 8 inches, otherwise 0.5 inch per feature.
num_features = len(perm_df)
plot_height = max(8, num_features * 0.5)

plt.figure(figsize=(12, plot_height))
sns.barplot(
    data=perm_df,  # full table, not .head(15)
    x='importance_mean',
    y='feature',
    palette='rocket'
)
plt.title("Semua Feature Importance (Permutation) untuk Stacked Model")
plt.xlabel("Penurunan Rata-rata Skor AUC")
plt.ylabel("Fitur")
plt.tight_layout()
plt.show()


# -----------------------------------------------------------------
# INTERPRETATION LEVEL 2 (alternative): SHAP (proxy method)
# -----------------------------------------------------------------
# "Proxy" method: instead of explaining the stack itself, look at the SHAP
# values of each fitted base model inside the stack.
print("\n--- Level 2 (Alternatif): SHAP Summary per Base Model ---")
shap.initjs()  # load the JavaScript needed for notebook plots

# Small background sample taken from X_res to keep SHAP fast
background_data = shap.utils.sample(X_res, 100)

# Draw the rows to explain ONCE, capped at the rows available:
# X_res.sample(1000) raises ValueError when X_res has fewer than 1000 rows.
# Reusing the same frame also guarantees values and features line up.
shap_sample = X_res.sample(min(1000, len(X_res)), random_state=1)

# Loop over the base models fitted inside the stack
for model_name, _ in stack_model.estimators:
    try:
        print(f"\nMenganalisis SHAP untuk: {model_name}")

        # Fitted copy of the base model held by the stack
        model_in_stack = stack_model.named_estimators_[model_name]

        # TreeExplainer only supports tree models; anything else lands in `except`
        explainer = shap.TreeExplainer(model_in_stack, background_data)

        shap_values = explainer(shap_sample)

        # Some models return one slice per class (3-D values), others a single
        # 2-D block. Slicing [:, :, 1] unconditionally raised IndexError on the
        # 2-D case, so those models were silently skipped.
        class1_values = shap_values.values
        if class1_values.ndim == 3:
            class1_values = class1_values[:, :, 1]  # positive class (Attrition=1)

        shap.summary_plot(class1_values, shap_sample,
                          show=False, plot_size=(10, 6))
        plt.title(f"SHAP Summary Plot untuk {model_name} (Class 1)")
        plt.show()

    except Exception as e:
        print(f"Error saat membuat SHAP untuk {model_name}: {e}")
        print("Catatan: Beberapa model mungkin butuh perlakuan khusus untuk SHAP.")


print("\n✅ Interpretasi model selesai.")
# Continue with the test-set prediction
# final_pred = stack_model.predict_proba(X_test)[:, 1]
# ...

In [None]:
# =====================================================
# ✅ BLOK KODE INTERPRETASI SHAP
# =====================================================

import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# --- These names must already exist from earlier cells ---
# stack_model: the fitted StackingClassifier
# X_res: the feature DataFrame (e.g. X_res or X_val)
# feature_names: list of the column names of X_res
# -----------------------------------------------------------

print("\n=== INTERPRETASI SHAP UNTUK SATU KARYAWAN ===")

# 1. Initialise SHAP's JavaScript (needed for plots in Jupyter/Colab)
shap.initjs()

# 2. Logistic (sigmoid) helper
def sigmoid(x):
    """Convert log-odds into a probability: 1 / (1 + e^(-x))."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

# 3. Prediction function handed to SHAP (returns log-odds)
# Explaining log-odds rather than probabilities keeps the SHAP values additive
# on the log-odds scale, which is what the sigmoid check further down relies on.
def predict_log_odds(X):
    """Return the stacked model's log-odds of attrition for each row of X.

    X may be a DataFrame or any array-like; anything that is not a DataFrame
    is wrapped in one using the notebook-level `feature_names` as columns.
    """
    X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=feature_names)

    # Probability of class 1 (attrition)
    probas = stack_model.predict_proba(X_df)[:, 1]

    # log(p / (1 - p)); clip first so p = 0 or p = 1 cannot produce log(0)
    epsilon = 1e-15
    probas_clipped = np.clip(probas, epsilon, 1 - epsilon)
    return np.log(probas_clipped / (1 - probas_clipped))

# 4. Build the SHAP explainer
print("Menyiapkan SHAP KernelExplainer...")
print("Catatan: Ini mungkin lambat. Kita gunakan 100 sampel data background.")

# 'shap.sample' draws the background data;
# 'background_data' is the reference set behind the base value E[f(X)]
background_data = shap.sample(X_res, 100)

# KernelExplainer is model-agnostic: it only needs a prediction function
explainer = shap.KernelExplainer(predict_log_odds, background_data)

# 5. Pick one employee to analyse (here: positional index 10)
karyawan_index = 10
karyawan_data = X_res.iloc[[karyawan_index]]  # double brackets keep a 1-row DataFrame

print(f"\nMenganalisis karyawan di indeks: {karyawan_index}")

# 6. Compute the SHAP values for that employee
# nsamples=50 overrides the library default ('auto'); it is the number of
# samples used to estimate the SHAP values.
shap_values_karyawan = explainer.shap_values(karyawan_data, nsamples=50)

# 7. Base value E[f(X)]
# predict_log_odds returns log-odds, so this is the average predicted log-odds
base_value = explainer.expected_value
print(f"\nBase Value (E[f(X)]) (rata-rata log-odds): {base_value:.4f}")
print(f"Probabilitas Attrition rata-rata (Sigmoid dari Base Value): {sigmoid(base_value):.4f}")

# 8. Waterfall plot of the per-feature contributions (log-odds scale)
print("\nMembuat Waterfall Plot (kontribusi fitur dalam log-odds)...")
shap.waterfall_plot(shap.Explanation(
    values=shap_values_karyawan[0], # SHAP values of the single row
    base_values=base_value,        # base value E[f(X)]
    data=karyawan_data.values[0],  # the row's feature values
    feature_names=feature_names    # feature labels
))

# 9. Manual check: base value + sum of SHAP values -> log-odds -> probability
print("\n--- Verifikasi Manual ---")
final_log_odds = base_value + shap_values_karyawan[0].sum()
prob_dari_shap = sigmoid(final_log_odds)
prob_dari_model = stack_model.predict_proba(karyawan_data)[:, 1][0]

print(f"Log-odds Akhir f(x) (Base + SHAP): {final_log_odds:.4f}")
print(f"Probabilitas Attrition (dari Sigmoid): {prob_dari_shap:.4f}")
print(f"Probabilitas Attrition (dari Model.predict_proba): {prob_dari_model:.4f}")

# Compare the probability rebuilt from SHAP with the model's own prediction
if np.isclose(prob_dari_shap, prob_dari_model):
    print("✅ Hasil perhitungan manual (Sigmoid) SESUAI dengan prediksi model!")
else:
    print("⚠️ Hasil perhitungan manual BERBEDA. Cek proses Explainer.")

# 10. Force plot
print("\nMembuat Force Plot (Visualisasi interaktif)...")
display(shap.force_plot(
    base_value,
    shap_values_karyawan[0],
    karyawan_data,
    matplotlib=False # use the interactive JavaScript version
))

print("\n✅ Interpretasi SHAP selesai.")

In [None]:
# =====================================================
# ✅ BLOK KODE UNTUK SHAP GLOBAL FEATURE IMPORTANCE (Bar Plot)
# =====================================================

import shap
import matplotlib.pyplot as plt
import numpy as np # Pastikan numpy sudah diimpor

# --- These names must already exist from earlier cells ---
# stack_model: the fitted StackingClassifier
# X_res: the feature DataFrame (e.g. X_res or X_val)
# predict_log_odds: function returning the model's log-odds predictions
# -----------------------------------------------------------

print("\n=== SHAP GLOBAL FEATURE IMPORTANCE (Bar Plot) ===")

# --- Step 1: reuse the KernelExplainer if there is one, otherwise build it ---
# Checking the TYPE matters: other cells in this notebook also assign the name
# `explainer` (to a TreeExplainer of a single base model). A bare
# "does the name exist" test would silently explain the wrong model.
if isinstance(globals().get('explainer'), shap.KernelExplainer):
    print("Menggunakan KernelExplainer yang sudah ada.")
else:
    print("Membuat SHAP KernelExplainer baru untuk Global Importance.")
    background_data_for_global = shap.sample(X_res, 100)
    explainer = shap.KernelExplainer(predict_log_odds, background_data_for_global)


# --- Step 2: SHAP values for a sample of the data ---
# KernelExplainer is slow, so explain a sample (larger than the background
# set, but capped) instead of every row of X_res.
print("Menghitung SHAP values untuk sampel data (ini mungkin perlu waktu)...")
sample_size_for_global_shap = min(2000, len(X_res))  # at most 2000 rows
# Draw the sample once and reuse it for both the SHAP values and the plot,
# so the values and the feature matrix are guaranteed to refer to the same rows.
X_global_sample = X_res.sample(sample_size_for_global_shap, random_state=42)
shap_values_global = explainer.shap_values(X_global_sample)

# --- Step 3: summary bar plot ---
# predict_log_odds returns one value per row (log-odds of Attrition=1), so the
# explainer yields a single array of SHAP values; no class index is needed.

print("Membuat SHAP Summary Bar Plot...")
plt.figure(figsize=(10, 7))
shap.summary_plot(
    shap_values_global,
    X_global_sample,
    plot_type="bar",   # mean(|SHAP value|) per feature
    show=False,        # keep the figure open so the title can be set below
    feature_names=X_global_sample.columns.tolist(),  # labels from the explained frame itself
    color_bar=False,   # a bar plot has no colour scale
    max_display=20,    # top 20 features
    plot_size=(10, 7)  # summary_plot resizes the figure itself; pass the size explicitly
)
plt.title("SHAP Global Feature Importance (Rata-rata Absolute SHAP Value)")
plt.xlabel("mean(|SHAP value|)")
plt.tight_layout()
plt.show()

print("\n✅ SHAP Global Feature Importance Bar Plot selesai.")

In [None]:
# =====================================================
# ✅ SOLUSI CEPAT 1: PERMUTATION IMPORTANCE
# =====================================================

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.inspection import permutation_importance

# --- These names must already exist from earlier cells ---
# stack_model: the fitted model
# X_res, y_res: the data used for fitting
# ----------------------------------------

print("\n--- Menghitung Permutation Importance (Jauh Lebih Cepat) ---")

# Importance is measured on the training data (X_res).
# n_repeats=5 halves the cost of the earlier 10-repeat run.
perm_importance = permutation_importance(
    stack_model,
    X_res,
    y_res,
    n_repeats=5,
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1
)

# Rank the features from most to least important
sorted_idx = perm_importance.importances_mean.argsort()  # ascending order
ranked_idx = sorted_idx[::-1]                            # most important first

# Labels come straight from X_res - the frame that was permuted - instead of a
# `feature_names` list left behind by another cell, so they cannot be misaligned.
perm_df = pd.DataFrame(
    data={
        'feature': X_res.columns.to_numpy()[ranked_idx],
        'importance_mean': perm_importance.importances_mean[ranked_idx],
    }
)

print("\nFitur Paling Berpengaruh (Permutation Importance):")
# option_context lifts the row limit only for this print and then restores
# whatever value was set before (reset_option would force the pandas default).
with pd.option_context('display.max_rows', None):
    print(perm_df)

# --- Bar plot ---
# Figure height grows with the number of features (min 8 in, 0.4 in per feature)
num_features = len(perm_df)
plot_height = max(8, num_features * 0.4)

plt.figure(figsize=(12, plot_height))
sns.barplot(
    data=perm_df,
    x='importance_mean',
    y='feature',
    palette='rocket'
)
plt.title("Global Feature Importance (Permutation Importance - Fast)")
plt.xlabel("Penurunan Rata-rata Skor AUC (Semakin tinggi semakin penting)")
plt.ylabel("Fitur")
plt.tight_layout()
plt.show()

In [None]:
# =====================================================
# ✅ BLOK KODE UNTUK BEESWARM PLOT (Metode "Proxy" Cepat)
# =====================================================

import shap
import matplotlib.pyplot as plt

# --- These names must already exist from earlier cells ---
# stack_model: the fitted StackingClassifier
# X_res: the feature DataFrame
# -----------------------------------------------------------

print("\n=== SHAP BEESWARM PLOTS (Metode \"Proxy\" Cepat) ===")
shap.initjs()  # load the JavaScript needed for plotting

# Up to 2000 rows are explained; that is plenty for a representative beeswarm.
sample_data_beeswarm = shap.sample(X_res, 2000, random_state=42)

# One beeswarm per base model fitted inside the stack
for model_name, _ in stack_model.estimators:
    try:
        print(f"\n--- Menganalisis SHAP Beeswarm untuk: {model_name} ---")

        # 1. Fitted base model held by the stack
        model_in_stack = stack_model.named_estimators_[model_name]

        # 2. TreeExplainer without background data (the fast path)
        explainer_fast = shap.TreeExplainer(model_in_stack)

        # 3. SHAP values for the sampled rows
        shap_values_fast = explainer_fast(sample_data_beeswarm)

        # 4. Prefer the slice for class 1 (Attrition); when the values carry no
        #    usable class axis the slice raises IndexError and the full array
        #    is plotted instead.
        try:
            shap_values_for_plot = shap_values_fast.values[:, :, 1]
        except IndexError:
            shap_values_for_plot = shap_values_fast.values
        data_for_plot = sample_data_beeswarm

        print(f"Membuat plot untuk {model_name}...")
        shap.summary_plot(
            shap_values_for_plot,
            data_for_plot,
            plot_type="dot",  # "dot" is the beeswarm layout
            show=False,
            max_display=20    # top 20 features
        )
        plt.title(f"SHAP Beeswarm Plot untuk: {model_name} (Kelas Attrition=1)")
        plt.tight_layout()
        plt.show()

    except Exception as err:
        # Best effort: report the failure and move on to the next base model
        print(f"Error saat membuat SHAP Beeswarm untuk {model_name}: {err}")

print("\n✅ SHAP Beeswarm Plots (Proxy) selesai.")

In [None]:
# =====================================================
# ✅ BLOK KODE: ESTIMASI DAMPAK FINANSIAL (FINANCIAL RESULT)
# =====================================================
import matplotlib.pyplot as plt
import seaborn as sns

print(f"\n\n{'=' * 50}")
print("✅ MEMULAI ANALISIS DAMPAK FINANSIAL (FINANCIAL RESULT)")
print(f"{'=' * 50}")

# --- 1. Baseline assumptions, all expressed in multiples of monthly salary ---
#   * losing an employee (turnover) costs 6 months of salary
#   * a retention action (bonus / raise) costs 1 month of salary
ASSUMPTIONS = dict(
    cost_turnover_months=6.0,   # cost of a false negative (FN)
    cost_retention_months=1.0,  # cost of a false positive (FP)
)

# Benefit of a true positive = turnover cost avoided - retention cost paid
#                            = 6 months - 1 month = 5 months of salary
ASSUMPTIONS.update(
    benefit_retention_months=(
        ASSUMPTIONS['cost_turnover_months'] - ASSUMPTIONS['cost_retention_months']
    )
)

print(
    "Asumsi Finansial (berdasarkan Gaji Bulanan):",
    "  - Biaya Turnover (FN): {cost_turnover_months:.1f}x Gaji Bulanan".format(**ASSUMPTIONS),
    "  - Biaya Retensi (FP):  {cost_retention_months:.1f}x Gaji Bulanan".format(**ASSUMPTIONS),
    "  - Benefit Retensi (TP): {benefit_retention_months:.1f}x Gaji Bulanan".format(**ASSUMPTIONS),
    "  - Biaya/Benefit (TN): 0.0x Gaji Bulanan (Baseline)\n",
    sep="\n",
)


# --- 2. Prepare the actual data (not the SMOTE data) ---
# The financial analysis should use real employees, not synthetic rows, so the
# model trained on 'X_res' is used to score 'X'.
# Expected notebook-level names (per the original notes - verify they exist):
#   X     : encoded and scaled feature DataFrame of the original data
#   y     : original 'Attrition' target
#   train : original, untransformed DataFrame (source of 'MonthlyIncome')

print("Mendapatkan prediksi probabilitas pada data *asli*...")
try:
    # Probability of attrition (class 1) for every row of X
    y_probs_actual = stack_model.predict_proba(X)[:, 1]

    # One row per employee: true label, predicted probability, real income
    results_df = pd.DataFrame(dict(
        Actual_Attrition=y,
        Prob_Attrition=y_probs_actual,
        MonthlyIncome=train['MonthlyIncome'],
    ))

except Exception as err:
    print(f"Error saat membuat prediksi pada data 'X'. Pastikan 'X' dan 'train' tersedia. Error: {err}")
    # Nothing below can run without results_df, so stop the cell here
    raise

# --- 3. Financial calculator ---
def calculate_financial_impact(df, assumptions, threshold):
    """Estimate the net financial impact of intervening above `threshold`.

    Parameters
    ----------
    df : DataFrame with columns 'Actual_Attrition' (0/1), 'Prob_Attrition'
        and 'MonthlyIncome'. It is NOT modified (the previous version added a
        'Predicted_Attrition' column to whatever frame it was given).
    assumptions : mapping with 'benefit_retention_months',
        'cost_retention_months' and 'cost_turnover_months', each a multiple
        of monthly income.
    threshold : employees whose probability is strictly greater than this
        value are treated as "predicted to leave" and receive the intervention.

    Returns
    -------
    (net_impact, n_tp, n_fp, n_fn)
        net_impact = TP benefit - FP cost - FN cost; the counts are ints.
        True negatives are the zero-cost baseline and do not enter the total.
    """
    flagged = df['Prob_Attrition'] > threshold
    leaves = df['Actual_Attrition'] == 1
    stays = df['Actual_Attrition'] == 0
    income = df['MonthlyIncome']

    # Confusion-matrix cells as boolean masks
    tp_mask = leaves & flagged    # flagged, would have left -> retained (assumed)
    fp_mask = stays & flagged     # flagged, would have stayed anyway
    fn_mask = leaves & ~flagged   # not flagged, and left

    # TP: turnover cost avoided minus the retention cost paid
    total_benefit_tp = (income[tp_mask] * assumptions['benefit_retention_months']).sum()
    # FP: retention cost spent unnecessarily
    total_cost_fp = (income[fp_mask] * assumptions['cost_retention_months']).sum()
    # FN: full turnover cost
    total_cost_fn = (income[fn_mask] * assumptions['cost_turnover_months']).sum()

    net_impact = total_benefit_tp - total_cost_fp - total_cost_fn

    return net_impact, int(tp_mask.sum()), int(fp_mask.sum()), int(fn_mask.sum())

# --- 4. Impact at the default threshold (0.5) ---
# NOTE(review): per the notes in this notebook the model was trained on data
# that includes these rows, so the figures are likely optimistic - confirm on
# held-out data before quoting them.
default_threshold = 0.5
# A copy is passed so results_df stays untouched whatever the helper does to its argument
net_impact_default, tp, fp, fn = calculate_financial_impact(results_df.copy(), ASSUMPTIONS, default_threshold)

print(f"--- Hasil Finansial (Default Threshold = {default_threshold}) ---")
print(f"  Estimasi Total Net Benefit: ${net_impact_default:,.2f}")
print(f"  Intervensi dilakukan pada: {tp+fp} karyawan (TP + FP)")
print(f"  Karyawan yang keluar & gagal dicegah (FN): {fn} karyawan\n")

# --- 5. Threshold sweep for the maximum net benefit ---
print("Mencari threshold probabilitas optimal untuk Net Benefit maksimal...")
thresholds = np.linspace(0.05, 0.95, 100)  # 100 candidate thresholds
financial_results = [
    {
        'threshold': thresh,
        'net_benefit': calculate_financial_impact(results_df.copy(), ASSUMPTIONS, thresh)[0],
    }
    for thresh in thresholds
]

# One row per candidate threshold
financial_df = pd.DataFrame(financial_results)

# Row with the highest net benefit (the first one when several tie)
best_result = financial_df.loc[financial_df['net_benefit'].idxmax()]
optimal_threshold = best_result['threshold']
max_benefit = best_result['net_benefit']

print(f"\n--- Hasil Finansial (Optimal Threshold = {optimal_threshold:.2f}) ---")
print(f"  Estimasi Total Net Benefit Maksimal: ${max_benefit:,.2f}")

# TP / FP / FN counts at the optimal threshold
_, tp_opt, fp_opt, fn_opt = calculate_financial_impact(results_df.copy(), ASSUMPTIONS, optimal_threshold)
print(f"  Intervensi dilakukan pada: {tp_opt+fp_opt} karyawan (TP + FP)")
print(f"  Karyawan yang keluar & gagal dicegah (FN): {fn_opt} karyawan")
print("\nAnalisis ini menunjukkan bahwa dengan memilih threshold secara strategis,")
print("perusahaan dapat memaksimalkan penghematan biaya.")
print("Threshold yang lebih tinggi = lebih 'konservatif', mengurangi biaya FP.")


# --- 6. Net benefit vs. threshold ---
print("\nMembuat plot 'Net Benefit vs. Threshold'...")
plt.figure(figsize=(12, 7))
sns.lineplot(data=financial_df, x='threshold', y='net_benefit', linewidth=2.5)
plt.axvline(x=optimal_threshold, color='red', linestyle='--',
            label=f'Optimal Threshold ({optimal_threshold:.2f})\nMax Benefit: ${max_benefit:,.0f}')
# The label is built from default_threshold (it used to hard-code "0.50"), so
# the legend stays correct if the default is ever changed.
plt.axvline(x=default_threshold, color='gray', linestyle=':',
            label=f'Default Threshold ({default_threshold:.2f})\nBenefit: ${net_impact_default:,.0f}')
plt.title('Estimasi Net Benefit vs. Threshold Prediksi Attrition', fontsize=16)
plt.xlabel('Threshold Probabilitas untuk Intervensi', fontsize=12)
plt.ylabel('Total Estimasi Net Benefit ($)', fontsize=12)
plt.legend(loc='best')
plt.grid(True, linestyle=':', alpha=0.6)
plt.tight_layout()
plt.show()

print("\n🎯 Analisis Dampak Finansial selesai!")
# =====================================================



✅ MEMULAI ANALISIS DAMPAK FINANSIAL (FINANCIAL RESULT)
Asumsi Finansial (berdasarkan Gaji Bulanan):
  - Biaya Turnover (FN): 6.0x Gaji Bulanan
  - Biaya Retensi (FP):  1.0x Gaji Bulanan
  - Benefit Retensi (TP): 5.0x Gaji Bulanan
  - Biaya/Benefit (TN): 0.0x Gaji Bulanan (Baseline)

Mendapatkan prediksi probabilitas pada data *asli*...
Error saat membuat prediksi pada data 'X'. Pastikan 'X' dan 'train' tersedia. Error: name 'stack_model' is not defined


NameError: name 'stack_model' is not defined