In [None]:
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# -----------------------
# 1. Load Dataset
# -----------------------
# Location of the raw CSV and the name of its label column. Both are
# hard-coded; the trailing comments list the alternative dataset's values.
file_path = "C:\\Users\\ZAK-TECH\\Desktop\\KAIM week5\\data\\raw\\creditcard.csv"  # or "Fraud_Data.csv"
target_col = "Class"           # or "class"

df = pd.read_csv(file_path)

# Separate the label (y) from the predictor columns (X).
y = df[target_col]
X = df.drop(columns=target_col)

# -----------------------
# 2. Train-Test Split
# -----------------------
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so repeated runs produce the same partitions.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------
# 3. Train Random Forest (Best Model)
# -----------------------
# Hyperparameters are gathered in one mapping so they can be inspected or
# logged alongside the fitted model.
rf_params = {
    "n_estimators": 200,         # number of trees in the forest
    "max_depth": 10,             # cap on the depth of each tree
    "class_weight": "balanced",  # reweight classes to offset label imbalance
    "random_state": 42,          # fixed seed for reproducible forests
    "n_jobs": -1,                # use every available CPU core
}

# fit() returns the estimator itself, so `rf` is the trained classifier.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# -----------------------
# 4. Feature Importance (built-in)
# -----------------------
# Pair every training column with the forest's impurity-based importance and
# rank the pairs from most to least important.
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by="importance", ascending=False
)

top_10 = feature_importance_df.head(10)
print("Top 10 features (built-in importance):")
print(top_10)

# Visualize top 10
# barh draws its first entry at the bottom, so the rows are reversed to put
# the most important feature at the top of the chart.
top_10_bottom_up = top_10.iloc[::-1]
plt.figure(figsize=(10, 6))
plt.barh(top_10_bottom_up["feature"], top_10_bottom_up["importance"])
plt.xlabel("Importance")
plt.title("Top 10 Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()

# -----------------------
# 5. SHAP Analysis
# -----------------------
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class, while newer releases return a single array shaped
# (n_samples, n_features, n_classes). Pull out the class-1 (fraud) slice in
# either case so the plots below do not depend on the installed SHAP version.
if isinstance(shap_values, list):
    fraud_shap_values = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    fraud_shap_values = shap_values[:, :, 1]
else:
    fraud_shap_values = shap_values

# expected_value is normally one base value per class; fall back to the raw
# value when the explainer exposes a single scalar instead.
try:
    fraud_base_value = explainer.expected_value[1]
except (TypeError, IndexError):
    fraud_base_value = explainer.expected_value

# -----------------------
# 5a. SHAP Summary Plot (global)
# -----------------------
shap.summary_plot(fraud_shap_values, X_test, plot_type="bar", max_display=10)  # class 1 = fraud

# -----------------------
# 5b. Confusion Matrix to identify TP, FP, FN
# -----------------------
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


def _first_position(mask):
    """Return the first row position where *mask* is True, or None if none is."""
    positions = mask.nonzero()[0]
    return int(positions[0]) if len(positions) else None


# Identify indices
# The SHAP arrays are plain NumPy arrays ordered like the rows of X_test, so
# they must be addressed by row POSITION. X_test keeps the index labels of the
# original DataFrame after the shuffled split, and using those labels as array
# offsets selects the wrong row or runs past the end of the array.
y_true = y_test.to_numpy()
tp_idx = _first_position((y_true == 1) & (y_pred == 1))
fp_idx = _first_position((y_true == 0) & (y_pred == 1))
fn_idx = _first_position((y_true == 1) & (y_pred == 0))

# -----------------------
# 5c. SHAP Force Plots for individual predictions
# -----------------------
force_plot_cases = (
    ("True Positive", tp_idx),
    ("False Positive", fp_idx),
    ("False Negative", fn_idx),
)
for case_name, row_pos in force_plot_cases:
    if row_pos is None:
        # A well-performing model may produce no example of a given outcome;
        # skip that plot instead of failing on an empty selection.
        print(f"No {case_name} found in the test set; skipping SHAP Force Plot.")
        continue
    print(f"Generating SHAP Force Plot for {case_name}...")
    shap.force_plot(
        fraud_base_value,
        fraud_shap_values[row_pos, :],
        X_test.iloc[row_pos, :],
        matplotlib=True,
    )

# -----------------------
# 6. Top 5 Drivers of Fraud Predictions
# -----------------------
# NOTE(review): this list comes from the built-in importance ranking only;
# nothing here cross-checks it against SHAP, so the "SHAP confirmed" wording
# in the heading should be verified against the SHAP summary plot.
top_5_features = feature_importance_df["feature"].head(5).tolist()

print("Top 5 drivers of fraud predictions (built-in + SHAP confirmed):")
for rank, feature_name in enumerate(top_5_features, start=1):
    print(f"{rank}. {feature_name}")

# -----------------------
# 7. Business Recommendations (placeholders)
# -----------------------
# Static placeholder text: the recommendations are not derived from the model
# output and still contain unfilled values such as "X hours".
recommendations = (
    "1. Transactions occurring within X hours of signup should undergo additional verification (related to feature 'Time').",
    "2. Transactions with unusually high amounts or spikes in 'Amount' should trigger manual review.",
    "3. Customers with prior suspicious behavior (e.g., high 'V...' features) should receive alerts or additional checks.",
)

print("\nActionable Business Recommendations:")
for recommendation in recommendations:
    print(recommendation)


  from .autonotebook import tqdm as notebook_tqdm
