In [None]:
# Import necessary libraries
from pathlib import Path

import pandas as pd

# Load the cleaned dataset saved from the previous step.
# The location is assembled with pathlib so the separators are correct on every
# operating system; a literal '..\..\data' string is treated as one file name
# outside Windows. The path is relative to the notebook's working directory.
cleaned_data_path = Path('..') / '..' / 'data' / 'processed' / 'credit_card_attrition_cleaned.csv'
df = pd.read_csv(cleaned_data_path)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
import pandas as pd

# Build every engineered column in a single `assign` call. `assign` returns a
# new frame, so `df` is rebound to a copy that carries the extra columns.
df = df.assign(
    # Ratios -- 1e-6 is added to each denominator so a zero cannot divide by zero
    SpendIncomeRatio=df['TotalSpend'] / (df['Income'] + 1e-6),
    TransactionsTenureRatio=df['TotalTransactions'] / (df['Tenure'] + 1e-6),
    CreditUsage=df['TotalSpend'] / (df['CreditLimit'] + 1e-6),
    # Differences
    CreditLimitMinusSpend=df['CreditLimit'] - df['TotalSpend'],
    AgeMinusTenure=df['Age'] - df['Tenure'],
    # Products
    AgeTimesTransactions=df['Age'] * df['TotalTransactions'],
    IncomeTimesTransactions=df['Income'] * df['TotalTransactions'],
)

print(df.head())

In [None]:
# Columns to encode by their observed attrition rate
cat_cols = ['CardType', 'MaritalStatus', 'EducationLevel', 'Country']
churn_rate_cols = [name + '_ChurnRate' for name in cat_cols]

# For each category level, the mean of AttritionFlag over all rows of `df`
# becomes the value of the new '<column>_ChurnRate' feature.
# NOTE(review): the means are taken over every row's target, so these columns
# carry label information from any rows later held out for evaluation.
for col, rate_col in zip(cat_cols, churn_rate_cols):
    churn_rate = df.groupby(col)['AttritionFlag'].mean()
    df[rate_col] = df[col].map(churn_rate)

print(df[churn_rate_cols].head())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Columns that must not feed the clustering:
#   * the target itself,
#   * 'Cluster' -- it is numeric, so re-running this cell would otherwise
#     cluster on the labels written by the previous run,
#   * the '*_ChurnRate' encodings, which are averages of the target; leaving
#     them in makes the per-cluster attrition summary below circular.
excluded_from_clustering = ['AttritionFlag', 'Cluster'] + [
    name for name in df.columns if name.endswith('_ChurnRate')
]

# errors='ignore' because 'Cluster' does not exist on the first run.
num_for_cluster = (
    df.select_dtypes(include=['number'])
      .drop(columns=excluded_from_clustering, errors='ignore')
)

# Standardise so no single feature dominates the Euclidean distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(num_for_cluster)

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Attrition rate observed within each cluster
print(df.groupby('Cluster')['AttritionFlag'].mean())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the (default, Pearson) correlation matrix on numeric columns only.
# Calling df.corr() directly on a frame that still holds text columns raises
# under pandas >= 2.0, where non-numeric columns are no longer dropped silently.
correlation_matrix = df.select_dtypes(include=['number']).corr()

# Set up the figure
plt.figure(figsize=(21, 16))

# Create a heatmap for the correlation matrix (colour scale centred on zero)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True, center=0, linewidths=0.5)

# Add title
plt.title('Correlation Heatmap', fontsize=16)

# Display the plot
plt.show()

In [None]:
# Rank-based (Spearman) correlations on the numeric columns. The figure is
# titled "Spearman", so it must not reuse the default-method matrix computed
# for the previous heatmap.
spearman_matrix = df.select_dtypes(include=['number']).corr(method='spearman')

plt.figure(figsize=(20,16))
sns.heatmap(spearman_matrix, cmap='coolwarm', center=0, linewidths=0.5)
plt.title('Spearman Correlation Matrix')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

target = 'AttritionFlag'
df0 = df.copy()  # work on a copy of the engineered frame

# Labels as integers; features are every remaining column.
# NOTE(review): only the target is removed here -- no identifier column is
# dropped, so any ID field present in df0 stays in X_full.
y_full = df0[target].astype(int)
X_full = df0.drop(columns=[target])

# 80/20 hold-out, stratified on the label so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
    stratify=y_full,
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# =====================
# 1. Features & Target
# =====================
target_col = 'AttritionFlag'
churn_suffix = '_ChurnRate'
X = df.drop(columns=[target_col])
y = df[target_col]

# =====================
# 2. Train/test split
# =====================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Explicit copies so the column assignments below act on independent frames.
X_train, X_test = X_train.copy(), X_test.copy()

# =====================
# 3. Remove target leakage
# =====================
# The '*_ChurnRate' columns already in `df` are per-category means of the
# target computed over ALL rows, test rows included. Re-derive them from the
# training rows only and apply that mapping to both splits.
for encoded_col in [name for name in X.columns if name.endswith(churn_suffix)]:
    source_col = encoded_col[:-len(churn_suffix)]
    if source_col not in X_train.columns:
        # Cannot rebuild the encoding without its source column; drop it
        # rather than train on a leaky feature.
        X_train = X_train.drop(columns=[encoded_col])
        X_test = X_test.drop(columns=[encoded_col])
        continue
    train_rate = y_train.groupby(X_train[source_col]).mean()
    X_train[encoded_col] = X_train[source_col].map(train_rate)
    # Levels unseen in training map to NaN, which XGBoost treats as missing.
    X_test[encoded_col] = X_test[source_col].map(train_rate)

# NOTE(review): 'Cluster' comes from a KMeans model fitted on every row in an
# earlier cell; for a strictly leakage-free evaluation, refit it on X_train.

# XGBoost rejects object-dtype columns, so keep numeric/bool features only.
# The raw categorical columns remain represented by their churn-rate encodings.
feature_cols = X_train.select_dtypes(include=['number', 'bool']).columns
X_train = X_train[feature_cols]
X_test = X_test[feature_cols]

# =====================
# 4. Train Model
# =====================
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

# =====================
# 5. Evaluate
# =====================
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # probability of class 1

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# =====================
# 6. Feature Importance
# =====================
importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
print("\nTop 10 features:\n", importances.sort_values(ascending=False).head(10))

In [None]:
# List every column now present in df, including the engineered features added above.
df.columns