# Data Cleaning – Credit Card Churn Dataset
This notebook performs initial data cleaning on the raw credit card churn dataset.  
The goal is to prepare the dataset for EDA and modeling by:
- Removing duplicates
- Handling missing values
- Fixing data types
- Addressing outliers
- Managing high-cardinality categorical features  
The cleaned dataset will be saved in `data/processed/` for use in later stages.

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

# Reproducibility
np.random.seed(42)

In [None]:
# Relative locations of the raw data directory and the input CSV
DATA_DIR = Path("../../data/raw")
FILE_PATH = DATA_DIR.joinpath("credit_card_attrition_dataset_mark.csv")

In [None]:
# Load the raw CSV at FILE_PATH into a DataFrame using pandas' default parsing options
df = pd.read_csv(FILE_PATH)

## 1. Looking at the Dataset

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

## 2. Checking for Duplicates

In [None]:
# Count duplicates
df.duplicated().sum()

In [None]:
# Remove duplicates: drop rows that exactly repeat an earlier row across all columns.
# The first occurrence is kept and the remaining rows keep their original index labels
# (the index is not renumbered).
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 3. Checking for Missing Data

In [None]:
# Print the missing-value count of every column without row truncation.
# option_context limits the display override to this block, so the global
# pandas display settings are restored automatically when it exits.
with pd.option_context('display.max_rows', None):
    print(df.isna().sum())

In [None]:
# Restore the default row limit only. 'display.max_columns' is deliberately
# left alone: the setup cell sets it to None for the whole notebook, and
# resetting it here would make later df.head()/df.describe() outputs
# truncate columns again.
pd.reset_option('display.max_rows')

In [None]:
df[["Income", "CreditLimit", "TotalSpend"]].isnull().sum()

*The columns with missing values are `Income`, `CreditLimit`, and `TotalSpend`, which have 5k missing values.*

In [None]:
cols_with_missing = ["Income", "CreditLimit", "TotalSpend"]

df[cols_with_missing].skew()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Output folder for the distribution plots (created if it does not exist yet)
figures_dir = Path("../../reports/figures")
figures_dir.mkdir(parents=True, exist_ok=True)

cols_with_missing = ["Income", "CreditLimit", "TotalSpend"]

# One histogram with a KDE overlay per column that contains missing values
for col in cols_with_missing:
    save_path = figures_dir.joinpath(f"{col}_distribution.png")

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col} (with Missing Values)")

    # Persist the figure to disk, then display it
    fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.show()

    print(f"Saved: {save_path}")

*Since `Income` and `TotalSpend` were highly right-skewed, I will use median imputation to avoid distortion from outliers. `CreditLimit` was nearly symmetric, so I will use mean imputation to preserve its distribution.*

In [None]:
# Replacement value per column: median for the skewed features
# (Income, TotalSpend), mean for the symmetric one (CreditLimit).
# All statistics are computed before any column is filled.
fill_values = {
    "Income": df["Income"].median(),
    "TotalSpend": df["TotalSpend"].median(),
    "CreditLimit": df["CreditLimit"].mean(),
}

# Fill the gaps column by column
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

In [None]:
df[["Income", "CreditLimit", "TotalSpend"]].isnull().sum()

In [None]:
df.describe()

## 4. Checking for Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Output folder for the outlier plots (created if it does not exist yet)
figures_dir = Path("../../reports/figures/outliers")
figures_dir.mkdir(parents=True, exist_ok=True)

# Side-by-side boxplot and histogram for each column, before any capping
for col in ["Income", "CreditLimit", "TotalSpend"]:
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    box_ax, hist_ax = axes

    sns.boxplot(x=df[col], ax=box_ax)
    box_ax.set_title(f"{col} Before Capping")

    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f"{col} Distribution Before Capping")

    fig.tight_layout()
    fig.savefig(figures_dir.joinpath(f"{col}_before_capping.png"), dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
import numpy as np
import pandas as pd

def cap_outliers_iqr(df, cols):
    """
    Cap outliers in the specified numeric columns using the 1.5 * IQR rule.

    For each column, values below Q1 - 1.5 * IQR are raised to that lower
    bound and values above Q3 + 1.5 * IQR are lowered to that upper bound.
    Missing values are left as they are. The bounds used for each column
    are printed.

    Parameters:
    df (DataFrame): Input dataframe; it is not modified
    cols (list): List of numeric columns to process

    Returns:
    DataFrame: Copy of the dataframe with the listed columns capped
    """
    # Work on a copy so the caller's dataframe is never changed as a side effect
    capped = df.copy()

    for col in cols:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # clip() caps both tails in one step and leaves NaN untouched
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)

        print(f"{col}: capped values outside [{lower_bound:.2f}, {upper_bound:.2f}]")

    return capped

# Collect the float64/int64 columns as capping candidates
numeric_cols = list(df.select_dtypes(include=["float64", "int64"]).columns)
numeric_cols.remove("AttritionFlag")  # the target must not be capped

# NOTE(review): every remaining numeric column is capped, which would include
# any numeric ID or low-cardinality flag columns present in the data - confirm
# that this is intended.
df = cap_outliers_iqr(df, numeric_cols)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# NOTE: exporting the after-capping figures is currently disabled; this cell
# only displays them.

# Side-by-side boxplot and histogram for each column, after capping
for col in ["Income", "CreditLimit", "TotalSpend"]:
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    box_ax, hist_ax = axes

    sns.boxplot(x=df[col], ax=box_ax)
    box_ax.set_title(f"{col} After Capping")

    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f"{col} Distribution After Capping")

    fig.tight_layout()
    plt.show()


*I detected outliers using the IQR method and capped them instead of removing them to preserve dataset size while reducing the influence of extreme values. This is particularly important for financial datasets where high values can be genuine but shouldn’t overly bias the model.*