In [1]:
import pandas as pd
import numpy as np

# Read the cleaned churn dataset, report its dimensions, and preview the first rows
data_path = '../data/churn_cleaned.csv'
df = pd.read_csv(data_path)
print("Loaded:", df.shape)
df.head()


Loaded: (10000, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
# 1. Age buckets
def age_bucket(age):
    """Map an age in years to a coarse life-stage label.

    Parameters
    ----------
    age : int or float
        Age in years. May be missing (None / NaN).

    Returns
    -------
    str or None
        'young' for age < 30, 'middle' for 30 <= age < 45,
        'senior' for 45 <= age < 60, 'elderly' for age >= 60.
        None when the age is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through all branches and be labelled 'elderly'.
    if pd.isna(age):
        return None
    if age < 30:
        return 'young'
    elif age < 45:
        return 'middle'
    elif age < 60:
        return 'senior'
    else:
        return 'elderly'

# Label every customer with an age bucket, one element at a time
df['AgeBucket'] = df['Age'].map(age_bucket)

# 2. Tenure buckets
def tenure_bucket(tenure):
    """Map a tenure value to a coarse loyalty label.

    Parameters
    ----------
    tenure : int or float
        Customer tenure (same unit as the 'Tenure' column). May be
        missing (None / NaN).

    Returns
    -------
    str or None
        'new' for tenure <= 2, 'mid' for 2 < tenure <= 5,
        'loyal' for tenure > 5. None when the tenure is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # tenure would fall through all branches and be labelled 'loyal'.
    if pd.isna(tenure):
        return None
    if tenure <= 2:
        return 'new'
    elif tenure <= 5:
        return 'mid'
    else:
        return 'loyal'

df['TenureBucket'] = df['Tenure'].map(tenure_bucket)

# Shorthand for the two columns that several features below are derived from
balance = df['Balance']
n_products = df['NumOfProducts']

# 3. Balance flag: 1 when the balance is strictly positive, else 0
df['HasBalance'] = balance.gt(0).astype(int)

# 4. High balance flag: 1 when the balance is above the 75th percentile
# NOTE(review): the percentile is computed over the full dataset, i.e. before
# the train/test split performed later in this notebook - confirm this is acceptable.
balance_75 = balance.quantile(0.75)
is_high_balance = balance.gt(balance_75)
df['HighBalance'] = is_high_balance.astype(int)

# 5. Products flag: 1 when the customer holds more than one product
df['MultipleProducts'] = n_products.gt(1).astype(int)

# 6. High-risk products: 1 for 3 or more products (from EDA, these churn a lot)
df['HighRiskProducts'] = n_products.ge(3).astype(int)

# 7. Balance per product
df['BalancePerProduct'] = balance.div(n_products)

# 8. Salary to balance ratio; the denominator is shifted by 1 so a zero balance does not divide by zero
df['SalaryBalanceRatio'] = df['EstimatedSalary'].div(balance.add(1))

# 9. Is Germany (high churn country from EDA)
df['IsGermany'] = df['Geography'].eq('Germany').astype(int)

# 10. Inactive member whose balance is above the 75th percentile (potential risk)
df['InactiveHighBalance'] = (df['IsActiveMember'].eq(0) & is_high_balance).astype(int)

# Columns that were already present before any feature engineering
original_columns = {
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
    'EstimatedSalary', 'Exited',
}

print("New features added. Shape:", df.shape)
print("\nNew columns:")
print([name for name in df.columns if name not in original_columns])


New features added. Shape: (10000, 21)

New columns:
['AgeBucket', 'TenureBucket', 'HasBalance', 'HighBalance', 'MultipleProducts', 'HighRiskProducts', 'BalancePerProduct', 'SalaryBalanceRatio', 'IsGermany', 'InactiveHighBalance']


In [3]:
# One-hot encode all four categorical columns: the raw Geography and Gender
# plus the engineered AgeBucket and TenureBucket. drop_first=True omits the
# first level of each column.
# NOTE(review): the resulting Geography_Germany column carries the same
# information as the existing IsGermany flag - consider keeping only one.
categorical_cols = ['Geography', 'Gender', 'AgeBucket', 'TenureBucket']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("After encoding:", df_encoded.shape)
print("\nAll columns:")
print(list(df_encoded.columns))


After encoding: (10000, 25)

All columns:
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'HasBalance', 'HighBalance', 'MultipleProducts', 'HighRiskProducts', 'BalancePerProduct', 'SalaryBalanceRatio', 'IsGermany', 'InactiveHighBalance', 'Geography_Germany', 'Geography_Spain', 'Gender_Male', 'AgeBucket_middle', 'AgeBucket_senior', 'AgeBucket_young', 'TenureBucket_mid', 'TenureBucket_new']


In [4]:
# Pull the target column out of the encoded frame; everything else is a feature
target_col = 'Exited'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("\nFeature names:")
print(list(X.columns))


X shape: (10000, 24)
y shape: (10000,)

Feature names:
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'HasBalance', 'HighBalance', 'MultipleProducts', 'HighRiskProducts', 'BalancePerProduct', 'SalaryBalanceRatio', 'IsGermany', 'InactiveHighBalance', 'Geography_Germany', 'Geography_Spain', 'Gender_Male', 'AgeBucket_middle', 'AgeBucket_senior', 'AgeBucket_young', 'TenureBucket_mid', 'TenureBucket_new']


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Important: keeps same churn ratio in train and test
)

# Round the churn rates to two decimals: printing the raw float product
# leaks representation noise such as 20.349999999999998.
print(f"Train set: {X_train.shape} Churn rate: {y_train.mean() * 100:.2f} %")
print(f"Test set: {X_test.shape} Churn rate: {y_test.mean() * 100:.2f} %")


Train set: (8000, 24) Churn rate: 20.375 %
Test set: (2000, 24) Churn rate: 20.349999999999998 %


In [6]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every column not listed here is left as it is
numeric_cols = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'EstimatedSalary', 'BalancePerProduct', 'SalaryBalanceRatio',
]

# Learn the scaling statistics from the training rows only,
# then apply that same fitted transform to both splits
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Scaling done.")
print("\nSample scaled values (first row):")
print(X_train_scaled[numeric_cols].iloc[0])


Scaling done.

Sample scaled values (first row):
CreditScore           1.058568
Age                   1.715086
Tenure                0.684723
Balance              -1.226059
NumOfProducts        -0.910256
EstimatedSalary       1.042084
BalancePerProduct    -1.106569
SalaryBalanceRatio    2.106556
Name: 2151, dtype: float64


In [7]:
import pickle

# Bundle the scaled splits, the targets, the feature order and the fitted
# scaler into one object and pickle it for the modeling notebook
processed_data = dict(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    feature_names=list(X.columns),
    scaler=scaler,
)

with open('../data/processed_data.pkl', 'wb') as out_file:
    pickle.dump(processed_data, out_file)

print("Processed data saved!")
print("\nReady for modeling. Features:", X.shape[1])


Processed data saved!

Ready for modeling. Features: 24
