# Customer Churn Analysis Notebook

This notebook walks through exploratory data analysis (EDA), preprocessing, a baseline model, and its evaluation.

## 1. Load data

Load the dataset and preview its first rows.

In [None]:
import pandas as pd
from pathlib import Path
# Root of the working directory the notebook is run from.
repo = Path('.')

# Read the churn dataset into a DataFrame.
dataset_csv = 'data/Bank_Churn_Dataset.csv'
df = pd.read_csv(dataset_csv)

# Last expression of the cell: renders a preview of the first rows.
df.head()

## 2. Exploratory Data Analysis

Display the key pre-rendered figures: churn split, age distribution, correlation matrix, ROC curve, and confusion matrix.

In [None]:
from IPython.display import Image, display
# Pre-rendered figures, shown in the order they are listed here.
figure_paths = (
    'images/churn_pie_polished.png',
    'images/age_hist_polished.png',
    'images/corr_matrix_polished.png',
    'images/roc_curve_polished.png',
    'images/confusion_matrix_polished.png',
)
for figure_path in figure_paths:
    display(Image(figure_path))

## 3. Preprocessing & Feature Engineering

Drop the identifier columns, encode the categorical features, split the data into stratified train/test sets, and standardize the numeric columns.

In [None]:
# Preprocessing sample
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Work on a copy so the frame loaded earlier stays untouched.
df2 = df.copy()

# Remove columns that are presumably row/customer identifiers and carry no
# predictive signal; errors='ignore' skips any that are absent.
df2 = df2.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# One-hot encode Geography, dropping the first level to avoid a redundant column.
if 'Geography' in df2.columns:
    df2 = pd.get_dummies(df2, columns=['Geography'], drop_first=True)

# Encode Gender as Female -> 0, Male -> 1.
# NOTE(review): any missing or unrecognised value is also filled with 0, i.e.
# it becomes indistinguishable from 'Female' -- confirm this is intended.
if 'Gender' in df2.columns:
    df2['Gender'] = df2['Gender'].map({'Female': 0, 'Male': 1}).fillna(0)

# Separate the target ('Exited') from the features.
y = df2['Exited']
X = df2.drop(columns=['Exited'])

# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# train_test_split returns frames derived from X; take explicit copies so the
# in-place column assignments below do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numeric columns only. The scaler is fitted on the training split
# and merely applied to the test split, so no test statistics leak into training.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
print('Prepared train/test sets')

## 4. Baseline Model (Logistic Regression)

Train a baseline logistic regression model and evaluate it with accuracy, AUC, a confusion matrix, and a classification report.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
# Fit the baseline classifier on the prepared training split.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Hard label predictions for the held-out split.
pred = model.predict(X_test)
# Probability of the second class column (class 1 for a 0/1 target); used for AUC.
proba = model.predict_proba(X_test)[:, 1]

# Compute every metric first, then report them in a fixed order.
accuracy = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
conf_mat = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)

print('Accuracy:', accuracy)
print('AUC:', auc)
print('Confusion matrix:\n', conf_mat)
print('\nClassification report:\n', report)

## 5. Save Model & Next Steps

The model is saved to `models/logistic_regression_baseline.pkl`. Next steps include hyperparameter tuning, feature selection, and trying more advanced models.