# Customer Churn Analysis
Detailed step-by-step machine learning pipeline using Logistic Regression, Random Forest, and XGBoost.

## 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

## 2. Load and Inspect Dataset

In [None]:
# Read the raw churn records into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Customer-Churn-Records.csv')
df.head(n=5)

## 3. Data Cleaning and Preprocessing

In [None]:
# Remove the row/customer identifier columns; the result is a new frame,
# so the raw `df` is left untouched.
df_cleaned = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Replace every categorical column with integer codes, keeping the fitted
# encoder for each column in `label_encoders`, keyed by column name.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, which
# gives nominal columns such as 'Geography' an arbitrary ordering that a
# linear model will treat as meaningful; consider one-hot encoding instead.
categorical_cols = ['Geography', 'Gender', 'Card Type']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

df_cleaned.head()

## 4. Train-Test Split and Scaling

In [None]:
# Separate the feature matrix from the 'Exited' target column.
X = df_cleaned.drop('Exited', axis=1)
y = df_cleaned['Exited']

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each 'Exited' class identical in the train and test partitions, so the
# reported metrics are not skewed by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only and reuse those statistics
# for the test partition, so no test-set information reaches the models.
# Both outputs are NumPy arrays (column names live on X.columns).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Train Models

In [None]:
# Candidate classifiers, keyed by the display name used for reporting.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: XGBoost deprecated the
    # option (it no longer label-encodes targets itself) and recent releases
    # only emit a "parameters are not used" warning for it. The seed is pinned
    # for parity with the Random Forest above.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# Fit every model on the scaled training data and collect, per model name:
#   "model"                 - the fitted estimator
#   "classification_report" - dict form of sklearn's classification report
#   "confusion_matrix"      - confusion matrix on the test partition
#   "roc_auc"               - ROC AUC computed from positive-class probabilities
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the second class
    # (label 1 for a 0/1 target), which is what roc_auc_score expects.
    y_score = model.predict_proba(X_test_scaled)[:, 1]

    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    results[name] = {
        "model": model,
        "classification_report": report,
        "confusion_matrix": cm,
        "roc_auc": auc,
    }

## 6. Model Evaluation and Visualization

In [None]:
# For every trained model, print its ROC AUC followed by the classification
# report rendered as a table (one row per class / aggregate entry).
for name in results:
    result = results[name]
    auc_line = "\n{} - ROC AUC: {:.4f}".format(name, result['roc_auc'])
    print(auc_line)
    print("Classification Report:")
    report_table = pd.DataFrame(result['classification_report']).T
    print(report_table)

In [None]:
# Draw one annotated confusion-matrix heatmap per model.
# Rows are the actual classes, columns the predicted classes.
for name, result in results.items():
    cm = result['confusion_matrix']
    heatmap_options = {
        'annot': True,   # write the count inside every cell
        'fmt': 'd',      # counts are integers
        'cmap': 'Blues',
        'xticklabels': ['No Churn', 'Churn'],
        'yticklabels': ['No Churn', 'Churn'],
    }
    sns.heatmap(cm, **heatmap_options)
    plt.title("{} - Confusion Matrix".format(name))
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
# Horizontal bar chart of feature importances, most important first, for the
# two models that expose `feature_importances_`.
for name in ['Random Forest', 'XGBoost']:
    model = results[name]['model']
    importances = model.feature_importances_
    # argsort is ascending; reversing it ranks features from highest to lowest.
    indices = importances.argsort()[::-1]
    # The models were fitted on arrays built from X, so positions in
    # `importances` line up with X.columns.
    features = X.columns[indices]
    ranked_importances = importances[indices]
    sns.barplot(x=ranked_importances, y=features)
    plt.title("{} - Feature Importances".format(name))
    plt.show()