# Data Exploration - Customer Churn Analysis

This notebook provides an exploratory data analysis of the customer churn dataset used in our MLOps pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply matplotlib's 'default' style sheet, then seaborn's
# 'husl' palette (kept in this order, as in the original cell).
plt.style.use('default')
sns.set_palette('husl')

# Read the raw churn CSV; the path is relative to the working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/customer_churn.csv',
)

# Report the frame's (rows, columns) shape, then leave the first rows as the
# cell's displayed output.
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Basic statistics
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Dataset Info:")
df.info()

# Relative frequency of each value in the churn column
# (normalize=True yields proportions instead of raw counts).
print("\nChurn Distribution:")
print(df['churn'].value_counts(normalize=True))

# Summary statistics as produced by DataFrame.describe()
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Visualizations: 2x2 overview grid (churn counts, one histogram, and two
# boxplots split by churn).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Churn distribution.
# - sort_index() orders the bars by class value so they line up with the
#   boxplots below (groupby also iterates in sorted key order by default).
# - The class values are converted to strings so matplotlib draws one
#   categorical tick per class instead of a continuous numeric axis.
churn_counts = df['churn'].value_counts().sort_index()
axes[0, 0].bar(churn_counts.index.astype(str), churn_counts.values)
axes[0, 0].set_title('Churn Distribution')
axes[0, 0].set_xlabel('Churn')
axes[0, 0].set_ylabel('Count')

# Age distribution (if age column exists, otherwise use tenure).
# Missing values are dropped before binning.
if 'age' in df.columns:
    axes[0, 1].hist(df['age'].dropna(), bins=20, alpha=0.7)
    axes[0, 1].set_title('Age Distribution')
    axes[0, 1].set_xlabel('Age')
else:
    axes[0, 1].hist(df['tenure'].dropna(), bins=20, alpha=0.7)
    axes[0, 1].set_title('Tenure Distribution')
    axes[0, 1].set_xlabel('Tenure (months)')
axes[0, 1].set_ylabel('Frequency')

# Monthly charges by churn.
# Labels and data are collected from the same groupby iteration so they can
# never get out of step. NaNs are dropped per group because matplotlib's
# boxplot does not skip them. Tick labels are applied with set_xticklabels()
# rather than boxplot(labels=...), which Matplotlib 3.9 deprecated in favour
# of `tick_labels`; this form works on both older and newer releases.
churn_groups = df.groupby('churn')['monthly_charges']
box_labels = [str(name) for name, _ in churn_groups]
box_data = [group.dropna().values for _, group in churn_groups]
axes[1, 0].boxplot(box_data)
axes[1, 0].set_xticklabels(box_labels)
axes[1, 0].set_title('Monthly Charges by Churn')
axes[1, 0].set_xlabel('Churn')
axes[1, 0].set_ylabel('Monthly Charges')

# Tenure by churn (same approach as the monthly-charges panel).
tenure_groups = df.groupby('churn')['tenure']
box_labels_tenure = [str(name) for name, _ in tenure_groups]
box_data_tenure = [group.dropna().values for _, group in tenure_groups]
axes[1, 1].boxplot(box_data_tenure)
axes[1, 1].set_xticklabels(box_labels_tenure)
axes[1, 1].set_title('Tenure by Churn')
axes[1, 1].set_xlabel('Churn')
axes[1, 1].set_ylabel('Tenure (months)')

plt.tight_layout()
plt.show()

In [None]:
# Additional visualizations using seaborn: a second 2x2 grid with class
# counts, contract breakdown, charges by churn, and a correlation heatmap.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Churn distribution with seaborn
sns.countplot(data=df, x='churn', ax=axes[0, 0])
axes[0, 0].set_title('Churn Distribution (Seaborn)')

# Contract type by churn (only when the dataset has a 'contract' column)
if 'contract' in df.columns:
    sns.countplot(data=df, x='contract', hue='churn', ax=axes[0, 1])
    axes[0, 1].set_title('Contract Type by Churn')
    # Rotate the category labels so longer contract names do not overlap
    axes[0, 1].tick_params(axis='x', rotation=45)
else:
    # Without the column there is nothing to draw; hide the panel instead of
    # leaving an empty, untitled axes frame in the figure.
    axes[0, 1].set_visible(False)

# Monthly charges distribution by churn
sns.boxplot(data=df, x='churn', y='monthly_charges', ax=axes[1, 0])
axes[1, 0].set_title('Monthly Charges by Churn (Seaborn)')

# Correlation heatmap for numerical features: restrict to numeric dtypes
# before calling corr(), and center the diverging colormap on zero.
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1, 1])
axes[1, 1].set_title('Feature Correlation Heatmap')

plt.tight_layout()
plt.show()

In [None]:
# Feature analysis: for every object-dtype column, print how often each
# category occurs and the mean of the churn column within each category.
print("Categorical Features Analysis:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # The target column is never broken down against itself
    if col == 'churn':
        continue

    # Category frequencies
    print(f"\n{col.upper()}:")
    category_counts = df[col].value_counts()
    print(category_counts)

    # Mean churn per category, listed from highest to lowest
    churn_rate = df.groupby(col)['churn'].mean()
    print(f"\nChurn rate by {col}:")
    print(churn_rate.sort_values(ascending=False))

In [None]:
# Summary insights: headline numbers for the whole dataset
print("=== DATA EXPLORATION SUMMARY ===")
print(f"Total customers: {len(df)}")
print(f"Churn rate: {df['churn'].mean():.2%}")
print(f"Average tenure: {df['tenure'].mean():.1f} months")
print(f"Average monthly charges: ${df['monthly_charges'].mean():.2f}")
print(f"Missing values: {df.isnull().sum().sum()}")

# Key insights: compare rows where churn == 1 against rows where churn == 0
print("\n=== KEY INSIGHTS ===")
is_churned = df['churn'] == 1
is_retained = df['churn'] == 0

# Mean tenure per group
high_churn_tenure = df.loc[is_churned, 'tenure'].mean()
low_churn_tenure = df.loc[is_retained, 'tenure'].mean()
print(f"Average tenure for churned customers: {high_churn_tenure:.1f} months")
print(f"Average tenure for retained customers: {low_churn_tenure:.1f} months")

# Mean monthly charges per group
high_churn_charges = df.loc[is_churned, 'monthly_charges'].mean()
low_churn_charges = df.loc[is_retained, 'monthly_charges'].mean()
print(f"Average monthly charges for churned customers: ${high_churn_charges:.2f}")
print(f"Average monthly charges for retained customers: ${low_churn_charges:.2f}")