# Customer Churn Prediction - Exploratory Data Analysis

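This notebook explores the Telco Customer Churn dataset: it checks basic data quality, looks at the overall churn rate, and compares tenure, monthly charges, and total charges between customers who churned and those who stayed.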

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's white-grid style and a
# 10x6 default figure size (individual cells may pass their own figsize).
sns.set_style('whitegrid')
plt.rc('figure', figsize=(10, 6))

## Load Data

In [None]:
# Read the churn CSV into `df` (the path is resolved against the kernel's
# working directory), then show its dimensions and a preview of the first rows.
df = pd.read_csv(filepath_or_buffer='../data/churn.csv')

print(f"Dataset shape: {df.shape}")
df.head(5)

## Basic Information

In [None]:
# Data info: print a concise summary of `df` (column names, non-null counts,
# dtypes and memory usage) to spot columns loaded with an unexpected dtype.
df.info()

In [None]:
# Check for missing values.
# isnull() only detects real NaN/None entries, so report those first.
print("Missing values:")
print(df.isnull().sum())

# Blank or whitespace-only strings are not null as far as pandas is concerned,
# yet they are effectively missing data. Count them per column so they are
# not overlooked; non-string values (numbers, NaN) are ignored by the check.
blank_counts = pd.Series(
    {
        col: int(df[col].map(lambda v: isinstance(v, str) and not v.strip()).sum())
        for col in df.columns
    },
    dtype='int64',
)
print("\nBlank (empty or whitespace-only) strings per column:")
print(blank_counts[blank_counts > 0] if blank_counts.any() else "none")

## Churn Distribution

In [None]:
# Churn distribution: absolute counts per label plus the overall churn rate.
churn_counts = df['Churn'].value_counts()
print("Churn distribution:")
print(churn_counts)

# Share of all rows labelled 'Yes'. Computing it from a boolean mask gives the
# same value as churn_counts['Yes'] / len(df), but yields 0 instead of raising
# KeyError when no row is labelled 'Yes' (e.g. on a filtered subset).
churn_rate = (df['Churn'] == 'Yes').mean() * 100
print(f"\nChurn rate: {churn_rate:.2f}%")

# Plot the class counts on an explicit Axes so the figure is self-contained.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Churn', ax=ax)
ax.set_title('Churn Distribution')
ax.set_ylabel('Count')
plt.show()

## Numerical Features Statistics

In [None]:
# Convert TotalCharges to numeric.
# errors='coerce' replaces every entry that cannot be parsed as a number with
# NaN. Count NaNs before and after so that this otherwise silent loss is
# reported. The assignment is idempotent: re-running the cell on the already
# numeric column reports 0 newly coerced values and the same NaN total.
n_nan_before = int(df['TotalCharges'].isna().sum())
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
n_nan_after = int(df['TotalCharges'].isna().sum())
print(
    f"TotalCharges: {n_nan_after - n_nan_before} value(s) could not be parsed "
    f"and were set to NaN ({n_nan_after} NaN in total)"
)

# Numerical columns
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Summary statistics (describe() skips NaN values when computing them)
df[numerical_cols].describe()

## Tenure vs Churn

In [None]:
# Box plot of tenure for each churn label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Churn', y='tenure', data=df, ax=ax)
ax.set(title='Tenure Distribution by Churn Status', ylabel='Tenure (months)')
plt.show()

# Numeric summary of the same comparison, one row per churn label.
tenure_by_churn = df.groupby('Churn')['tenure']
print("\nTenure statistics by churn:")
print(tenure_by_churn.describe())

## Monthly Charges vs Churn

In [None]:
# Box plot of monthly charges for each churn label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Churn', y='MonthlyCharges', data=df, ax=ax)
ax.set(
    title='Monthly Charges Distribution by Churn Status',
    ylabel='Monthly Charges ($)',
)
plt.show()

# Numeric summary of the same comparison, one row per churn label.
monthly_by_churn = df.groupby('Churn')['MonthlyCharges']
print("\nMonthly charges statistics by churn:")
print(monthly_by_churn.describe())

## Total Charges vs Churn

In [None]:
# Box plot of total charges for each churn label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Churn', y='TotalCharges', data=df, ax=ax)
ax.set(
    title='Total Charges Distribution by Churn Status',
    ylabel='Total Charges ($)',
)
plt.show()

# Numeric summary of the same comparison, one row per churn label.
total_by_churn = df.groupby('Churn')['TotalCharges']
print("\nTotal charges statistics by churn:")
print(total_by_churn.describe())

## Key Insights

1. **Churn Rate**: About 26-27% of customers churn
2. **Tenure**: Customers who churn tend to have shorter tenure
3. **Monthly Charges**: Churned customers tend to have higher monthly charges
4. **Total Charges**: Churned customers tend to have lower total charges, which is expected because total charges accumulate over a customer's tenure and churned customers have shorter tenure