
# Quick Data Analysis – Churn Dataset

This notebook prints the **head of the data**, counts **rows/columns**, shows **dtypes & missing values**, and reports **how many customers churned vs. did not churn** in the **original data** (no resampling).

**How to use:**
1. Update `DATA_PATH` below to point to your CSV (default assumes the processed file).
2. Run the cells top-to-bottom.


In [None]:

# === Configuration ===
# Location of the CSV to analyse; change it when the file lives somewhere else.
DATA_PATH = "data/processed/telecom_churn_processed.csv"  # e.g., "data/raw/Customer Churn.csv"

# Names tried when looking for the target column (edit or extend as needed).
TARGET_CANDIDATES = [
    "churn",
    "Churn",
    "Exited",
]


In [None]:

import pandas as pd
import numpy as np

# Show up to 100 rows and 100 columns before pandas truncates printed output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


In [None]:

# === Load data ===
# Announce the source file, read it into `df`, then report its dimensions.
print(f"Reading CSV from: {DATA_PATH}")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Loaded! Shape =", df.shape)
# Last expression of the cell: the first five rows (the default for head()).
df.head(n=5)


In [None]:

# === Basic dataset info ===
# Dimensions of the loaded frame.
n_rows = len(df)
n_cols = len(df.columns)
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}\n")

# Each section below prints a heading followed by its value on the next line.
print("Column names:", list(df.columns), sep="\n")

print("\nData types:", df.dtypes, sep="\n")

# isnull() is an alias of isna(); summing the boolean mask counts the
# missing cells in every column.
print("\nMissing values per column:", df.isnull().sum(), sep="\n")


In [None]:

# === Churn counts (original data) ===
# Locates the target column, normalizes it to numeric 0/1 where that is
# possible, and prints how many rows fall into each class. Rows whose target
# is missing are left out of the counts.

# Use the first candidate name that is actually a column of the data.
target_col = next((c for c in TARGET_CANDIDATES if c in df.columns), None)

if target_col is None:
    raise ValueError(f"Could not find target column among {TARGET_CANDIDATES}. "
                     f"Please set TARGET_CANDIDATES or rename your target to 'churn'.")

target = df[target_col]
n_missing = int(target.isna().sum())

if pd.api.types.is_bool_dtype(target):
    # Boolean targets are not text, and a True/False index does not reliably
    # answer the 0/1 label lookups used further down, so cast to integers.
    # The nullable integer dtype is only needed when values are missing.
    target = target.astype("Int64" if n_missing else "int64")
elif not pd.api.types.is_numeric_dtype(target):
    # Text-like target (object, string or categorical): first try the common
    # yes/no/true/false labels, then plain numeric text such as "0"/"1".
    labels = target.astype(str).str.strip().str.lower()
    candidate = labels.map({"yes": 1, "no": 0, "true": 1, "false": 0})
    if not candidate.notna().mean() > 0.5:
        candidate = pd.to_numeric(labels, errors="coerce")

    # Adopt a conversion only when it recognizes most rows. Otherwise keep
    # the raw labels instead of coercing every value to NaN, which would
    # silently produce empty counts.
    if candidate.notna().mean() > 0.5:
        n_unrecognized = int(candidate.isna().sum()) - n_missing
        if n_unrecognized > 0:
            print(f"Note: {n_unrecognized} value(s) in '{target_col}' were not "
                  f"recognized and are treated as missing.")
        target = candidate

# The normalized target is written back to df, as before; when no conversion
# was adopted this re-assigns the unchanged column.
df[target_col] = target

# Drop rows where target is missing for counting
df_non_null = df.dropna(subset=[target_col])

# Compute counts (missing targets were already removed above)
vc = df_non_null[target_col].value_counts()
try:
    vc = vc.sort_index()
except TypeError:
    # Raw labels of mixed types cannot be ordered; keep frequency order.
    pass

print(f"\nTarget column detected: {target_col}")
print("\nChurn value counts (original data):")
print(vc)

# If binary numeric 0/1, print user-friendly names
if set(vc.index).issubset({0, 1}):
    n_not_churn = int(vc.get(0, 0))
    n_churn = int(vc.get(1, 0))
    print(f"\nNot Churn (0): {n_not_churn}")
    print(f"Churn     (1): {n_churn}")
    print(f"Total (non-null target): {n_not_churn + n_churn}")



---

**Tip:** If your target column is named differently, add it to `TARGET_CANDIDATES` at the top or rename it to `churn` in your CSV.
