# Income Classification & Segmentation — Walkthrough

Author: Sree Harsha Koyi  
This notebook walks line-by-line through the core steps the project implements: data loading, preprocessing, model training/evaluation, threshold trade-offs, and segmentation. It mirrors the code in `src/` and the artifacts under `reports/`.

## 0. Environment & Imports

If needed, uncomment the `pip install` cell to install dependencies in a fresh environment.

In [None]:
# !pip install -q numpy pandas scikit-learn matplotlib seaborn xgboost joblib

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score,
                            ConfusionMatrixDisplay, confusion_matrix)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Make project package importable: when the current working directory contains
# a `src/` folder, add that directory to sys.path so `from src...` resolves.
# If the notebook is started from another directory nothing is added.
ROOT = Path.cwd().resolve()
if (ROOT / 'src').exists():
    sys.path.append(str(ROOT))

from src.data_loader import load_census_data, split_features_target_weight, infer_feature_types
from src.preprocessing import build_preprocessor


## 1. Load Data

Load the CPS (Current Population Survey) dataset using the provided column headers. We keep the raw tokens as-is at load time and handle type coercion during preprocessing.

In [None]:
# Input files, given as relative paths (resolved against the working directory).
DATA_PATH = 'census-bureau.data'
COLS_PATH = 'census-bureau.columns'
# Single seed reused below for the train/test split, both models, and KMeans.
SEED = 17

raw = load_census_data(DATA_PATH, COLS_PATH)
raw.head()  # preview the first rows

## 2. Split Features/Target/Weights & Infer Types

We exclude `label` and `weight` from features; `weight` is used as `sample_weight` for training and metrics.

In [None]:
# Separate the features, the target, and the per-row weights, then partition
# the feature column names into numeric and categorical lists.
X_all, y_all, w_all = split_features_target_weight(raw)
num_cols, cat_cols = infer_feature_types(X_all)
# Sanity check: row count, size of each column group, and the first few names.
len(X_all), len(num_cols), len(cat_cols), num_cols[:5], cat_cols[:5]

## 2a. Exploratory Data Analysis (weighted)

We summarize target balance, missingness, and selected numeric/categorical distributions with survey weights.

In [None]:
# Target distribution (weighted)
def weighted_counts(y, w):
    """Return the total weight and weight share of each distinct value of ``y``.

    Parameters
    ----------
    y : array-like
        Class label per row.
    w : array-like
        Weight per row, aligned with ``y``.

    Returns
    -------
    pandas.DataFrame
        Columns ``y`` (label), ``weighted`` (sum of weights for that label)
        and ``share`` (``weighted`` divided by the grand total).
    """
    per_class = (
        pd.DataFrame({'y': y, 'w': w})
        .groupby('y')['w']
        .sum()
        .rename('weighted')
        .reset_index()
    )
    grand_total = per_class['weighted'].sum()
    per_class['share'] = per_class['weighted'] / grand_total
    return per_class

# Weighted class totals and shares over the full dataset; displayed as the cell output.
target_w = weighted_counts(y_all, w_all)
target_w

In [None]:
# Missingness (treat '?' and empty as missing)
def missingness_table(df, w):
    """Summarize missing values per column, by row count and by weight.

    A cell counts as missing when it is NaN/None, the literal token ``'?'``,
    or an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    w : pandas.Series
        Row weights; indexed with a boolean mask built from ``df``, so it must
        share ``df``'s index.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with ``column``, ``missing_count``,
        ``missing_pct``, ``missing_weight`` and ``missing_weight_pct``,
        sorted by ``missing_weight`` descending. The percentage columns are
        NaN when ``df`` has no rows or the weights do not sum to a positive
        number. A frame without columns yields an empty table that still
        carries the five result columns.
    """
    out_cols = ['column', 'missing_count', 'missing_pct',
                'missing_weight', 'missing_weight_pct']
    total = len(df)
    weight_sum = float(w.sum())
    # NaN (rather than 0) so that the share below becomes NaN instead of
    # dividing by zero when there is no positive weight mass.
    total_w = weight_sum if weight_sum > 0 else np.nan

    rows = []
    for col in df.columns:
        values = df[col]
        miss = (values == '?') | values.isna() | (values == '')
        mc = int(miss.sum())
        mw = float(w[miss].sum())
        rows.append({
            'column': col,
            'missing_count': mc,
            'missing_pct': mc / total if total > 0 else np.nan,
            'missing_weight': mw,
            'missing_weight_pct': mw / total_w,
        })

    # Explicit columns keep the schema stable even when `rows` is empty;
    # without them sorting an empty frame raises KeyError.
    res = pd.DataFrame(rows, columns=out_cols)
    return res.sort_values('missing_weight', ascending=False).reset_index(drop=True)

# Missingness of every feature column, heaviest (by weight) first; show the top 15.
miss_tbl = missingness_table(X_all, w_all)
miss_tbl.head(15)

In [None]:
# Numeric summaries (weighted)
def wmean(x, w):
    """Weighted mean of ``x``, ignoring missing or non-numeric entries.

    Parameters
    ----------
    x : pandas.Series
        Values; anything that cannot be parsed as a number is treated as
        missing.
    w : pandas.Series
        Weights, aligned to ``x`` by index.

    Returns
    -------
    float
        The weighted mean over rows where both value and weight are present,
        or NaN when no such row exists or the remaining weights sum to zero.
    """
    x = pd.to_numeric(x, errors='coerce')
    w = w.reindex_like(x)
    # Drop missing rows instead of filling them with 0: a zero-filled value
    # would keep its weight and drag the mean toward zero.
    valid = x.notna() & w.notna()
    xv = x[valid].to_numpy(dtype=float)
    wv = w[valid].to_numpy(dtype=float)
    if xv.size == 0 or wv.sum() == 0:
        return float('nan')
    return float(np.average(xv, weights=wv))

def wstd(x, w):
    """Weighted (population) standard deviation of ``x``, ignoring missing entries.

    Parameters
    ----------
    x : pandas.Series
        Values; anything that cannot be parsed as a number is treated as
        missing.
    w : pandas.Series
        Weights, aligned to ``x`` by index.

    Returns
    -------
    float
        Square root of the weighted mean squared deviation from the weighted
        mean, over rows where both value and weight are present. NaN when no
        such row exists or the remaining weights sum to zero.
    """
    x = pd.to_numeric(x, errors='coerce')
    w = w.reindex_like(x)
    # Drop missing rows instead of filling them with 0: zero-filled values
    # would shift the mean and inflate the spread.
    valid = x.notna() & w.notna()
    xv = x[valid].to_numpy(dtype=float)
    wv = w[valid].to_numpy(dtype=float)
    if xv.size == 0 or wv.sum() == 0:
        return float('nan')
    mu = np.average(xv, weights=wv)
    var = np.average((xv - mu) ** 2, weights=wv)
    return float(np.sqrt(var))

def wquant(x, w, q):
    """Weighted quantile of ``x`` at level ``q`` (no interpolation).

    Values are coerced to numeric; rows with a missing value or a missing
    weight are dropped. The result is the smallest observed value whose
    cumulative weight reaches ``q`` times the total weight. Returns NaN when
    no usable row remains.
    """
    values = pd.to_numeric(x, errors='coerce')
    keep = values.notna() & w.notna()
    data = values[keep].values
    weights = w[keep].values
    if len(data) == 0:
        return np.nan

    ranking = np.argsort(data)
    sorted_vals = data[ranking]
    cum_weight = np.cumsum(weights[ranking])
    target = q * cum_weight[-1]
    # searchsorted never returns a negative position, so only the upper end
    # needs capping to stay inside the array.
    pos = min(int(np.searchsorted(cum_weight, target, side='left')), len(sorted_vals) - 1)
    return float(sorted_vals[pos])

# One summary row per numeric column: weighted mean, weighted std, and the
# weighted 10th/50th/90th percentiles, all using the full-dataset weights.
rows = []
for col in num_cols:
    s = X_all[col]
    rows.append({
        'column': col,
        'w_mean': wmean(s, w_all),
        'w_std': wstd(s, w_all),
        'w_p10': wquant(s, w_all, 0.10),
        'w_p50': wquant(s, w_all, 0.50),
        'w_p90': wquant(s, w_all, 0.90),
    })
# Alphabetical by column name for a stable, easy-to-scan display.
pd.DataFrame(rows).sort_values('column').reset_index(drop=True)

In [None]:
# Categorical top categories (weighted) for a few key fields:
# for each listed field, the 10 categories carrying the most total weight.
key_cats = [
    'education', 'marital stat', 'major occupation code', 'major industry code', 'race', 'sex', 'citizenship'
]
rows = []
dfc = X_all.copy()  # copy so the helper weight column is never added to X_all
dfc['__w'] = w_all.values  # .values assigns by position, bypassing index alignment
for col in key_cats:
    if col in dfc.columns:  # fields absent from the data are skipped silently
        g = dfc.groupby(col)['__w'].sum().sort_values(ascending=False).head(10)
        for k, v in g.items():
            rows.append({'column': col, 'category': str(k), 'weighted_count': float(v)})
# Group by field, heaviest category first; only the first 30 rows are displayed.
pd.DataFrame(rows).sort_values(['column','weighted_count'], ascending=[True, False]).head(30)

## 3. Preprocessing Pipeline

Numeric: coerce→median impute→scale. Categorical: '?'→NA→most‑frequent impute→one‑hot (ignore unknown).

In [None]:
# Build the preprocessor from the inferred numeric and categorical column lists;
# the trailing expression displays it as the cell output.
pre = build_preprocessor(num_cols, cat_cols)
pre

## 4. Train/Test Split


In [None]:
# 80/20 split, stratified on the target. The weights are passed through the
# same call so they stay row-aligned with the features and labels.
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X_all, y_all, w_all, test_size=0.2, random_state=SEED, stratify=y_all
)
X_train.shape, X_test.shape

## 5. Baseline Models

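We fit two compact baselines, Logistic Regression and XGBoost (using the histogram-based `hist` tree method). Each is wrapped in a pipeline with the preprocessor, trained with the survey weights, and scored on the held-out test set with weighted metrics at a 0.5 threshold.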

In [None]:
def fit_eval(pipe, name):
    """Fit ``pipe`` on the training split and score it on the test split.

    Reads the module-level ``X_train``/``y_train``/``w_train`` and
    ``X_test``/``y_test``/``w_test``. Training weights are passed to the
    pipeline step named ``clf``; every metric is weighted with ``w_test``.

    Parameters
    ----------
    pipe : estimator
        Pipeline with a final step called ``clf`` that supports
        ``sample_weight`` and ``predict_proba``.
    name : str
        Label stored under the ``model`` key of the result.

    Returns
    -------
    tuple
        ``(metrics, proba)`` where ``metrics`` is a dict with ``model``,
        ``roc_auc``, ``accuracy``, ``precision``, ``recall`` and ``f1``
        (hard predictions use a 0.5 cut-off), and ``proba`` holds the
        positive-class probabilities for the test rows.
    """
    pipe.fit(X_train, y_train, clf__sample_weight=w_train)
    proba = pipe.predict_proba(X_test)[:, 1]
    hard_pred = (proba >= 0.5).astype(int)

    metrics = {
        'model': name,
        'roc_auc': float(roc_auc_score(y_test, proba, sample_weight=w_test)),
        'accuracy': float(accuracy_score(y_test, hard_pred, sample_weight=w_test)),
    }
    # These three share a signature, including the zero_division guard.
    label_scorers = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))
    for key, scorer in label_scorers:
        metrics[key] = float(scorer(y_test, hard_pred, zero_division=0, sample_weight=w_test))
    return metrics, proba

# Logistic-regression baseline. The pipeline receives its own copy of the
# preprocessor (clone) so that fitting another pipeline, or refitting `pre`
# elsewhere in the notebook, cannot alter this model's fitted transformers.
lr = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=SEED)
pipe_lr = Pipeline([('pre', clone(pre)), ('clf', lr)])
res_lr, proba_lr = fit_eval(pipe_lr, 'logistic_regression')
res_lr

In [None]:
# Gradient-boosted baseline using the histogram tree method.
xgb = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    objective='binary:logistic', eval_metric='auc', tree_method='hist',
    n_jobs=-1, random_state=SEED
)
# Use a private copy of the preprocessor. Pipelines fit their steps in place,
# so sharing the single `pre` instance would let a later refit of `pre`
# (e.g. on the full dataset for clustering) silently replace the transformers
# this classifier was trained with, corrupting the pipeline that gets saved.
pipe_xgb = Pipeline([('pre', clone(pre)), ('clf', xgb)])
res_xgb, proba_xgb = fit_eval(pipe_xgb, 'xgboost_hist')
res_xgb

## 6. Plots: ROC and Confusion Matrix (Test)


In [None]:
def plot_roc(proba, y_true, title='ROC Curve'):
    """Draw a weighted ROC curve with its AUC in the legend.

    NOTE: the weights always come from the module-level ``w_test``, so
    ``proba`` and ``y_true`` must refer to the test rows in the same order.

    Parameters
    ----------
    proba : array-like
        Predicted positive-class scores.
    y_true : array-like
        True binary labels.
    title : str
        Figure title.
    """
    fpr, tpr, _ = roc_curve(y_true, proba, sample_weight=w_test)
    auc = roc_auc_score(y_true, proba, sample_weight=w_test)

    _, ax = plt.subplots(figsize=(5, 4))
    ax.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    ax.plot([0, 1], [0, 1], '--', c='gray')  # chance diagonal for reference
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_title(title)
    ax.legend()
    plt.show()

def plot_confusion(pred, y_true, title='Confusion Matrix (weighted)'):
    """Draw the weighted confusion matrix of ``pred`` against ``y_true``.

    NOTE: the weights always come from the module-level ``w_test``, so both
    arguments must refer to the test rows in the same order. Cell values are
    weight sums, shown without decimals.
    """
    weighted_cm = confusion_matrix(y_true, pred, sample_weight=w_test)
    _, ax = plt.subplots(figsize=(4, 4))
    ConfusionMatrixDisplay(confusion_matrix=weighted_cm).plot(ax=ax, values_format='.0f')
    plt.title(title)
    plt.show()

# Use XGBoost results for plots: hard predictions at the same 0.5 cut-off
# that fit_eval uses, then the weighted ROC curve and confusion matrix.
pred_xgb = (proba_xgb >= 0.5).astype(int)
plot_roc(proba_xgb, y_test, title='ROC — XGBoost (test)')
plot_confusion(pred_xgb, y_test)

## 7. Threshold–Metrics Sweep

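Sweep the decision threshold and tabulate weighted precision, recall, F1, and accuracy at each value. Choose the operating point from business costs and goals (recall vs. precision).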

In [None]:
def threshold_table(proba, y_true, w=None, steps=51):
    """Tabulate classification metrics over an evenly spaced threshold grid.

    Parameters
    ----------
    proba : numpy.ndarray
        Predicted positive-class scores.
    y_true : array-like
        True binary labels.
    w : array-like, optional
        Sample weights for every metric; ``None`` means unweighted.
    steps : int
        Number of thresholds, evenly spaced from 0 to 1 inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with ``threshold``, ``precision``, ``recall``,
        ``f1`` and ``accuracy``. A row is predicted positive when its score
        is greater than or equal to the threshold.
    """
    def _metrics_at(cutoff):
        pred = (proba >= cutoff).astype(int)
        return {
            'threshold': float(cutoff),
            'precision': float(precision_score(y_true, pred, zero_division=0, sample_weight=w)),
            'recall': float(recall_score(y_true, pred, zero_division=0, sample_weight=w)),
            'f1': float(f1_score(y_true, pred, zero_division=0, sample_weight=w)),
            'accuracy': float(accuracy_score(y_true, pred, sample_weight=w)),
        }

    return pd.DataFrame([_metrics_at(t) for t in np.linspace(0, 1, steps)])

# 41 thresholds from 0 to 1 (step 0.025) on the XGBoost test scores, weighted;
# display the five thresholds with the highest F1.
tab = threshold_table(proba_xgb, y_test, w_test, steps=41)
tab.sort_values('f1', ascending=False).head()

## 8. Segmentation (KMeans)

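Apply the same preprocessing recipe to the full dataset, fit 6 KMeans clusters on the result, and compute the silhouette score as a quick quality check. Profiling of the resulting clusters is not shown in this cell.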

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Preprocess to numeric array. Fit a *copy* of the preprocessor: fitting `pre`
# itself on all rows would refit, in place, the same object that sits inside
# the classification pipelines, replacing the transformers they were trained
# with (and leaking test rows into them).
Z = clone(pre).fit_transform(X_all)
kmeans = KMeans(n_clusters=6, random_state=SEED, n_init=10)
cl = kmeans.fit_predict(Z)
# NOTE(review): the silhouette is computed over every row here; if this is too
# slow on the full dataset, silhouette_score accepts sample_size/random_state.
sil = silhouette_score(Z, cl)
sil

## 9. Save Artifacts (Optional)

Mirror the script outputs if you want to persist results from the notebook run.

In [None]:
from joblib import dump

# Create every output directory up front; writing into a folder that does not
# exist raises FileNotFoundError on a fresh checkout.
out_reports = Path('reports'); out_reports.mkdir(parents=True, exist_ok=True)
out_plots = out_reports / 'plots'; out_plots.mkdir(parents=True, exist_ok=True)
out_models = Path('models'); out_models.mkdir(parents=True, exist_ok=True)
out_segments = Path('outputs'); out_segments.mkdir(parents=True, exist_ok=True)

# Example: save model (the full fitted pipeline: preprocessor + classifier)
dump(pipe_xgb, out_models / 'classifier.joblib')

# Example: save threshold table
tab.to_csv(out_reports / 'threshold_metrics_from_notebook.csv', index=False)

# Example: save cluster assignments (one row per input row, in X_all order)
pd.DataFrame({'cluster': cl}).to_csv(out_segments / 'segments_from_notebook.csv', index=False)

---
### Notes
- The notebook mirrors the project scripts: `src/train_classifier.py` and `src/segment.py`.
- For production, prefer running the scripts for deterministic artifacts and versioning.