
# DLM — Business Capability Classifier (LightGBM & XGBoost)

This notebook trains classification models to predict **`business_capability`** from the file-metadata and content fields in `final_data.xlsx`.

Expected columns:

| business_capability | extension | extension_family | file_exists | original_file_path | original_path_depth | original_path_keywords | file_size_bytes | content_text | content_word_count |
|---|---|---|---|---|---|---|---|---|---|

**Outline**
1. Setup (install/import)
2. Load & quick sanity checks
3. Train/test split
4. Preprocessing pipeline (categorical / numeric / text)
5. Baselines (majority class & logistic regression)
6. LightGBM & XGBoost training with cross-validation
7. Evaluation (accuracy, F1, ROC-AUC-OVR, confusion matrix)
8. Feature importances & top signals
9. Save artifacts (models, encoders, metrics)


In [None]:

# If running locally and you don't have these installed, uncomment:
# %pip install -q lightgbm xgboost imbalanced-learn
# %pip install -q pandas openpyxl scikit-learn matplotlib numpy joblib

import os
import math
import json
from pathlib import Path
import numpy as np
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
import joblib

# Optional model libraries. Any import-time failure is tolerated so the rest of
# the notebook can still run; the matching training cell is skipped when the
# class name is bound to None. The underlying error is printed because the
# import can fail for reasons other than the package being absent.
try:
    from lightgbm import LGBMClassifier
except Exception as e:
    LGBMClassifier = None
    print("⚠️ LightGBM not available. Install with `pip install lightgbm`.")
    print(f"   Reason: {type(e).__name__}: {e}")

try:
    from xgboost import XGBClassifier
except Exception as e:
    XGBClassifier = None
    print("⚠️ XGBoost not available. Install with `pip install xgboost`.")
    print(f"   Reason: {type(e).__name__}: {e}")

# Directory that receives the saved pipelines and metric reports.
OUTPUT_DIR = Path("artifacts_dlm")
# parents=True keeps this working if OUTPUT_DIR is later changed to a nested path.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Single seed reused for the split, the CV folds and the model constructors.
RANDOM_STATE = 42


In [None]:

# Load data
DATA_PATH = Path("final_data.xlsx")
# Explicit exceptions instead of `assert`: asserts are stripped when Python runs
# with -O, which would let a missing file or column slip through to a less
# helpful error further down.
if not DATA_PATH.exists():
    raise FileNotFoundError(
        f"Could not find {DATA_PATH.resolve()}. Place final_data.xlsx next to this notebook."
    )
df = pd.read_excel(DATA_PATH)

# Columns the rest of the notebook reads; extra columns in the sheet are allowed.
expected_cols = [
    "business_capability","extension","extension_family","file_exists",
    "original_file_path","original_path_depth","original_path_keywords",
    "file_size_bytes","content_text","content_word_count"
]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

print(df.shape)
df.head(3)


In [None]:

# Basic sanity checks
print("Target value counts:")
target_counts = df['business_capability'].value_counts(dropna=False)
print(target_counts.head(20))

# Rows without a label cannot be used for training or evaluation.
df = df.dropna(subset=['business_capability']).copy()

# Cast types
# file_exists: nullable boolean first, then nullable integer.
df['file_exists'] = df['file_exists'].astype('boolean').astype('Int64')

# Numeric columns: values that cannot be parsed become NaN.
count_columns = ['original_path_depth', 'file_size_bytes', 'content_word_count']
df[count_columns] = df[count_columns].apply(pd.to_numeric, errors='coerce')

# Text columns: missing values become empty strings.
free_text_columns = ['content_text', 'original_path_keywords', 'original_file_path']
df[free_text_columns] = df[free_text_columns].fillna('')


In [None]:

# Train / test split
target_column = 'business_capability'
y = df[target_column].astype(str)  # labels are handled as strings
X = df.drop(columns=[target_column])

# 80/20 split, stratified on the label and seeded for reproducibility.
split_parts = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape


In [None]:

# Column groups
numeric_features = ['original_path_depth', 'file_size_bytes', 'content_word_count']
categorical_features = ['extension', 'extension_family', 'file_exists']

# TF-IDF settings per text column (keyed by column name).
text_features = {
    'content_text': dict(ngram_range=(1, 2), min_df=2, max_features=100000),
    'original_path_keywords': dict(ngram_range=(1, 2), min_df=2, max_features=50000),
    'original_file_path': dict(ngram_range=(1, 2), min_df=2, max_features=50000),
}

# Helpers to select a single column for TfidfVectorizer
def select_text_column(X, column_name):
    """Return column `column_name` of `X` as a NumPy array of strings.

    Defined as a named function (not a lambda) so that pipelines containing it
    can be pickled, e.g. by `joblib.dump`. Pickle stores the function by
    reference, so it must be defined under the same name wherever a saved
    pipeline is loaded.
    """
    return X[column_name].astype(str).values


def col_selector(column_name):
    """Build a transformer that extracts one text column from a DataFrame.

    Parameters
    ----------
    column_name : str
        Name of the column to extract.

    Returns
    -------
    FunctionTransformer
        Transformer whose output is the selected column cast to `str`, as a
        1-D array suitable for `TfidfVectorizer`.
    """
    return FunctionTransformer(
        select_text_column, kw_args={"column_name": column_name}, validate=False
    )

# One (name, pipeline, columns) entry per text column. The whole frame is handed
# to each sub-pipeline; its first step (`sel__<col>`) then picks the single text
# column that the TF-IDF step vectorises.
text_transformers = [
    (
        f"tfidf__{col}",
        Pipeline(steps=[
            (f"sel__{col}", col_selector(col)),
            ("tfidf", TfidfVectorizer(ngram_range=params['ngram_range'],
                                      min_df=params['min_df'],
                                      max_features=params['max_features'])),
        ]),
        list(X.columns),
    )
    for col, params in text_features.items()
]

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
        *text_transformers,
    ],
    remainder="drop",
    n_jobs=None,
)

preprocess


In [None]:

# Baseline models
# A majority-class predictor and a logistic regression, both behind the shared
# preprocessing, give reference scores for the boosted models below.
# `multi_class` is intentionally not passed: "auto" was its default, and the
# parameter is deprecated in recent scikit-learn releases, where passing it
# only raises a FutureWarning.
dummy = Pipeline([("prep", preprocess), ("clf", DummyClassifier(strategy="most_frequent"))])
logreg = Pipeline([("prep", preprocess),
                   ("clf", LogisticRegression(max_iter=200, n_jobs=None))])

for name, model in [("Dummy", dummy), ("LogReg", logreg)]:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1w = f1_score(y_test, preds, average="weighted")
    print(f"{name} — Acc: {acc:.4f}  |  F1-weighted: {f1w:.4f}")


In [None]:

def evaluate_model_cv(model, X, y, cv_splits=5):
    """Cross-validate `model` and summarise its test scores.

    Uses shuffled, stratified folds seeded with RANDOM_STATE and scores
    accuracy, weighted F1 and macro F1.

    Returns a dict mapping each `test_<metric>` key to a `(mean, std)` tuple
    computed over the folds.
    """
    folds = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)
    metric_names = ("accuracy", "f1_weighted", "f1_macro")
    cv_scores = cross_validate(
        model,
        X,
        y,
        cv=folds,
        scoring={metric: metric for metric in metric_names},
        n_jobs=None,
        return_estimator=False,
    )

    summary = {}
    for key, fold_values in cv_scores.items():
        # cross_validate also reports timings; only the test scores are kept.
        if not key.startswith("test_"):
            continue
        summary[key] = (np.mean(fold_values), np.std(fold_values))
    return summary

def pretty_print_cv(summary, label):
    """Print a CV summary: a header line, then one `metric: mean ± std` line per entry.

    `summary` maps metric keys to `(mean, std)` pairs; the `test_` prefix is
    removed from each key before printing.
    """
    lines = [f"\n{label} (CV):"]
    lines.extend(
        f"  {metric.replace('test_','')}: {mean:.4f} ± {std:.4f}"
        for metric, (mean, std) in summary.items()
    )
    print("\n".join(lines))


In [None]:

# LightGBM: cross-validate on the training split, refit on all of it, then
# score on the held-out test split. `lgbm_results` stays None when LightGBM
# could not be imported.
lgbm_results = None
if LGBMClassifier is not None:
    lgbm_pipe = Pipeline([
        ("prep", preprocess),
        ("clf", LGBMClassifier(
            n_estimators=400,
            learning_rate=0.05,
            num_leaves=63,
            subsample=0.8,
            # LightGBM only applies `subsample` (row bagging) when the bagging
            # frequency is positive; with the default of 0 it is ignored.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
            objective="multiclass"
        ))
    ])
    lgbm_cv = evaluate_model_cv(lgbm_pipe, X_train, y_train, cv_splits=5)
    pretty_print_cv(lgbm_cv, "LightGBM")
    lgbm_pipe.fit(X_train, y_train)
    lgbm_preds = lgbm_pipe.predict(X_test)
    lgbm_proba = None
    try:
        lgbm_proba = lgbm_pipe.predict_proba(X_test)
    except Exception as err:
        # Best effort: probabilities are only needed for ROC-AUC below.
        print(f"LightGBM predict_proba failed; skipping ROC-AUC: {err}")
    lgbm_results = {
        "accuracy": accuracy_score(y_test, lgbm_preds),
        "f1_weighted": f1_score(y_test, lgbm_preds, average="weighted"),
        "f1_macro": f1_score(y_test, lgbm_preds, average="macro"),
        "report": classification_report(y_test, lgbm_preds, output_dict=True)
    }
    if lgbm_proba is not None and len(np.unique(y)) > 2:
        try:
            # Pass the fitted class order explicitly so probability columns and
            # labels line up even if a class is missing from y_test.
            lgbm_results["roc_auc_ovr"] = roc_auc_score(
                y_test, lgbm_proba, multi_class="ovr", labels=lgbm_pipe.classes_
            )
        except Exception as err:
            # Keep the other metrics; just report why ROC-AUC is absent.
            print(f"LightGBM ROC-AUC (OVR) could not be computed: {err}")
    print("\nLightGBM — Test metrics")
    print(f"Acc: {lgbm_results['accuracy']:.4f} | F1-w: {lgbm_results['f1_weighted']:.4f} | F1-macro: {lgbm_results['f1_macro']:.4f}")
else:
    print("Skipping LightGBM — not installed.")


In [None]:

# XGBoost: cross-validate on the training split, refit on all of it, then score
# on the held-out test split. `xgb_results` stays None when XGBoost could not
# be imported.
xgb_results = None
if XGBClassifier is not None:
    from sklearn.preprocessing import LabelEncoder

    # XGBoost's scikit-learn wrapper expects class labels encoded as integers
    # 0..n_classes-1 (recent releases reject string labels), while `y` holds
    # strings. Train on encoded labels and decode predictions so the cells
    # below keep working with the original label names.
    xgb_label_encoder = LabelEncoder().fit(y_train)
    y_train_enc = xgb_label_encoder.transform(y_train)

    xgb_pipe = Pipeline([
        ("prep", preprocess),
        ("clf", XGBClassifier(
            n_estimators=600,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softprob",
            eval_metric="mlogloss",
            tree_method="hist",
            random_state=RANDOM_STATE
        ))
    ])
    xgb_cv = evaluate_model_cv(xgb_pipe, X_train, y_train_enc, cv_splits=5)
    pretty_print_cv(xgb_cv, "XGBoost")
    xgb_pipe.fit(X_train, y_train_enc)
    # Decode integer predictions back to the original string labels.
    xgb_preds = xgb_label_encoder.inverse_transform(
        np.asarray(xgb_pipe.predict(X_test)).astype(int)
    )
    # The fitted pipeline predicts integer codes, so the encoder is saved next
    # to the other artifacts; it is needed to map predictions back to labels.
    joblib.dump(xgb_label_encoder, OUTPUT_DIR / "dlm_xgb_label_encoder.joblib")
    xgb_proba = None
    try:
        xgb_proba = xgb_pipe.predict_proba(X_test)
    except Exception as err:
        # Best effort: probabilities are only needed for ROC-AUC below.
        print(f"XGBoost predict_proba failed; skipping ROC-AUC: {err}")
    xgb_results = {
        "accuracy": accuracy_score(y_test, xgb_preds),
        "f1_weighted": f1_score(y_test, xgb_preds, average="weighted"),
        "f1_macro": f1_score(y_test, xgb_preds, average="macro"),
        "report": classification_report(y_test, xgb_preds, output_dict=True)
    }
    if xgb_proba is not None and len(np.unique(y)) > 2:
        try:
            # Probability columns follow the encoder's class order, so pass it
            # explicitly to keep columns and labels aligned.
            xgb_results["roc_auc_ovr"] = roc_auc_score(
                y_test, xgb_proba, multi_class="ovr", labels=xgb_label_encoder.classes_
            )
        except Exception as err:
            # Keep the other metrics; just report why ROC-AUC is absent.
            print(f"XGBoost ROC-AUC (OVR) could not be computed: {err}")
    print("\nXGBoost — Test metrics")
    print(f"Acc: {xgb_results['accuracy']:.4f} | F1-w: {xgb_results['f1_weighted']:.4f} | F1-macro: {xgb_results['f1_macro']:.4f}")
else:
    print("Skipping XGBoost — not installed.")


In [None]:

def plot_cm(y_true, y_pred, title):
    """Draw a confusion matrix for `y_pred` against `y_true` and show it.

    The axes cover the sorted union of labels found in either input, so classes
    that were predicted but never observed (or the reverse) still get a row and
    a column.
    """
    observed = np.concatenate([y_true, y_pred])
    labels = np.unique(observed)
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 8))
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(
        ax=ax, cmap="Blues", colorbar=False, values_format='d'
    )
    ax.set_title(title)
    plt.tight_layout()
    plt.show()

# Plot a confusion matrix for each model whose predictions exist in this session.
for preds_name, cm_title in (
    ("lgbm_preds", "LightGBM — Confusion Matrix"),
    ("xgb_preds", "XGBoost — Confusion Matrix"),
):
    if preds_name in globals():
        plot_cm(y_test, globals()[preds_name], cm_title)


In [None]:

def _transformer_feature_names(name, trans, cols):
    """Best-effort output feature names for one fitted ColumnTransformer entry."""
    if isinstance(trans, str):
        # String specs: 'drop' emits no columns, 'passthrough' emits its inputs.
        if trans == "drop":
            return []
        if isinstance(cols, (list, tuple)):
            return [str(c) for c in cols]
        return [str(cols)]

    has_input_names = isinstance(cols, (list, tuple))
    getter = getattr(trans, "get_feature_names_out", None)
    if getter is not None:
        # Try with the input column names first, then without arguments.
        attempts = [lambda: getter(cols), getter] if has_input_names else [getter]
        for attempt in attempts:
            try:
                return list(attempt())
            except Exception:
                continue

    # A pipeline exposes get_feature_names_out but raises when an intermediate
    # step cannot name its outputs. The last step determines the output
    # columns, so ask it directly and prefix with the transformer name.
    steps = getattr(trans, "steps", None)
    if steps:
        last_getter = getattr(steps[-1][1], "get_feature_names_out", None)
        if last_getter is not None:
            try:
                return [f"{name}__{f}" for f in last_getter()]
            except Exception:
                pass
        return [name]

    if getter is not None and not has_input_names:
        return [str(cols)]
    return [name]


def get_feature_names_from_ct(ct: "ColumnTransformer", X_sample: pd.DataFrame):
    """Collect output feature names from a fitted ColumnTransformer.

    Names are gathered per entry of `ct.transformers_`, in order, so they line
    up with the columns of the transformed matrix. The 'remainder' entry is
    skipped.

    Parameters
    ----------
    ct : ColumnTransformer
        Fitted transformer (must expose `transformers_`).
    X_sample : pd.DataFrame
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of str
        Feature names. When an entry cannot report its names, a single
        placeholder is returned for it, so the list may be shorter than the
        number of output columns.
    """
    # Works with sklearn >= 1.0
    output_features = []
    for name, trans, cols in ct.transformers_:
        if name == 'remainder':
            continue
        output_features.extend(_transformer_feature_names(name, trans, cols))
    return output_features

def top_importances(pipeline, model_key="clf", top_k=25):
    """Return the `top_k` most important features of a fitted pipeline.

    Reads the estimator from step `model_key` and the ColumnTransformer from
    step 'prep'. The result is a list of `(feature_name, importance)` tuples
    sorted by decreasing importance; indices beyond the recovered name list are
    labelled `f<index>`. An empty list is returned when the estimator has no
    `feature_importances_` or when anything fails (the error is printed).
    """
    ranked = []
    try:
        steps = pipeline.named_steps
        estimator = steps[model_key]
        feature_names = get_feature_names_from_ct(steps['prep'], X_train)
        if hasattr(estimator, "feature_importances_"):
            scores = estimator.feature_importances_
            n_names = len(feature_names)
            # argsort is ascending, so reverse before taking the first top_k.
            for idx in np.argsort(scores)[::-1][:top_k]:
                label = feature_names[idx] if idx < n_names else f"f{idx}"
                ranked.append((label, float(scores[idx])))
    except Exception as e:
        print("Feature importance extraction failed:", e)
        return []
    return ranked

# Show the top-25 importance table for each model trained in this session.
importance_columns = ["feature", "importance"]

if 'lgbm_pipe' in globals() and LGBMClassifier is not None:
    top_lgbm = top_importances(lgbm_pipe, top_k=25)
    if len(top_lgbm) > 0:
        df_l = pd.DataFrame(top_lgbm, columns=importance_columns)
        display(df_l)

if 'xgb_pipe' in globals() and XGBClassifier is not None:
    top_xgb = top_importances(xgb_pipe, top_k=25)
    if len(top_xgb) > 0:
        df_x = pd.DataFrame(top_xgb, columns=importance_columns)
        display(df_x)


In [None]:

# Collect test metrics per trained model and persist the fitted pipelines.
metrics_summary = {}

trained_models = (
    ("LightGBM", lgbm_results, "lgbm_pipe", "dlm_lgbm_pipeline.joblib"),
    ("XGBoost", xgb_results, "xgb_pipe", "dlm_xgb_pipeline.joblib"),
)
for model_name, results, pipe_name, artifact_name in trained_models:
    if not results:
        continue  # model was skipped, nothing to record or save
    # Scalars are cast to plain floats; the nested classification report stays a dict.
    metrics_summary[model_name] = {
        key: (value if isinstance(value, dict) else float(value))
        for key, value in results.items()
    }
    # Looked up by name because the pipeline variable only exists when the model ran.
    joblib.dump(globals()[pipe_name], OUTPUT_DIR / artifact_name)

# Flat CSV of headline metrics
headline_metrics = ["accuracy", "f1_weighted", "f1_macro", "roc_auc_ovr"]
rows = [
    {"model": model_name, **{metric: res.get(metric, np.nan) for metric in headline_metrics}}
    for model_name, res in metrics_summary.items()
]
pd.DataFrame(rows).to_csv(OUTPUT_DIR / "dlm_model_report.csv", index=False)

# Full metrics, including the per-class report, as JSON.
with (OUTPUT_DIR / "dlm_model_report.json").open("w") as report_file:
    json.dump(metrics_summary, report_file, indent=2)

print("Saved artifacts to:", OUTPUT_DIR.resolve())



### Handling class imbalance (optional)
If your target classes are highly imbalanced, consider:
- Using `class_weight='balanced'` in `LogisticRegression`
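- For LightGBM, pass `class_weight='balanced'` to `LGBMClassifier`; for XGBoost, pass per-row weights (e.g., from `sklearn.utils.class_weight.compute_sample_weight('balanced', y_train)`) to `fit` via `clf__sample_weight=...` when fitting the pipeline.
- Try resampling (e.g., `imblearn`'s `RandomUnderSampler`/`SMOTE`) **inside** the pipeline, using `imblearn.pipeline.Pipeline` (scikit-learn's own `Pipeline` does not accept samplers), so that resampling is applied only to the training folds.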


In [None]:

from collections import Counter

# Per-class label counts for each split.
for header, split_labels in (
    ("Train class distribution:", y_train),
    ("Test  class distribution:", y_test),
):
    print(header, Counter(split_labels))
