# Import Dependencies

In [21]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

import data_preprocessing

In [22]:
# Read the raw deals export (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="./data/skygeni_sales_data.csv")
# Preview the first five rows as the cell output.
data.head(5)

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
0,D00001,2023-11-24,2023-12-15,rep_22,SaaS,North America,Enterprise,Referral,Qualified,4253,21,Won
1,D00002,2023-01-17,2023-01-27,rep_7,SaaS,India,Core,Referral,Closed,3905,10,Won
2,D00003,2023-10-29,2023-12-10,rep_5,HealthTech,APAC,Core,Inbound,Proposal,10615,42,Lost
3,D00004,2023-07-14,2023-08-02,rep_18,FinTech,India,Core,Partner,Negotiation,4817,19,Won
4,D00005,2024-02-29,2024-05-26,rep_2,HealthTech,APAC,Core,Outbound,Qualified,45203,87,Lost


In [23]:
# Run the three project preprocessing steps in order; each takes the frame
# returned by the previous one.
# NOTE(review): `data` is rebound here, so re-running this cell feeds already
# processed data back through the helpers -- confirm they tolerate that, or
# re-run the load cell first.
data = (
    data
    .pipe(data_preprocessing.update_date_dtype)
    .pipe(data_preprocessing.categorical_encoding)
    .pipe(data_preprocessing.feature_engineering)
)
data.head(5)

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome,created_month,amount_bucket
0,D00001,2023-11-24,2023-12-15,1,1,1,1,1,1,4253,21,1,11,1
1,D00002,2023-01-17,2023-01-27,2,1,2,2,1,2,3905,10,1,1,1
2,D00003,2023-10-29,2023-12-10,3,2,3,2,2,3,10615,42,0,10,2
3,D00004,2023-07-14,2023-08-02,4,3,2,2,3,4,4817,19,1,7,1
4,D00005,2024-02-29,2024-05-26,5,2,3,2,4,1,45203,87,0,2,3


In [24]:
# Columns fed to the classifier, in the order the model will see them.
# NOTE(review): `deal_amount` and the engineered `amount_bucket` are present in
# the preprocessed frame but are not listed here -- confirm that is intentional.
feature_columns = [
    'sales_rep_id',
    'industry',
    'region',
    'product_type',
    'lead_source',
    'deal_stage',
    'sales_cycle_days',
    'created_month',
]

# Name of the label column passed to the training routine.
target_col = 'outcome'

In [25]:
def train_cv_and_final_xgb(
    df: pd.DataFrame,
    feature_columns,
    target_col="outcome",
    n_splits=5,
    random_state=42,
    model_params=None,
    use_early_stopping=False,
    verbose=False,
):
    """Cross-validate an XGBoost classifier, then fit a final model on all rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding every column in ``feature_columns`` plus ``target_col``.
        It is not modified.
    feature_columns : iterable of str
        Names of the model input columns (any iterable; converted to a list).
    target_col : str
        Name of the label column; its values are cast to ``int``.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed used for both the fold shuffling and the model.
    model_params : dict or None
        Overrides merged on top of the default ``XGBClassifier`` parameters.
    use_early_stopping : bool
        When true, each fold stops after 50 rounds without improvement on its
        validation split (override with ``model_params["early_stopping_rounds"]``),
        and the final model is trained with the mean best tree count found
        across folds instead of the full ``n_estimators``.
    verbose : bool
        Forwarded to ``XGBClassifier.fit``.

    Returns
    -------
    fold_metrics : pd.DataFrame
        One row per fold: auc, accuracy, precision, recall, f1, validation
        size and validation positive rate (predictions thresholded at 0.5).
    artifacts : dict
        The fitted final model plus metadata: feature columns, target column,
        the parameters the final model was built with, row count, positive
        rate, feature dtypes and pooled out-of-fold metrics (``oof_metrics``).

    Raises
    ------
    KeyError
        If any feature column or the target column is absent from ``df``.
    """
    # Accept tuples / Index objects too: df[("a", "b")] would be read as ONE key.
    feature_columns = list(feature_columns)

    # Fail early with a single message naming every missing column.
    missing = [c for c in feature_columns + [target_col] if c not in df.columns]
    if missing:
        raise KeyError(f"columns not found in df: {missing}")

    # Selecting columns and replace() both return new frames, so `df` is untouched.
    X = df[feature_columns].replace([np.inf, -np.inf], np.nan)
    y = df[target_col].astype(int).to_numpy()

    # imbalance handling: weight positives by the negative/positive ratio
    pos = int((y == 1).sum())
    neg = int((y == 0).sum())
    scale_pos_weight = (neg / pos) if pos > 0 else 1.0

    # default params (override via model_params)
    base_params = dict(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_lambda=1.0,
        min_child_weight=1.0,
        gamma=0.0,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        random_state=random_state,
        scale_pos_weight=scale_pos_weight
    )
    if model_params:
        base_params.update(model_params)

    # Early stopping is configured on the estimator: XGBoost 2.x no longer
    # accepts `early_stopping_rounds` as a fit() argument.
    fold_params = dict(base_params)
    if use_early_stopping:
        fold_params.setdefault("early_stopping_rounds", 50)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_proba = np.zeros(len(X), dtype=float)
    oof_pred = np.zeros(len(X), dtype=int)

    fold_rows = []
    best_iterations = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = XGBClassifier(**fold_params)

        if use_early_stopping:
            # NOTE: the validation fold also picks the stopping point, so the
            # fold metrics below are slightly optimistic in this mode.
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=verbose)
            best_iter = getattr(model, "best_iteration", None)
            if best_iter is not None:
                best_iterations.append(int(best_iter))
        else:
            model.fit(X_tr, y_tr, verbose=verbose)

        proba = model.predict_proba(X_va)[:, 1]
        pred = (proba >= 0.5).astype(int)

        oof_proba[va_idx] = proba
        oof_pred[va_idx] = pred

        fold_rows.append({
            "fold": fold,
            "auc": roc_auc_score(y_va, proba),
            "accuracy": accuracy_score(y_va, pred),
            "precision": precision_score(y_va, pred, zero_division=0),
            "recall": recall_score(y_va, pred, zero_division=0),
            "f1": f1_score(y_va, pred, zero_division=0),
            "n_val": len(va_idx),
            "pos_rate_val": float(y_va.mean())
        })

    fold_metrics = pd.DataFrame(fold_rows)

    # Pooled out-of-fold scores: every row is scored exactly once, by the fold
    # model that did not train on it.
    oof_metrics = {
        "auc": float(roc_auc_score(y, oof_proba)),
        "accuracy": float(accuracy_score(y, oof_pred)),
        "precision": float(precision_score(y, oof_pred, zero_division=0)),
        "recall": float(recall_score(y, oof_pred, zero_division=0)),
        "f1": float(f1_score(y, oof_pred, zero_division=0)),
    }

    # train FINAL model on all data
    final_params = dict(base_params)
    if use_early_stopping:
        # There is no held-out set left to stop on, so drop the stopping rule
        # and reuse the average tree count the folds settled on
        # (best_iteration is 0-based, hence the +1).
        final_params.pop("early_stopping_rounds", None)
        if best_iterations:
            final_params["n_estimators"] = max(
                1, int(round(float(np.mean(best_iterations)))) + 1
            )

    final_model = XGBClassifier(**final_params)
    final_model.fit(X, y, verbose=verbose)

    artifacts = {
        "model": final_model,
        "feature_columns": feature_columns,
        "target_col": target_col,
        "params": final_params,
        "train_rows": len(X),
        "pos_rate": float(y.mean()),
        "dtypes": {c: str(df[c].dtype) for c in feature_columns},
        "oof_metrics": oof_metrics,
    }

    return fold_metrics, artifacts


def save_xgb_artifacts(artifacts: dict, path="models/xgb_sales_model.joblib"):
    """Persist the training artifacts dict with joblib and return the path.

    Parameters
    ----------
    artifacts : dict
        Object to serialize (passed unchanged to ``joblib.dump``).
    path : str or pathlib.Path
        Destination file. Missing parent directories are created first, so a
        fresh checkout without a ``models/`` folder does not fail.

    Returns
    -------
    The ``path`` argument, exactly as it was passed in.
    """
    # Only filesystem paths have a parent directory to create; anything else
    # (e.g. an open file object) is handed to joblib as-is.
    if isinstance(path, (str, Path)):
        Path(path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(artifacts, path)
    return path

In [30]:
# 1) cross-validate and fit the final model
fold_metrics, artifacts = train_cv_and_final_xgb(
    df=data,
    feature_columns=feature_columns,
    target_col=target_col,
    n_splits=5,
    random_state=42,
    use_early_stopping=False,
)

print(fold_metrics)

# 2) save
model_path = save_xgb_artifacts(
    artifacts=artifacts,
    path="models/xgb_sales_model.joblib",
)
print("Saved to:", model_path)

   fold       auc  accuracy  precision    recall        f1  n_val  \
0     1  0.525879     0.519   0.470339  0.490066  0.480000   1000   
1     2  0.468804     0.480   0.424379  0.415011  0.419643   1000   
2     3  0.494187     0.496   0.444685  0.452539  0.448578   1000   
3     4  0.520557     0.518   0.467672  0.480088  0.473799   1000   
4     5  0.476080     0.484   0.430736  0.440265  0.435449   1000   

   pos_rate_val  
0         0.453  
1         0.453  
2         0.453  
3         0.452  
4         0.452  
Saved to: models/xgb_sales_model.joblib
