In [1]:
# Enable autoreload to auto-reload modules when edited
%reload_ext autoreload
%autoreload 2

# Standard libraries
import sys
import time
import calendar
import warnings
from datetime import datetime, timedelta, date
from pathlib import Path
import logging

# Data science and ML
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Machine learning
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
import shap

# BigQuery
import pandas_gbq
from google.cloud import bigquery

# Serialization
import cloudpickle as pkl

# Custom modules
import dataimp.data_preps
from dataimp import *

# Pandas display options for large DataFrames
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Suppress warnings
warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)

# Initialize BigQuery client once
client = bigquery.Client()

## Import train and test set

In [2]:
target = 'digigold_06-10'

In [3]:
date_today = pd.to_datetime('today').date().strftime('%Y%m%d')
date_today

'20251006'

In [4]:
Path.cwd()

PosixPath('/home/jupyter/Raish Azam/Investment Intent Model')

In [5]:
train_data = pd.read_csv(f'train_data_{target}.csv', low_memory=False)
test_data = pd.read_csv(f'test_data_{target}.csv', low_memory=False)

x_train = train_data.drop('on_us_target', axis=1)
y_train = train_data['on_us_target']
x_test = test_data.drop('on_us_target', axis=1)
y_test = test_data['on_us_target']

x_train.shape

(14431070, 27)

In [6]:
train_data.on_us_target.value_counts()

on_us_target
0    14427213
1        3857
Name: count, dtype: int64

In [7]:
test_data.on_us_target.value_counts()

on_us_target
0    3606804
1        964
Name: count, dtype: int64

In [8]:
train_data.columns

Index(['EXPERIAN_Score_V3', 'amt_shopping_purchase_spend_1m', 'DigiGoldApps',
       'salary_3m', 'mutual_fund_12m', 'InsuranceApps', 'inflow_1m',
       'amt_education_spend_3m', 'mutual_fund_6m_12m_ratio',
       'amt_investment_spend_3m', 'amt_investment_spend_12m', 'expense_1m',
       'pl_loan_open_count_12m', 'amt_shopping_purchase_spend_6m',
       'age_in_years', 'amt_bills_utilities_spend_12m', 'PLApps',
       'IsUsingDigitalPayment', 'transactions', 'inflow_12m',
       'total_investment_app', 'occupation', 'Tier', 'IsParent', 'expense_12m',
       'inflow_3m', 'IsTechie', 'on_us_target'],
      dtype='object')

## Model Training

In [9]:
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'n_estimators': 192,
    'learning_rate': 0.0717252009725675,
    'gamma': 3.3302831350250246,
    'subsample': 0.6940915844429382,
    'colsample_bytree': 0.7512464544067159,
    'reg_lambda': 5.3292763111800845,
    'n_jobs': -1,
    'enable_categorical': True,
    'use_label_encoder': False,
    'eval_metric': 'auc',
    'random_state': 42
}


In [14]:
# Convert all object columns to category
for col in x_train.select_dtypes(include='object').columns:
    x_train[col] = x_train[col].astype('category')

In [15]:
scale_pos_weight=(y_train==0).sum() / (y_train==1).sum()
scale_pos_weight

3740.5270935960593

In [16]:
%%time
model = XGBClassifier(**xgb_params,scale_pos_weight=scale_pos_weight)
model.fit(x_train,y_train)

CPU times: user 27min 33s, sys: 4.45 s, total: 27min 38s
Wall time: 1min 16s


### Store model and features list

In [17]:
model_name = f'{target}.pkl'

# Create directory: gold/
Path(target).mkdir(exist_ok=True)

# Save model to: gold/model.pkl
with open(f'{target}/{model_name}', 'wb') as f:
    pkl.dump(model, f)

In [18]:
import os
with open(f'{target}/{model_name}', 'rb') as f:
    model = pkl.load(f)
    
print(os.path.getsize(f'{target}/{model_name}'))

172826


## Basic Evaluation

1. Train & Test Gini
2. Decile
3. Feature Importances Plot

# training gini

In [20]:
import numpy as np
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold
# Cross-validation on training data
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
train_auc_scores = []

for fold, (train_index, val_index) in enumerate(skf.split(x_train, y_train), 1):
    x_fold_train = x_train.iloc[train_index]
    y_fold_train = y_train.iloc[train_index]
    x_fold_val = x_train.iloc[val_index]
    y_fold_val = y_train.iloc[val_index]

    model = XGBClassifier(**xgb_params, scale_pos_weight=scale_pos_weight)
    model.fit(x_fold_train, y_fold_train)

    y_pred_proba = model.predict_proba(x_fold_val)[:, 1] #want only yes probability
    auc = roc_auc_score(y_fold_val, y_pred_proba)
    train_auc_scores.append(auc)

# Print Train Gini
train_auc = np.mean(train_auc_scores)
print(f"""
Train Gini:
    Fold 1: {2 * train_auc_scores[0] - 1:.2f}
    Fold 2: {2 * train_auc_scores[1] - 1:.2f}
    Fold 3: {2 * train_auc_scores[2] - 1:.2f}
    Fold 4: {2 * train_auc_scores[3] - 1:.2f}
    Fold 5: {2 * train_auc_scores[4] - 1:.2f}

Average Gini = {2 * train_auc - 1:.2f}
""")


Train Gini:
    Fold 1: 1.00
    Fold 2: 1.00
    Fold 3: 1.00
    Fold 4: 1.00
    Fold 5: 1.00

Average Gini = 1.00



# testing gini

In [21]:
import numpy as np

# Cross-validation on training data
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
test_auc_scores = []

for fold, (test_index, test_val_index) in enumerate(skf.split(x_test, y_test), 1):
    x_fold_test = x_test.iloc[test_index]
    y_fold_test = y_test.iloc[test_index]
    x_fold_val_test = x_test.iloc[test_val_index]
    y_fold_val_test = y_test.iloc[test_val_index]

    model = XGBClassifier(**xgb_params, scale_pos_weight=scale_pos_weight)
    model.fit(x_fold_test, y_fold_test)

    y_pred_proba = model.predict_proba(x_fold_val_test)[:, 1]
    auc = roc_auc_score(y_fold_val_test, y_pred_proba)
    test_auc_scores.append(auc)

# Print Train Gini
test_auc = np.mean(test_auc_scores)
print(f"""
Test Gini:
    Fold 1: {2 * test_auc_scores[0] - 1:.2f}
    Fold 2: {2 * test_auc_scores[1] - 1:.2f}
    Fold 3: {2 * test_auc_scores[2] - 1:.2f}
    Fold 4: {2 * test_auc_scores[3] - 1:.2f}
    Fold 5: {2 * test_auc_scores[4] - 1:.2f}

Average Gini = {2 * test_auc - 1:.2f}
""")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:occupation: object, Tier: object

In [None]:
.

# Training decile and gains table chart

In [None]:
# Predict probabilities for training data
y_train_pred_proba = model.predict_proba(x_train)[:, 1]

# Ensure Series with correct index
y_train_pred_proba_series = pd.Series(y_train_pred_proba, index=y_train.index)

# Create deciles based on rank
train_deciles = pd.qcut(y_train_pred_proba_series.rank(method='first'), q=10, labels=False) + 1

# Create the decile DataFrame
train_deciles_df = pd.DataFrame({
    'Probability': y_train_pred_proba_series,
    'Actual': y_train,
    'Decile': train_deciles
})

# Add bin size
train_deciles_df['Bin Size'] = 1

# Aggregate by Decile
agg_train_deciles_df = train_deciles_df.groupby('Decile').agg({
    'Probability': ['mean', 'count'],
    'Actual': 'sum',
    'Bin Size': 'sum'
}).reset_index()

# Flatten column names
agg_train_deciles_df.columns = ['Decile', 'Avg_Pred_Prob', 'Obs_Count', 'Actual_Responders', 'Bin_Size']

# Sort by decile descending
agg_train_deciles_df = agg_train_deciles_df.sort_values('Decile', ascending=False)

# Calculate response metrics
agg_train_deciles_df['response_rate'] = 100 * agg_train_deciles_df['Actual_Responders'] / agg_train_deciles_df['Bin_Size']
agg_train_deciles_df['decile_recall'] = agg_train_deciles_df['Actual_Responders'] / y_train.sum()
agg_train_deciles_df['capture_rate (%)'] = agg_train_deciles_df['Actual_Responders'].cumsum() * 100 / y_train.sum()

# Final Output
agg_train_deciles_df

# testing decile and gains table chart

In [None]:
from sklearn.metrics import roc_auc_score

# Predict probabilities (class 1)
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Ensure Series with proper index
y_pred_proba_series = pd.Series(y_pred_proba, index=y_test.index)

# Create deciles based on rank
deciles = pd.qcut(y_pred_proba_series.rank(method='first'), q=10, labels=False) + 1

# Create the decile DataFrame
deciles_df = pd.DataFrame({
    'Probability': y_pred_proba_series,
    'Actual': y_test,
    'Decile': deciles
})
# Add bin size for aggregation
deciles_df['Bin Size'] = 1

# Aggregate by Decile
agg_deciles_df = deciles_df.groupby('Decile').agg({
    'Probability': ['mean', 'count'],
    'Actual': 'sum',
    'Bin Size': 'sum'
}).reset_index()

# Flatten column names
agg_deciles_df.columns = ['Decile', 'Avg_Pred_Prob', 'Obs_Count', 'Actual_Responders', 'Bin_Size']

# Sort by decile descending (1 = highest prob, 10 = lowest)
agg_deciles_df = agg_deciles_df.sort_values('Decile', ascending=False)

# Compute response rate, recall, capture rate
agg_deciles_df['response_rate'] = 100 * agg_deciles_df['Actual_Responders'] / agg_deciles_df['Bin_Size']
agg_deciles_df['decile_recall'] = agg_deciles_df['Actual_Responders'] / y_test.sum()
agg_deciles_df['capture_rate (%)'] = agg_deciles_df['Actual_Responders'].cumsum() * 100 / y_test.sum()

# Final Output
agg_deciles_df

In [None]:
feature_importances = model.feature_importances_
indices = np.argsort(feature_importances)[::-1]

plt.figure(figsize=(10,8))
plt.title("Feature importances")
plt.bar(range(x_train.shape[1]), feature_importances[indices], align="center")
plt.xticks(range(x_train.shape[1]), x_train.columns[indices], rotation=90)
plt.xlim([-1, x_train.shape[1]])
plt.tight_layout()
plt.show()

# shap summary plot

In [None]:
import shap
explainer = shap.TreeExplainer(model)  # Using model directly
shap_values = explainer.shap_values(x_test)

# Summary plot
shap.summary_plot(shap_values, x_test)