In [2]:
import xgboost as xgb
from sklearn.metrics import roc_curve, auc
from xgboost.sklearn import XGBClassifier
import pandas as pd
import numpy as np
import pandas_gbq
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from google.cloud import bigquery
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import cloudpickle
from datetime import date
pd.set_option('display.max_columns', 500)
client = bigquery.Client()

# Training Data Prep

In [5]:
# snapshot_period = 'Mar-2025'

# date_condition = pd.to_datetime(snapshot_period, format='%b-%Y')
# created_date_condition = date_condition.strftime('%Y-%m-%d')
# event_date_condition = (date_condition + pd.DateOffset(months=1)).strftime('%Y-%m-%d')
# ref_month = (date_condition - pd.DateOffset(days=date_condition.day)).strftime('%Y-%m-%d')

# print(f'Processing Snapshot: {snapshot_period}')
# print(f'Event Date Condition: {event_date_condition}')
# print(f'Scoring Month: {created_date_condition}')
# print(f'Ref_month: {ref_month}')

Processing Snapshot: Mar-2025
Event Date Condition: 2025-04-01
Scoring Month: 2025-03-01
Ref_month: 2025-02-28


In [6]:
# Initialize BigQuery client
client = bigquery.Client()

def process_snapshot(snapshot_period):
    """
    Build the labelled customer base for one snapshot month.

    Two BigQuery queries are run through the module-level ``client``:

    * Query 1 selects customers created before the first day of the snapshot
      month, keeping those with ``install_flag = 1`` or those with
      ``install_flag = 0`` that completed PL personal details inside the
      snapshot month.
    * Query 2 counts, per mobile number and lender bucket (``onus``/``ofus``),
      Experian accounts with ``acct_type_cd = '123'`` opened inside the
      snapshot month whose disbursal amount exceeds 5000.

    The loan counts are turned into binary ``ofus`` / ``onus`` flags and a
    ``target_combined`` flag (1 when either flag is 1).

    Args:
        snapshot_period (str): Month label in ``'%b-%Y'`` form, e.g. ``'Mar-2025'``.
            Anything that does not parse with that format raises in
            ``pd.to_datetime`` before a query is built.

    Returns:
        pd.DataFrame: One row per base customer row with columns
        ``snapshot_month, customer_id, install_flag, registration_date,
        mobilenumber, ofus, onus, target_combined``.
    """
    date_condition = pd.to_datetime(snapshot_period, format='%b-%Y')
    # First day of the snapshot month (inclusive lower bound of the event window).
    created_date_condition = date_condition.strftime('%Y-%m-%d')
    # First day of the following month (exclusive upper bound of the event window).
    event_date_condition = (date_condition + pd.DateOffset(months=1)).strftime('%Y-%m-%d')
    # Last day of the previous month; only printed here, not used in the queries.
    ref_month = (date_condition - pd.DateOffset(days=date_condition.day)).strftime('%Y-%m-%d')

    print(f'Processing Snapshot: {snapshot_period}')
    print(f'Event Date Condition: {event_date_condition}')
    print(f'Scoring Month: {created_date_condition}')
    print(f'Ref_month: {ref_month}')

    # Query 1: personal_details_complete_datetime >= created_date_condition
    query1 = f"""
        WITH base AS (
    SELECT
        '{snapshot_period}' AS snapshot_month,
        t.id AS customer_id,
        CASE
          WHEN a.final_uninstall_flag = 0 THEN 1
          WHEN a.final_uninstall_flag = 1 THEN 0
          ELSE a.final_uninstall_flag
        END AS install_flag,
        t.createddate AS registration_date
    FROM 
        `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer` t
    LEFT JOIN 
        `abcd-dataplatform.abcd_data_model.user_activity_raw` a 
    ON 
        t.id=a.customer_id
    WHERE
        t.createddate < "{created_date_condition}"
    ),

    PL AS(

    SELECT
        '{snapshot_period}' AS snapshot_month,
        a.Customer_id AS customer_id,
        a.target as PL_target,
        a.registration_date AS registration_date,
        a.personal_details_complete_datetime,
        a.mobilenumber
    FROM
    (
      SELECT
            t.id AS Customer_id,
            pl.mobilenumber,
            (CASE
              WHEN personal_details_complete = 'Y' THEN 1
              ELSE 0
            END) AS target,
            personal_details_complete_datetime,
            t.createddate AS registration_date,
            ROW_NUMBER() OVER(PARTITION BY pl.mobilenumber ORDER BY personal_details_complete_datetime DESC) AS rn
      FROM `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_pl_customer_detail` pl
      LEFT JOIN `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer` t 
      ON pl.mobilenumber = t.mobilenumber
      WHERE
        personal_details_complete = 'Y' 
        AND personal_details_complete_datetime IS NOT NULL 
        AND t.createddate < "{created_date_condition}"

        AND personal_details_complete_datetime >= '{created_date_condition}'
        AND personal_details_complete_datetime < '{event_date_condition}'

    ) AS a
    WHERE a.rn = 1

    )

    SELECT
          b.snapshot_month,
          b.customer_id,
          b.install_flag,
          b.registration_date,
          cust_x.mobilenumber

    FROM base AS b
    LEFT JOIN 
        PL p 
    ON 
        p.customer_id = b.customer_id

    LEFT JOIN 

    (Select SAFE_CAST(id AS STRING) AS customer_id,
    mobilenumber FROM `abcd-dataplatform-prod.abcd_mobileapp_transformed.ABCDPRODDB_t_customer` a) cust_x
    on CAST(b.customer_id as STRING) = CAST(cust_x.customer_id as STRING)

    WHERE 
        (b.install_flag = 1 
        OR (b.install_flag = 0 and PL_target = 1))
    """
    base_df = client.query(query1).to_dataframe()

    # Query 2: OPEN_DT >= created_date_condition
    query2 = f"""
    WITH latest_data AS (
    -- CTE for Getting Latest Experian data update on Customer and Loan Account Level
    -- Mark DPD column as NaN if it has values -1 or 900. Keep originally NaN values untouched.

        SELECT
            RANK() OVER (PARTITION BY mobilenumber, ACCT_KEY ORDER BY SCRUB_DATE DESC, source ASC, balance_dt DESC) AS rank,
            mobilenumber, 
            acct_key,
            scrub_date,
            M_SUB_ID,
            ORIG_LOAN_AM,
            OPEN_DATE

        FROM (
            SELECT
            CUSTOMER_ID AS mobilenumber, 
            acct_key,
            M_SUB_ID,
            scrub_date,
            SAFE_CAST(ORIG_LOAN_AM AS FLOAT64) as ORIG_LOAN_AM,
            SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) as OPEN_DATE,

        BALANCE_DT,
                1 AS source
            FROM
                `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_AR_DAILY_BASE_UPDATED`
            WHERE
                acct_type_cd in ('123')
                AND
                SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) < '{event_date_condition}'
                AND
                SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) >= '{created_date_condition}'

            UNION ALL

            SELECT
            CUSTOMER_ID AS mobilenumber, 
            acct_key,
            M_SUB_ID,
            scrub_date,
            SAFE_CAST(ORIG_LOAN_AM AS FLOAT64) as ORIG_LOAN_AM,
            SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) as OPEN_DATE,
        BALANCE_DT,
                0 AS source
            FROM
                `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_AR_ADHOC_BASE_UPDATED`
            WHERE
            acct_type_cd in ('123')
            AND 
                SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) < '{event_date_condition}'
                AND
                SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) >= '{created_date_condition}'
        )
        QUALIFY rank = 1
    )

    SELECT mobilenumber, lender, count(acct_key) as cnt_loans from ( 

    select 
    mobilenumber,
    acct_key,
    CASE WHEN M_SUB_ID in ('ADITYA','Aditya_Birla','ABFL') then 'onus' else 'ofus' END as lender,
    scrub_date,
    sum(CAST(ORIG_LOAN_AM as INT)) as total_disbursal_amount,
    OPEN_DATE
    FROM latest_data
    group by 
    mobilenumber,
    acct_key,
    lender,
    scrub_date,
    OPEN_DATE
    having CAST(total_disbursal_amount as INT) > 5000

    ) x 

    group by mobilenumber, lender

    """
    exp_td_df = client.query(query2).to_dataframe()

    # Both lender buckets must exist as columns even when a month has loans
    # for only one bucket (or none at all); otherwise the flag step below
    # would fail with a KeyError on the missing column.
    lender_cols = ['ofus', 'onus']

    if exp_td_df.empty:
        # No qualifying loans in this window: every customer gets 0 flags.
        for lender in lender_cols:
            base_df[lender] = 0
    else:
        # One row per mobile number, one column per lender bucket.
        exp_td_df_v1 = (
            exp_td_df.pivot_table(
                index='mobilenumber',
                columns='lender',
                values='cnt_loans',
                aggfunc='sum'
            )
            .reindex(columns=lender_cols)
            .reset_index()
        )

        # Merge with base_df
        base_df = base_df.merge(exp_td_df_v1, on='mobilenumber', how='left')

        # Binary flags: 1 when at least one qualifying loan exists.
        for lender in lender_cols:
            base_df[lender] = np.where(base_df[lender].fillna(0) >= 1, 1, 0)

    # Combined target
    base_df['target_combined'] = ((base_df['ofus'] == 1) | (base_df['onus'] == 1)).astype(int)

    # Add snapshot_month column to identify the period
    base_df['snapshot_month'] = snapshot_period

    return base_df

In [None]:
def run_pipeline(snapshot_months):
    """
    Run pipeline for multiple months and append results into one DataFrame.

    Each month is processed with ``process_snapshot`` and the per-month frames
    are concatenated once at the end, which avoids re-copying the accumulated
    rows on every iteration.

    Args:
        snapshot_months (list): List of months like ['Mar-2025', 'Apr-2025']

    Returns:
        pd.DataFrame: Combined DataFrame across all periods, with a fresh
        0..n-1 index. An empty DataFrame is returned when no months are given.
    """
    monthly_frames = [process_snapshot(month) for month in snapshot_months]

    # pd.concat raises on an empty list, so keep the "no months" result explicit.
    if not monthly_frames:
        return pd.DataFrame()

    return pd.concat(monthly_frames, ignore_index=True)

In [6]:
# Snapshot months to build training rows for; each label must match the
# '%b-%Y' format parsed inside process_snapshot.
months_to_process = ['Oct-2024', 'Nov-2024', 'Dec-2024', 'Jan-2025','Feb-2025','Mar-2025','Apr-2025']

# Runs two BigQuery queries per month and stacks the monthly results.
# final_df is the notebook-wide training frame that later cells extend.
final_df = run_pipeline(months_to_process)

# Quick sanity check of the combined frame (row/column count and first rows).
print(final_df.shape)
print(final_df.head())

Processing Snapshot: Oct-2024
Event Date Condition: 2024-11-01
Scoring Month: 2024-10-01
Ref_month: 2024-09-30
Processing Snapshot: Nov-2024
Event Date Condition: 2024-12-01
Scoring Month: 2024-11-01
Ref_month: 2024-10-31
Processing Snapshot: Dec-2024
Event Date Condition: 2025-01-01
Scoring Month: 2024-12-01
Ref_month: 2024-11-30
Processing Snapshot: Jan-2025
Event Date Condition: 2025-02-01
Scoring Month: 2025-01-01
Ref_month: 2024-12-31
Processing Snapshot: Feb-2025
Event Date Condition: 2025-03-01
Scoring Month: 2025-02-01
Ref_month: 2025-01-31
Processing Snapshot: Mar-2025
Event Date Condition: 2025-04-01
Scoring Month: 2025-03-01
Ref_month: 2025-02-28
Processing Snapshot: Apr-2025
Event Date Condition: 2025-05-01
Scoring Month: 2025-04-01
Ref_month: 2025-03-31
(10576672, 8)
  snapshot_month  customer_id  install_flag          registration_date  \
0       Oct-2024      2086004             1 2024-09-15 20:28:39.636403   
1       Oct-2024       307772             1 2024-05-28 20:53:

In [7]:
final_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0


In [12]:
# final_df.to_parquet('revamped_pl_training_data.parquet')

In [3]:
final_df = pd.read_parquet('revamped_pl_training_data.parquet')
print(final_df.shape)

(10576672, 8)


In [4]:
# ref_month = last calendar day of the month before the snapshot month,
# stored as a 'YYYY-MM-DD' string. snapshot_month parses to the first day of
# its month, so stepping back one MonthEnd lands on the previous month's end.
# Built in a single expression, so no temporary datetime column is created.
final_df['ref_month'] = (
    pd.to_datetime(final_df['snapshot_month'], format='%b-%Y') - pd.offsets.MonthEnd(1)
).dt.strftime('%Y-%m-%d')

In [5]:
print(final_df.shape)
final_df.groupby(['snapshot_month','ref_month'])['target_combined'].value_counts(normalize=True)*100

(10576672, 9)


snapshot_month  ref_month   target_combined
Apr-2025        2025-03-31  0                  97.494945
                            1                   2.505055
Dec-2024        2024-11-30  0                  93.930778
                            1                   6.069222
Feb-2025        2025-01-31  0                  94.859597
                            1                   5.140403
Jan-2025        2024-12-31  0                  94.527686
                            1                   5.472314
Mar-2025        2025-02-28  0                  95.178064
                            1                   4.821936
Nov-2024        2024-10-31  0                  93.850675
                            1                   6.149325
Oct-2024        2024-09-30  0                  92.770148
                            1                   7.229852
Name: proportion, dtype: float64

In [6]:
final_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined,ref_month
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0,2024-09-30
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0,2024-09-30
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0,2024-09-30
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0,2024-09-30
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0,2024-09-30


# Bureau Enquiry Data

In [7]:
def inq_features(ref_month, client):
    """
    Fetch the average bureau enquiry amount per customer as of ``ref_month``.

    The query averages ``amount`` over enquiries with purpose code '13', '03'
    or '07' whose ``INQ_DATE`` lies in the 365 days up to and including
    ``ref_month``.

    Args:
        ref_month (str): Reference date as 'YYYY-MM-DD'; interpolated into the
            SQL as a DATE literal.
        client: Object exposing ``query(sql).to_dataframe()`` (a BigQuery client).

    Returns:
        pd.DataFrame: Columns ``mobilenumber`` and ``AVG_AMT_INQ``.
    """
    sql = f"""

    select customer_id as mobilenumber, AVG(SAFE_CAST(amount AS FLOAT64)) as AVG_AMT_INQ 
    FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_ENQ_DAILY_BASE` a
    where
        INQ_PURP_CD in ('13', '03','07')
        AND PARSE_DATE('%d/%m/%Y', INQ_DATE) <= DATE '{ref_month}' 
        AND PARSE_DATE('%d/%m/%Y', INQ_DATE) >= DATE_SUB(DATE '{ref_month}', INTERVAL 365 DAY)
    group by mobilenumber
        ;
    """

    enquiry_df = client.query(sql).to_dataframe()
    # Print row count / dtypes / memory footprint for this month's extract.
    enquiry_df.info(memory_usage='deep')
    return enquiry_df

In [8]:
# Collect one enquiry-feature frame per snapshot month.
avg_inq_data = []

# One (snapshot_month, ref_month) pair per snapshot in final_df.
unique_ref_months = final_df[['snapshot_month', 'ref_month']].drop_duplicates()

# Loop through each ref_month and fetch AVG_AMT_INQ.
# NOTE: ref_month / snapshot_month / avg_inq_df stay defined as notebook
# globals after the loop, holding the values from the last iteration.
for _, row in unique_ref_months.iterrows():
    ref_month = row['ref_month']
    snapshot_month = row['snapshot_month']
    
    print(f"Fetching AVG_AMT_INQ for ref_month: {ref_month}")
    
    avg_inq_df = inq_features(ref_month, client)
    
    # Tag the extract with its period so it joins only to rows of that snapshot.
    avg_inq_df['ref_month'] = ref_month
    avg_inq_df['snapshot_month'] = snapshot_month
    
    avg_inq_data.append(avg_inq_df)

# Concatenate all monthly extracts into one lookup frame.
avg_inq_full = pd.concat(avg_inq_data, ignore_index=True)

# Left-merge so customers without enquiries keep a NaN AVG_AMT_INQ.
# NOTE: this reassigns final_df; re-running the cell on the merged frame would
# merge AVG_AMT_INQ a second time (pandas then suffixes the clashing columns).
final_df = pd.merge(
    final_df,
    avg_inq_full,
    on=['mobilenumber', 'ref_month', 'snapshot_month'],
    how='left'
)

Fetching AVG_AMT_INQ for ref_month: 2024-09-30
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732918 entries, 0 to 732917
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mobilenumber  732918 non-null  object 
 1   AVG_AMT_INQ   732918 non-null  float64
dtypes: float64(1), object(1)
memory usage: 52.4 MB
Fetching AVG_AMT_INQ for ref_month: 2024-10-31
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786274 entries, 0 to 786273
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mobilenumber  786274 non-null  object 
 1   AVG_AMT_INQ   786274 non-null  float64
dtypes: float64(1), object(1)
memory usage: 56.2 MB
Fetching AVG_AMT_INQ for ref_month: 2024-11-30
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 873868 entries, 0 to 873867
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        ----------

In [9]:
final_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined,ref_month,AVG_AMT_INQ
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0,2024-09-30,
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0,2024-09-30,
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0,2024-09-30,
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0,2024-09-30,
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0,2024-09-30,


In [10]:
final_df.groupby('snapshot_month')['AVG_AMT_INQ'].count()

snapshot_month
Apr-2025    216842
Dec-2024    213364
Feb-2025    192749
Jan-2025    206501
Mar-2025    203042
Nov-2024    187865
Oct-2024    169516
Name: AVG_AMT_INQ, dtype: int64

In [11]:
final_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined,ref_month,AVG_AMT_INQ
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0,2024-09-30,
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0,2024-09-30,
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0,2024-09-30,
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0,2024-09-30,
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0,2024-09-30,


# Bureau Consent

In [12]:
def bureau_consent(ref_month, client):
    """
    Count 'credit_track' section-card clicks per customer for one month.

    Reads Mixpanel events whose ingestion partition falls in the calendar
    month that ends on ``ref_month`` and, per customer, counts
    ``hd_mm_section_card_click`` events with ``ctatext = 'credit_track'``.

    Args:
        ref_month (str): Month-end date as 'YYYY-MM-DD'. The original
            ``LAST_DAY(...) = ref_month`` predicate is kept, so a date that is
            not a month end still returns no rows.
        client: Object exposing ``query(sql).to_dataframe()`` (a BigQuery client).

    Returns:
        pd.DataFrame: Columns ``EVENT_MONTH``, ``customer_id`` and
        ``credit_track_click``.
    """
    # The two range predicates on the bare _PARTITIONTIME column are implied
    # by the LAST_DAY(...) equality, so they do not change the result. They
    # are there so BigQuery can prune partitions: the LAST_DAY(DATE(...))
    # wrapper alone hides the partition column and forces a full-table scan.
    QUERY = f"""

    SELECT 

    LAST_DAY(DATE(_PARTITIONTIME), MONTH) as EVENT_MONTH,

    COALESCE(CAST(mp_user_id AS INT64), CAST(guestid AS INT64)) AS customer_id,

    SUM(CASE

          WHEN mp_event_name = 'hd_mm_section_card_click' AND ctatext IN ('credit_track')

          THEN 1 ELSE 0

      END) AS credit_track_click

    FROM `abcd-dataplatform-prod.abcd_mixpanel_raw.abcd_mp_master_event` mp

    where

    _PARTITIONTIME >= TIMESTAMP(DATE_TRUNC(DATE '{ref_month}', MONTH))

    AND _PARTITIONTIME < TIMESTAMP(DATE_ADD(DATE '{ref_month}', INTERVAL 1 DAY))

    AND LAST_DAY(DATE(_PARTITIONTIME), MONTH) = DATE '{ref_month}'

    group by 1,2

    """

    df = client.query(QUERY).to_dataframe()
    # Print row count / dtypes / memory footprint for this month's extract.
    df.info(memory_usage='deep')

    return df

In [13]:
# Collect one bureau-consent click frame per snapshot month.
bur_con = []

# One (snapshot_month, ref_month) pair per snapshot in final_df.
unique_ref_months = final_df[['snapshot_month', 'ref_month']].drop_duplicates()

# Loop through each ref_month and fetch the credit_track click counts.
# NOTE: ref_month / snapshot_month / bur_con_df stay defined as notebook
# globals after the loop, holding the values from the last iteration.
for _, row in unique_ref_months.iterrows():
    ref_month = row['ref_month']
    snapshot_month = row['snapshot_month']
    
    print(f"Fetching bureau_consent for ref_month: {ref_month}")
    
    bur_con_df = bureau_consent(ref_month, client)
    
    # Tag the extract with its period so it joins only to rows of that snapshot.
    bur_con_df['ref_month'] = ref_month
    bur_con_df['snapshot_month'] = snapshot_month
    
    bur_con.append(bur_con_df)

# Concatenate all monthly extracts into one lookup frame.
bur_con_full = pd.concat(bur_con, ignore_index=True)

# Left-merge on customer_id (not mobilenumber) plus the period keys.
# The EVENT_MONTH column returned by the query is carried into final_df too.
final_df = pd.merge(
    final_df,
    bur_con_full,
    on=['customer_id', 'ref_month', 'snapshot_month'],
    how='left'
)

# 1 when the customer clicked the credit_track card at least once in the
# month, 0 otherwise (including customers with no events at all).
final_df['cc_consent_flag'] = np.where(final_df['credit_track_click'].fillna(0) >= 1,1,0)

# The raw click count is no longer needed once the flag exists.
final_df.drop('credit_track_click', axis=1,inplace=True)

# Free the intermediate frames.
del(bur_con, bur_con_full)

Fetching bureau_consent for ref_month: 2024-09-30
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016414 entries, 0 to 1016413
Data columns (total 3 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   EVENT_MONTH         1016414 non-null  dbdate
 1   customer_id         1016413 non-null  Int64 
 2   credit_track_click  1016414 non-null  Int64 
dtypes: Int64(2), dbdate(1)
memory usage: 25.2 MB
Fetching bureau_consent for ref_month: 2024-10-31
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726519 entries, 0 to 726518
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   EVENT_MONTH         726519 non-null  dbdate
 1   customer_id         726518 non-null  Int64 
 2   credit_track_click  726519 non-null  Int64 
dtypes: Int64(2), dbdate(1)
memory usage: 18.0 MB
Fetching bureau_consent for ref_month: 2024-11-30
<class 'pandas.core.frame.DataFra

In [14]:
final_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined,ref_month,AVG_AMT_INQ,EVENT_MONTH,cc_consent_flag
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0,2024-09-30,,2024-09-30,0
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0,2024-09-30,,NaT,0
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0,2024-09-30,,NaT,0
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0,2024-09-30,,NaT,0
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0,2024-09-30,,2024-09-30,0


In [15]:
final_df.groupby('snapshot_month')['cc_consent_flag'].value_counts(normalize=True)*100

snapshot_month  cc_consent_flag
Apr-2025        0                  98.488939
                1                   1.511061
Dec-2024        0                  98.756634
                1                   1.243366
Feb-2025        0                  98.968066
                1                   1.031934
Jan-2025        0                  98.909190
                1                   1.090810
Mar-2025        0                  98.966419
                1                   1.033581
Nov-2024        0                  98.521080
                1                   1.478920
Oct-2024        0                  97.618328
                1                   2.381672
Name: proportion, dtype: float64

In [16]:
final_df

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,mobilenumber,ofus,onus,target_combined,ref_month,AVG_AMT_INQ,EVENT_MONTH,cc_consent_flag
0,Oct-2024,2086004,1,2024-09-15 20:28:39.636403,6289363619,0,0,0,2024-09-30,,2024-09-30,0
1,Oct-2024,307772,1,2024-05-28 20:53:16.998229,9974211665,0,0,0,2024-09-30,,NaT,0
2,Oct-2024,391027,1,2024-06-18 16:56:24.342877,8597345458,0,0,0,2024-09-30,,NaT,0
3,Oct-2024,306129,1,2024-05-28 18:18:35.862505,8724841095,0,0,0,2024-09-30,,NaT,0
4,Oct-2024,270287,1,2024-05-23 07:32:05.912367,9140212755,0,0,0,2024-09-30,,2024-09-30,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10576667,Apr-2025,1273044,1,2024-08-24 13:22:19.146682,9461038125,0,0,0,2025-03-31,,NaT,0
10576668,Apr-2025,942765,1,2024-08-12 17:36:52.071620,9083189656,0,0,0,2025-03-31,,2025-03-31,0
10576669,Apr-2025,904628,1,2024-08-10 18:25:46.496555,6294767894,0,0,0,2025-03-31,,2025-03-31,0
10576670,Apr-2025,1247869,1,2024-08-23 16:36:31.796373,7347854114,0,0,0,2025-03-31,,NaT,0


In [17]:
# final_df.to_parquet('revamped_pl_training_data.parquet')

In [4]:
final_df = pd.read_parquet('revamped_pl_training_data.parquet')

In [5]:
final_df.shape

(10576672, 12)

In [9]:
final_df.groupby('snapshot_month')['cc_consent_flag'].value_counts()

snapshot_month  cc_consent_flag
Apr-2025        0                  1778856
                1                    27292
Dec-2024        0                  1431748
                1                    18026
Feb-2025        0                  1566711
                1                    16336
Jan-2025        0                  1516358
                1                    16723
Mar-2025        0                  1645672
                1                    17187
Nov-2024        0                  1308023
                1                    19635
Oct-2024        0                  1185189
                1                    28916
Name: count, dtype: int64

# Bureau Demographics

In [17]:
ref_month

'2025-03-31'

In [13]:
# Bureau employment extract: one row per CUSTOMER_ID with a recoded occupation
# and an income figure scaled by income frequency.
# The query has no date/ref_month filter, so the same values are attached to
# every snapshot month when merged on mobilenumber.
# NOTE(review): CUST_INCOME is CAST to INT after max(); if `income` is stored
# as text, max() compares lexicographically rather than numerically - confirm.
# NOTE(review): the *3 / *2 multipliers for INCOME_FREQUENCY '4' / '6' are not
# explained in this notebook - confirm they produce an annual figure.
# The string has no placeholders; the f-prefix is not needed.
QUERY = f"""
with base as (
select 
CUSTOMER_ID,
max(income) as CUST_INCOME,
max(income_freq) INCOME_FREQUENCY,
max(OCCUP_STATUS_CD) as OCCUPATION,
from `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_EMPLOYMENT_DAILY_BASE`
group by 
CUSTOMER_ID
)

select 
CUSTOMER_ID as mobilenumber,
CASE WHEN OCCUPATION = '9' THEN "1"
     WHEN OCCUPATION = '10' THEN "2"
     WHEN OCCUPATION = '2' THEN "3"
     WHEN OCCUPATION = '99' THEN "4"
     ELSE null END as OCCUPATION,

  CASE 
    WHEN INCOME_FREQUENCY = '4' THEN CAST(CUST_INCOME as INT) * 3
    WHEN INCOME_FREQUENCY = '6' THEN CAST(CUST_INCOME as INT) * 2
    ELSE CAST(CUST_INCOME as INT)
  END AS Annual_Income

from base
"""

# Run the query and print dtypes / memory footprint of the extract.
bureau_inc_df = client.query(QUERY).to_dataframe()
bureau_inc_df.info(memory_usage='deep')

In [None]:
# Bureau name/DOB extract: date of birth and gender keyed by CUSTOMER_ID
# (exposed as mobilenumber). The query selects every row of the table with no
# filter, aggregation or de-duplication.
# NOTE(review): if the table holds more than one row per CUSTOMER_ID, the
# later left-merge on mobilenumber will duplicate final_df rows - verify key
# uniqueness before merging.
QUERY = f"""

select 
a.CUSTOMER_ID as mobilenumber,
a.DOB,
a.GENDER
 FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_NAME_DOB_DAILY_BASE` a

"""

# Run the query and print dtypes / memory footprint of the extract.
bureau_dob_df = client.query(QUERY).to_dataframe()
bureau_dob_df.info(memory_usage='deep')

In [None]:
# Attach bureau income/occupation and DOB/gender by mobilenumber only (no
# period key), so each customer gets the same values in every snapshot month.
# NOTE(review): a left merge adds rows when the right side has repeated
# mobilenumber values - compare final_df.shape before and after.
final_df = final_df.merge(bureau_inc_df, on = 'mobilenumber', how='left')
final_df = final_df.merge(bureau_dob_df, on = 'mobilenumber', how='left')

In [None]:
del(bureau_dob_df)
del(bureau_inc_df)

# Anagog

In [None]:
# Anagog extract: per-customer persona / app-usage attributes for rows with
# flag=1, keyed by customerid (exposed as mobilenumber).
# The query has no date/ref_month filter.
# NOTE(review): the meaning of flag=1 is not visible in this notebook - confirm.
QUERY = f"""

select 
customerid as mobilenumber,
IsParent,
IsTechie,
IsTraveler,
IsSingle, 
IsUsingDigitalPayment,
bankingapps,
upiapps, 
plapps,
digigoldapps,
IsCarOwner
FROM `abcd-dataplatform-prod.abcd_anagog_transformed.abcd_anagog_data_transformed`
where flag=1

"""

# Run the query and print dtypes / memory footprint of the extract.
anagog_df = client.query(QUERY).to_dataframe()
anagog_df.info(memory_usage='deep')

In [None]:
# Attach the Anagog attributes by mobilenumber; unmatched customers get nulls.
final_df = final_df.merge(anagog_df, on= 'mobilenumber', how='left')

# Free the extract once it has been merged.
del(anagog_df)

# Columns encoded as 0/1 below.
cols_to_impute = [
    'IsParent', 'IsTechie', 'IsTraveler', 'IsSingle', 
    'IsUsingDigitalPayment', 'bankingapps', 'upiapps', 
    'plapps', 'digigoldapps','IsCarOwner'
]

# 1 when the value's text form is "TRUE" (case/whitespace-insensitive), else 0;
# nulls from the left merge and any other value therefore become 0.
# Applied column by column with Series.map: same element-wise result as the
# former DataFrame.applymap call, which is deprecated since pandas 2.1.
# NOTE(review): bankingapps / upiapps / plapps / digigoldapps are assumed to
# hold TRUE/FALSE values - if they are counts, this encoding zeroes them.
final_df[cols_to_impute] = final_df[cols_to_impute].apply(
    lambda col: col.map(lambda x: 1 if str(x).strip().upper() == "TRUE" else 0)
)

# SMS Features

In [17]:
def sms_expenses_data(client,ref_month):
    """
    Build trailing SMS-expense features per user as of ``ref_month``.

    The query aggregates monthly expense rows whose month start lies between
    ``ref_month - 3 months`` and ``ref_month - 1 day`` into total / average /
    max, derives volatility as stdev / average, and computes month-over-month
    growth for the user's latest month on or before ``ref_month``.

    Args:
        client: Object exposing ``query(sql).to_dataframe()`` (a BigQuery client).
        ref_month (str): Reference date as 'YYYY-MM-DD'.

    Returns:
        pd.DataFrame: Columns ``mobilenumber, ref_month, total_expenses_3m,
        avg_expenses_3m, max_expenses_3m, expense_volatility_3m,
        mom_growth_expenses``; the two key columns are cast to pandas
        'string' dtype.
    """
    sql = f"""

    with month_calendar AS (
            SELECT 
            DATE_TRUNC(DATE_SUB(DATE('{ref_month}'), INTERVAL num MONTH), MONTH) AS year_month_start
        FROM 
            UNNEST(GENERATE_ARRAY(0, 15)) AS num
    ),
    -- Step 2: Prepare your base data, keeping all user-month combinations
    base AS (
        SELECT 
            right(sms.user_id,10) AS user_id,
            DATE('{ref_month}') AS snapshot_date,
            DATE_TRUNC(sms.year_month_date, MONTH) AS year_month_date,
            SAFE_CAST(REPLACE(sms.total_expense, ',', '') AS FLOAT64) AS expense_amount
        FROM 
            `abcd-dataplatform-prod.abcd_digitap_transformed.users_digitap_sms_data_transformed` sms
        WHERE 
            DATE(sms.year_month_date) <= DATE('{ref_month}')
    ),

    -- Step 3: Join snapshot_date with each of the last 3 actual calendar months
    expanded_base AS (
        SELECT 
            b.user_id,
            b.snapshot_date,
            b.year_month_date,
            b.expense_amount
        FROM base b
        JOIN month_calendar mc
            ON b.year_month_date = mc.year_month_start
            AND mc.year_month_start BETWEEN DATE_SUB(b.snapshot_date, INTERVAL 3 MONTH) AND DATE_SUB(b.snapshot_date, INTERVAL 1 DAY)
    ),

    -- Step 4: Aggregate only valid 3-month data
    agg_features AS (
        SELECT 
            user_id,
            snapshot_date,
            COUNT(*) AS months_available,
            SUM(expense_amount) AS total_expenses_3m,
            AVG(expense_amount) AS avg_expenses_3m,
            MAX(expense_amount) AS max_expenses_3m,
            STDDEV(expense_amount) AS stdev_expenses_3m
        FROM expanded_base
        GROUP BY user_id, snapshot_date
    ),

    -- Step 5: Get latest month record to compute MoM growth
    latest_month AS (
        SELECT *,
            ROW_NUMBER() OVER (PARTITION BY user_id, snapshot_date ORDER BY year_month_date DESC) AS rnk
        FROM base
        WHERE year_month_date BETWEEN DATE_SUB(snapshot_date, INTERVAL 3 MONTH) AND snapshot_date
    ),

    -- Step 6: Get current and previous month to compute growth
    growth_calc AS (
        SELECT 
            lm.user_id,
            lm.snapshot_date,
            lm.year_month_date,
            lm.expense_amount AS current_expense,
            LAG(lm.expense_amount) OVER (PARTITION BY lm.user_id, lm.snapshot_date ORDER BY lm.year_month_date) AS prev_month_expense
        FROM latest_month lm
    )

    -- Final selection
    SELECT 
        af.user_id AS mobilenumber,
        af.snapshot_date AS ref_month,
        af.total_expenses_3m,
        af.avg_expenses_3m,
        af.max_expenses_3m,
        CASE 
            WHEN af.avg_expenses_3m > 0 THEN af.stdev_expenses_3m / af.avg_expenses_3m
            ELSE NULL
        END AS expense_volatility_3m,
        CASE 
            WHEN gc.prev_month_expense > 0 THEN (gc.current_expense - gc.prev_month_expense) / gc.prev_month_expense
            ELSE NULL
        END AS mom_growth_expenses
    FROM agg_features af
    JOIN growth_calc gc
      ON af.user_id = gc.user_id AND af.snapshot_date = gc.snapshot_date
    WHERE gc.year_month_date = (
        SELECT MAX(year_month_date) 
        FROM base b 
        WHERE b.user_id = gc.user_id AND b.snapshot_date = gc.snapshot_date
    )

    """
    expenses_df = client.query(sql).to_dataframe()
    # Report dtypes / memory before the key columns are converted.
    expenses_df.info(memory_usage='deep')

    # Cast the two key columns to pandas 'string' dtype.
    for key_column in ('ref_month', 'mobilenumber'):
        expenses_df[key_column] = expenses_df[key_column].astype('string')

    return expenses_df

In [12]:
# Collect one SMS-expense feature frame per snapshot month.
sms_con = []

# One (snapshot_month, ref_month) pair per snapshot in final_df.
unique_ref_months = final_df[['snapshot_month', 'ref_month']].drop_duplicates()

# NOTE: ref_month / snapshot_month stay defined as notebook globals after the
# loop, holding the values from the last iteration.
for _, row in unique_ref_months.iterrows():
    ref_month = row['ref_month']
    snapshot_month = row['snapshot_month']
    
    # Argument order here is (client, ref_month).
    sms_con_df = sms_expenses_data(client,ref_month)

    # Overwrite ref_month with the exact key string used in final_df and tag
    # the snapshot so the extract joins only to rows of that period.
    sms_con_df['ref_month'] = ref_month
    sms_con_df['snapshot_month'] = snapshot_month
    
    sms_con.append(sms_con_df)
    
# Concatenate all monthly extracts into one lookup frame.
sms_con_full = pd.concat(sms_con, ignore_index=True)

# Left-merge so customers without SMS data keep null expense features.
final_df = pd.merge(
    final_df,
    sms_con_full,
    on=['mobilenumber', 'ref_month', 'snapshot_month'],
    how='left'
)

# Free the intermediate frames (each monthly extract is several hundred MB).
del(sms_con_full,sms_con_df,sms_con)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5490462 entries, 0 to 5490461
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   mobilenumber           object 
 1   ref_month              dbdate 
 2   total_expenses_3m      float64
 3   avg_expenses_3m        float64
 4   max_expenses_3m        float64
 5   expense_volatility_3m  float64
 6   mom_growth_expenses    float64
dtypes: dbdate(1), float64(5), object(1)
memory usage: 602.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5260314 entries, 0 to 5260313
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   mobilenumber           object 
 1   ref_month              dbdate 
 2   total_expenses_3m      float64
 3   avg_expenses_3m        float64
 4   max_expenses_3m        float64
 5   expense_volatility_3m  float64
 6   mom_growth_expenses    float64
dtypes: dbdate(1), float64(5), object(1)
memory usage: 576.9 MB


# Bureau Tradeline

Note for Vishakha: this function is slow even for a single month. Don't call it in a month-by-month loop and append the results; either run it manually one month at a time when you need this data, or optimize the query first.

In [6]:
def _experian_branch_sql(table, source_flag, ref_date):
    """Build the SELECT over one Experian base table for the tradeline UNION ALL.

    `source_flag` is emitted as the `source` column, which the outer query
    uses as a tie-breaker between the two tables.
    """
    return f"""
        SELECT
            scrub_Date,
            SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) as OPEN_DATE,
            LAST_DAY(SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT), MONTH) as OPEN_MTH,
            customer_id as mobilenumber,
            acct_key,
            acct_type_cd,
            SAFE.PARSE_DATE('%d/%m/%Y', CLOSED_DT) as CLOSED_DT,
            SAFE.PARSE_DATE('%d/%m/%Y', LAST_PAYMENT_DT) as LAST_PAYMENT_DT,
            SAFE.PARSE_DATE('%d/%m/%Y', BALANCE_DT) AS REPORTING_DATE,
            balance_am,
            credit_limit_am,
            orig_loan_am,
            emi_amt,
            M_SUB_ID,
            BALANCE_DT,
            {source_flag} AS source
        FROM
            `{table}`
        WHERE
            CUSTOMER_ID IN (SELECT mobilenumber FROM customer_base)
            AND acct_type_cd in ('123','189')
            AND SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) <= DATE('{ref_date}')
            AND SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) >= DATE_SUB(DATE('{ref_date}'), INTERVAL 2 YEAR)
    """


def _tradeline_query(ref_date):
    """Assemble the full BigQuery SQL for the tradeline snapshot as of `ref_date` (YYYY-MM-DD)."""
    daily_branch = _experian_branch_sql(
        'abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_AR_DAILY_BASE_UPDATED',
        1,
        ref_date,
    )
    adhoc_branch = _experian_branch_sql(
        'abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_AR_ADHOC_BASE_UPDATED',
        0,
        ref_date,
    )
    # The window orders by the parsed REPORTING_DATE. The raw BALANCE_DT is a
    # 'dd/mm/YYYY' string, so sorting on it would compare day-of-month first.
    # NOTE(review): RANK() keeps every row that ties on all three sort keys, so an
    # account can still come back more than once - confirm whether ROW_NUMBER() is wanted.
    # NOTE(review): scrub_Date is not bounded by the reference date, so the "latest"
    # record may post-date it - confirm this is acceptable for point-in-time features.
    return f"""
WITH customer_base AS (
-- CTE for Registered Customers in a month (Customer Base)

    SELECT
        mobilenumber
    FROM
        `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer` t
    WHERE CAST(t.createddate as DATE) <= DATE('{ref_date}')
),

latest_data AS (
-- CTE for Getting Latest Experian data update on Customer and Loan Account Level

    SELECT
        RANK() OVER (
            PARTITION BY mobilenumber, ACCT_KEY
            ORDER BY SCRUB_DATE DESC, source ASC, REPORTING_DATE DESC
        ) AS rank,
        OPEN_DATE,
        OPEN_MTH,
        mobilenumber as customer_id,
        acct_key,
        acct_type_cd,
        CLOSED_DT,
        LAST_PAYMENT_DT,
        REPORTING_DATE,
        balance_am,
        credit_limit_am,
        orig_loan_am,
        emi_amt,
        M_SUB_ID

    FROM (
{daily_branch}
        UNION ALL
{adhoc_branch}
    )
    QUALIFY rank = 1
)


select * FROM latest_data;

    """


def _add_tradeline_features(df, ref_ts):
    """Add the derived tradeline columns to the raw query result and return the frame."""
    # Convert date columns
    for date_col in ('CLOSED_DT', 'LAST_PAYMENT_DT', 'OPEN_DATE', 'REPORTING_DATE'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df['CURRENT_DT'] = ref_ts

    # Clean future dates: anything after the reference date is blanked out
    df.loc[df['CLOSED_DT'] > df['CURRENT_DT'], 'CLOSED_DT'] = pd.NaT
    df.loc[df['LAST_PAYMENT_DT'] > df['CURRENT_DT'], 'LAST_PAYMENT_DT'] = pd.NaT

    # Calculate time-based features
    df['days_since_last_payment'] = (df['CURRENT_DT'] - df['LAST_PAYMENT_DT']).dt.days
    df['days_since_loan_closure'] = (df['CURRENT_DT'] - df['CLOSED_DT']).dt.days

    # Add loan type and status (a loan with no closing date counts as active)
    df = df.rename(columns={'acct_type_cd': 'loan_type'})
    df['loan_active'] = df['CLOSED_DT'].isna().astype(int)

    # Convert amounts to numeric; unparseable values become missing
    df['balance_am'] = pd.to_numeric(df['balance_am'], errors='coerce').astype('Int64')

    # Loan amount is the credit limit when one is available, else the original
    # loan amount. Both are coerced like balance_am so a stray non-numeric value
    # no longer aborts the whole run.
    credit_limit = pd.to_numeric(df['credit_limit_am'], errors='coerce').astype(float)
    orig_loan = pd.to_numeric(df['orig_loan_am'], errors='coerce').astype(float)
    df['loan_amt'] = np.where(credit_limit.isna(), orig_loan, credit_limit)

    # Add on-us/off-us flag
    df['onus_offus_flag'] = np.where(
        df['M_SUB_ID'].isin(['ADITYA', 'Aditya_Birla', 'ABFL']), 'on_us', 'off_us'
    )

    # Calculate months since account opened (30-day months)
    df['months_since_open'] = (df['CURRENT_DT'] - df['OPEN_DATE']).dt.days // 30

    acct_type_mapping = {
        '195': 'home_loan',
        '168': 'microfinance_housing',
        '240': 'pmay',
        '5': 'credit_card',
        '123': 'personal_loan',
        '242': 'short_term_personal_loan',
        '189': 'consumer_loan',
        '191': 'gold_loan',
    }
    # Codes missing from the mapping become NaN
    df['loan_type'] = df['loan_type'].map(acct_type_mapping)

    return df


def process_tradeline_data(client, ref_month):
    """Fetch bureau tradelines from BigQuery as of a reference date and derive features.

    Parameters
    ----------
    client
        Object exposing ``query(sql).to_dataframe(progress_bar_type=...)``
        (a ``bigquery.Client`` in this notebook).
    ref_month
        Reference date in ``YYYY-MM-DD`` form. It filters the customer base and
        account open dates in SQL and is stored as ``CURRENT_DT``.

    Returns
    -------
    pandas.DataFrame
        The query result with ``acct_type_cd`` renamed to ``loan_type`` (mapped
        to a readable label) plus ``CURRENT_DT``, ``days_since_last_payment``,
        ``days_since_loan_closure``, ``loan_active``, ``loan_amt``,
        ``onus_offus_flag`` and ``months_since_open``.

    Raises
    ------
    ValueError
        If ``ref_month`` is missing or is not a ``YYYY-MM-DD`` date. This is
        checked before the query is sent.
    """
    # Parse first so a bad date fails immediately rather than after a long query,
    # and so only a normalised ISO date is interpolated into the SQL text.
    ref_ts = pd.to_datetime(ref_month, format='%Y-%m-%d')
    if pd.isna(ref_ts):
        raise ValueError(f"ref_month must be a YYYY-MM-DD date, got {ref_month!r}")
    ref_date = ref_ts.strftime('%Y-%m-%d')

    query = _tradeline_query(ref_date)
    df = client.query(query).to_dataframe(progress_bar_type='tqdm')

    return _add_tradeline_features(df, ref_ts)

# Bureau Tradeline Feature Generation

In [7]:
def generate_features_in_chunks(tradeline_df, chunk_size=10000):
    """Aggregate tradelines into one wide feature row per customer, chunk by chunk.

    Parameters
    ----------
    tradeline_df : pandas.DataFrame
        Must contain ``customer_id``, ``CURRENT_DT``, ``loan_type``,
        ``onus_offus_flag``, ``acct_key``, ``loan_active``,
        ``days_since_last_payment``, ``days_since_loan_closure`` and
        ``months_since_open``.
    chunk_size : int
        Number of distinct customers aggregated and pivoted per iteration.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(customer_id, CURRENT_DT)`` with three-level columns
        ``(metric, loan_type, onus_offus_flag)``. An input with no customers
        yields an empty frame with that index instead of raising.
    """
    group_keys = ['customer_id', 'CURRENT_DT', 'loan_type', 'onus_offus_flag']
    metrics = [
        'total_loans', 'total_active_loans', 'total_closed_loans',
        'min_days_since_last_payment', 'min_days_since_loan_closure', 'total_active_loans_last_3m'
    ]
    pivot_aggs = {
        'total_loans': 'sum',
        'total_active_loans': 'sum',
        'total_closed_loans': 'sum',
        'min_days_since_last_payment': 'min',
        'min_days_since_loan_closure': 'min',
        'total_active_loans_last_3m': 'sum'
    }

    unique_customers = tradeline_df['customer_id'].unique()  # Unique customer list
    if len(unique_customers) == 0:
        # pd.concat([]) raises, so return an empty frame with the usual index.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['customer_id', 'CURRENT_DT'])
        return pd.DataFrame(index=empty_index)

    result_chunks = []  # To store chunk results

    for i in range(0, len(unique_customers), chunk_size):
        chunk_customers = unique_customers[i:i+chunk_size]  # Select a subset of unique customer_ids
        chunk_df = tradeline_df[tradeline_df['customer_id'].isin(chunk_customers)]  # Filter chunk

        # Built-in aggregations instead of Python lambdas: 'min' already gives
        # NaN for an all-NaN group, and closed loans = group rows - active loans.
        processed_chunk = chunk_df.groupby(group_keys, observed=True).agg(
            total_loans=('acct_key', 'count'),
            total_active_loans=('loan_active', 'sum'),
            group_rows=('loan_active', 'size'),
            min_days_since_last_payment=('days_since_last_payment', 'min'),
            min_days_since_loan_closure=('days_since_loan_closure', 'min'),
        ).reset_index()
        processed_chunk['total_closed_loans'] = (
            processed_chunk['group_rows'] - processed_chunk['total_active_loans']
        )
        processed_chunk = processed_chunk.drop(columns='group_rows')

        # Active loans among accounts opened within the last 3 months
        opened_last_3m = chunk_df[chunk_df['months_since_open'] <= 3]
        exp_data_3_metrics = (
            opened_last_3m.groupby(group_keys, observed=True)['loan_active']
            .sum()
            .reset_index()
            .rename(columns={'loan_active': 'total_active_loans_last_3m'})
        )

        # Merge metrics; groups without a recent account get 0 rather than NaN
        processed_chunk = processed_chunk.merge(exp_data_3_metrics, on=group_keys, how='left')
        processed_chunk[['total_active_loans_last_3m']] = processed_chunk[['total_active_loans_last_3m']].fillna(0)

        # Pivot the data: one row per customer, one column per (metric, loan_type, flag)
        pivot_chunk = processed_chunk.pivot_table(
            index=['customer_id', 'CURRENT_DT'],
            columns=['loan_type', 'onus_offus_flag'],
            values=metrics,
            aggfunc=pivot_aggs
        )

        result_chunks.append(pivot_chunk)  # Store chunk result

        print(f"Processed {min(i+chunk_size, len(unique_customers))} / {len(unique_customers)} unique customers...")

    final_df = pd.concat(result_chunks, ignore_index=False)  # Combine all chunks
    return final_df

In [None]:
# Keep only the first row seen for each (mobilenumber, REF_MONTH) pair.
repeat_mask = final_df.duplicated(subset=['mobilenumber', 'REF_MONTH'])
final_df = final_df[~repeat_mask]
print(final_df.shape)

# Modeling

## Training

In [None]:
# Out-of-time split: the 2024-11-30 reference month is held out for validation,
# every other month is used for training.
holdout_mask = final_df['REF_MONTH'] == '2024-11-30'
validation_df = final_df[holdout_mask]
training_df = final_df[~holdout_mask]

In [None]:
# Model inputs, in the column order the classifier is trained on.
# Commented-out entries are features currently left out of the model.
feature_columns = [
    # Bureau tradeline aggregates
    'min_days_since_loan_closure_personal_loan_off_us',
    'min_days_since_last_payment_personal_loan_off_us',
    # 'total_active_loans_short_term_personal_loan_on_us',
    'min_days_since_loan_closure_consumer_loan_off_us',
    'total_active_loans_last_3m_personal_loan_off_us',
    'total_closed_loans_personal_loan_off_us',
    'total_loans_consumer_loan_off_us',
    'total_loans_personal_loan_off_us',
    'AVG_AMT_INQ',
    # Customer profile
    'cc_consent_flag',
    'GENDER',
    'age_in_years',
    'Annual_Income',
    # Persona / app-usage flags
    'IsParent',
    'IsTechie',
    'IsTraveler',
    'IsSingle',
    'IsUsingDigitalPayment',
    'bankingapps',
    'upiapps',
    'plapps',
    'digigoldapps',
    'IsCarOwner',
    # Occupation indicator columns
    'occ_Salaried',
    'occ_Self Employed',
    'occ_Self Employed Professional',
    # SMS expense features
    'total_expenses_3m',
    'avg_expenses_3m',
    # 'max_expenses_3m',
    'expense_volatility_3m',
    'mom_growth_expenses',
]

X = training_df[feature_columns]
y = training_df['combined_disbursed_target']

In [None]:
# Training set size, then the target distribution as counts and as percentages.
for summary in (
    X.shape,
    y.shape,
    y.value_counts(),
    y.value_counts(normalize=True) * 100,
):
    print(summary)

In [None]:
# Class-imbalance weight for XGBoost: number of negatives per positive.
negative_count = (y == 0).sum()
positive_count = (y == 1).sum()
scale_pos_weight = negative_count / positive_count
print(scale_pos_weight)

In [None]:
# scale_pos_weight (negatives / positives) is computed in the previous cell.
xgb_params = dict(
    n_estimators=400,
    max_depth=3,
    learning_rate=0.3,
    min_child_weight=0.8,
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
)

clfxb = XGBClassifier(**xgb_params)
clfxb.fit(X, y)

In [None]:
# In-sample check: label the training rows at a 0.5 probability cut-off.
train_positive_prob = clfxb.predict_proba(X)[:, 1]
y_predict = pd.Series(np.where(train_positive_prob > 0.5, 1, 0))
print(classification_report(y, y_predict))

In [None]:
from sklearn.metrics import roc_auc_score

# Training Gini = 2 * AUC - 1, using the positive-class probabilities.
pred_prob1 = clfxb.predict_proba(X)[:, 1]
train_auc = roc_auc_score(y, pred_prob1)

print((2*train_auc)-1)

In [None]:
# Model feature importances, largest first, labelled by training column name.
importance_by_feature = pd.Series(clfxb.feature_importances_, index=X.columns)
feature_scores = importance_by_feature.sort_values(ascending=False)
print(feature_scores)

# Validation

In [None]:
# Validation inputs use exactly the training columns, in the same order.
X_val = validation_df.loc[:, X.columns]
# X_val_PAPQ = validation_df['PAPQ_FLAG']
# X_val_affluence = validation_df['abcd_affluence_score_flag']
y_val = validation_df.loc[:, 'combined_disbursed_target']

In [None]:
# Validation set size, then the target distribution as counts and as percentages.
for summary in (
    X_val.shape,
    y_val.shape,
    y_val.value_counts(),
    y_val.value_counts(normalize=True) * 100,
):
    print(summary)
# print(X_val_PAPQ.value_counts())

In [None]:
# validation
# pred_prob1 keeps the positive-class probabilities; the 0/1 labels at the 0.5
# cut-off get their own name. Overwriting pred_prob1 with labels would feed
# labels to any later cell that expects probabilities if the cells are run out
# of order.
pred_prob1 = clfxb.predict_proba(X_val)[:, 1]
y_val_pred = np.where(pred_prob1 > 0.5, 1, 0)
print(classification_report(y_val, y_val_pred))

In [None]:
from sklearn.metrics import roc_auc_score

# Validation Gini = 2 * AUC - 1, using the positive-class probabilities.
pred_prob1 = clfxb.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, pred_prob1)

print((2*val_auc)-1)

In [None]:
# ROC curve on the validation month, with the chance diagonal for reference.
fpr_rs, tpr_rs, thresholds = roc_curve(y_val, pred_prob1)
roc_auc = auc(fpr_rs, tpr_rs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_rs, tpr_rs, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('ROC curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Ensure SHAP visuals are enabled
shap.initjs()

# Compute SHAP values for the validation set
explainer = shap.Explainer(clfxb)
shap_values = explainer(X_val)

# Generate SHAP summary plot for all features.
# show=False leaves the figure open; with the default show=True the plot is
# displayed inside summary_plot and the savefig below writes a blank image.
shap.summary_plot(
    shap_values,
    X_val,
    plot_type="dot",
    max_display=len(X_val.columns),
    show=False,
)

# Save the plot, then display it
plt.savefig('shap_summary_all_features.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Put features, target and predicted probability side by side for the
# validation rows. The probabilities are re-labelled with X_val's index so
# the three pieces line up.
pred_prob_column = pd.Series(pred_prob1, index=X_val.index, name="pred_prob1")

validation_parts = [
    X_val,
    # X_val_PAPQ,
    # X_val_affluence,
    y_val,
    pred_prob_column,
]
val_combined = pd.concat(validation_parts, axis=1)

In [None]:
# Highest predicted probability first
val_combined = val_combined.sort_values('pred_prob1', ascending=False)

In [None]:
# Distribution of predicted probabilities on the validation set
sns.histplot(data=val_combined, x="pred_prob1", bins=50, kde=True)
plt.show()

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))

# # Histogram for all data
# sns.histplot(val_combined["pred_prob1"], bins=50, kde=True, label="All Data", color="skyblue", stat="density")

# # Histogram where PAPQ_FLAG == 1
# sns.histplot(val_combined[val_combined['PAPQ_FLAG'] == 1]["pred_prob1"], 
#              bins=50, kde=True, label="PAPQ_FLAG = 1", color="orange", stat="density")

# plt.title("Histogram of pred_prob1")
# plt.xlabel("pred_prob1")
# plt.ylabel("Density")
# plt.legend()
# plt.show()

In [None]:
# Rank the predictions by probability
val_combined["rank"] = val_combined["pred_prob1"].rank(method="first")
val_combined["Decile"] = pd.qcut(val_combined["rank"], 10, labels=False) + 1

In [None]:
# numpy, pandas and matplotlib are imported in the first cell of the notebook.

# Rank the predictions by probability and bucket them into deciles.
# rank(method="first") breaks ties so qcut always gets ten equal-sized bins;
# decile 10 holds the highest predicted probabilities.
val_combined["rank"] = val_combined["pred_prob1"].rank(method="first")
val_combined["Decile"] = pd.qcut(val_combined["rank"], 10, labels=False) + 1

# Calculate gains: one row per decile, best-scored decile first
gain = (
    val_combined.groupby('Decile')
    .agg(
        num_responses=('combined_disbursed_target', 'sum'),
        total=('combined_disbursed_target', 'count'),
        min_prob=('pred_prob1', 'min'),
        max_prob=('pred_prob1', 'max'),
    )
    .reset_index()
    .sort_values(by="Decile", ascending=False)
)

# Response rate within each decile
gain['response_rate'] = gain['num_responses'] / gain['total']

# Cumulative responders captured, as a count and as a share of all responders
gain['cumulative_response'] = gain['num_responses'].cumsum()
gain['cumulative_response_rate'] = gain['cumulative_response'] / gain['num_responses'].sum()

# Share of the population covered so far (the x-axis of the gains and lift
# charts), computed once and reused below
cumulative_population = gain['total'].cumsum() / gain['total'].sum()

# Lift = share of responders captured / share of population contacted
gain['lift'] = gain['cumulative_response_rate'] / cumulative_population

# Probability range covered by each decile, formatted as "[min, max]"
gain['Decile Range'] = (
    "[" + gain['min_prob'].map("{:.4f}".format)
    + ", " + gain['max_prob'].map("{:.4f}".format) + "]"
)

# Select and reorder columns for clarity
gain = gain[["Decile", "Decile Range", "num_responses", "total",
             "response_rate", "cumulative_response", "cumulative_response_rate", "lift"]]

# Display Gains Table
print(gain)

# Plot Gains Chart (both axes are proportions between 0 and 1)
plt.figure(figsize=(10, 5))
plt.plot(cumulative_population, gain['cumulative_response_rate'], label="Model Gains", color='blue')
plt.plot(cumulative_population, cumulative_population, label="Random (Baseline)", linestyle="dashed", color='gray')

plt.xlabel("Cumulative Share of Population")
plt.ylabel("Cumulative Response Rate")
plt.title("Gains Chart")
plt.legend()
plt.grid()
plt.show()

# Plot Lift Chart
plt.figure(figsize=(10, 5))
plt.plot(cumulative_population, gain['lift'], label="Model Lift", color='green')

plt.xlabel("Cumulative Share of Population")
plt.ylabel("Lift")
plt.title("Lift Chart")
plt.legend()
plt.grid()
plt.show()

# Plot Response Rate (values are proportions, not percentages)
plt.figure(figsize=(10, 5))
plt.plot(gain['Decile'], gain['response_rate'], label="Response Rate per Decile", marker='o', color='purple')
plt.plot(gain['Decile'], gain['cumulative_response_rate'], label="Cumulative Response Rate", marker='s', linestyle="dashed", color='red')

plt.xlabel("Decile")
plt.ylabel("Rate (proportion)")
plt.title("Response Rate by Decile")
plt.legend()
plt.grid()
plt.show()