# Importing Libraries

In [2]:
import xgboost as xgb
from sklearn.metrics import roc_curve, auc
from xgboost.sklearn import XGBClassifier
import pandas as pd
import numpy as np
import pandas_gbq
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from google.cloud import bigquery
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import cloudpickle
pd.set_option('display.max_columns', 500)
client = bigquery.Client()

# Scoring Month

In [3]:
snapshot_period = 'Mar-2025'
date_condition = pd.to_datetime(snapshot_period, format='%b-%Y')
created_date_condition = date_condition.strftime('%Y-%m-%d')
event_date_condition = (date_condition + pd.DateOffset(months=1)).strftime('%Y-%m-%d')
ref_month = (date_condition - pd.DateOffset(days=date_condition.day)).strftime('%Y-%m-%d')
print(f'Event Date Condition {event_date_condition}')
print(f'Scoring Month {created_date_condition}')
print(f'Ref_month {ref_month}')

Event Date Condition 2025-04-01
Scoring Month 2025-03-01
Ref_month 2025-02-28


# PL model Base

In [4]:
QUERY = f"""
-- PL Query --

WITH base AS (
SELECT
    '{snapshot_period}' AS snapshot_month,
    t.id AS customer_id,
    CASE
      WHEN a.final_uninstall_flag = 0 THEN 1
      WHEN a.final_uninstall_flag = 1 THEN 0
      ELSE a.final_uninstall_flag
    END AS install_flag,
    t.createddate AS registration_date
FROM 
    `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer` t
LEFT JOIN 
    `abcd-dataplatform.abcd_data_model.user_activity_raw` a 
ON 
    t.id=a.customer_id
WHERE
    t.createddate < "{created_date_condition}"
),
 
PL AS(
 
SELECT
    '{snapshot_period}' AS snapshot_month,
    a.Customer_id AS customer_id,
    a.target as PL_target,
    a.registration_date AS registration_date,
    a.personal_details_complete_datetime
FROM
(
  SELECT
        t.id AS Customer_id,
        pl.mobilenumber,
        (CASE
          WHEN personal_details_complete = 'Y' THEN 1
          ELSE 0
        END) AS target,
        personal_details_complete_datetime,
        t.createddate AS registration_date,
        ROW_NUMBER() OVER(PARTITION BY pl.mobilenumber ORDER BY personal_details_complete_datetime DESC) AS rn
  FROM `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_pl_customer_detail` pl
  LEFT JOIN `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer` t 
  ON pl.mobilenumber = t.mobilenumber
  WHERE
    personal_details_complete = 'Y' 
    AND personal_details_complete_datetime IS NOT NULL 
    AND t.createddate < "{created_date_condition}"
    
    AND personal_details_complete_datetime >= '{created_date_condition}'
    AND personal_details_complete_datetime < '{event_date_condition}'
    
) AS a
WHERE a.rn = 1
 
)
 
SELECT
      b.snapshot_month,
      b.customer_id,
      b.install_flag,
      b.registration_date,
      coalesce(p.PL_target,0) As PL_target,
      p.personal_details_complete_datetime
 
FROM base AS b
LEFT JOIN 
    PL p 
ON 
    p.customer_id = b.customer_id
WHERE 
    (b.install_flag = 1 
    OR (b.install_flag = 0 and PL_target = 1))
"""

base_df = client.query(QUERY).to_dataframe()
base_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1673937 entries, 0 to 1673936
Data columns (total 6 columns):
 #   Column                              Non-Null Count    Dtype         
---  ------                              --------------    -----         
 0   snapshot_month                      1673937 non-null  object        
 1   customer_id                         1673937 non-null  Int64         
 2   install_flag                        1673937 non-null  Int64         
 3   registration_date                   1673937 non-null  datetime64[us]
 4   PL_target                           1673937 non-null  Int64         
 5   personal_details_complete_datetime  54081 non-null    datetime64[us]
dtypes: Int64(3), datetime64[us](2), object(1)
memory usage: 172.4 MB


In [5]:
base_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime
0,Mar-2025,3091145,1,2024-11-17 20:17:41.799783,0,NaT
1,Mar-2025,3044908,1,2024-11-15 22:55:23.316865,0,NaT
2,Mar-2025,2865868,1,2024-11-09 13:41:52.267362,0,NaT
3,Mar-2025,2823616,1,2024-11-07 18:45:47.084581,0,NaT
4,Mar-2025,2840235,1,2024-11-08 13:57:36.394436,0,NaT


In [6]:
base_df.shape

(1673937, 6)

In [7]:
print(base_df.PL_target.value_counts())
print(base_df.shape)

PL_target
0    1619856
1      54081
Name: count, dtype: Int64
(1673937, 6)


In [8]:
print(base_df.registration_date.max())
print(base_df.registration_date.min())

2025-02-28 23:59:56.569548
2023-08-25 10:13:21.826323


In [9]:
table_name = 'cross_sell_pl_base_scoring_mar25'
#table_name = 'cross_sell_pl_base_scoring_mar'
pandas_gbq.to_gbq(
    dataframe=base_df,
    destination_table=f'abcd_data_science_app.{table_name}',
    project_id='abcd-dataplatform',
    if_exists='replace'
)

100%|██████████| 1/1 [00:00<00:00, 8338.58it/s]


In [10]:
QUERY = f"""

with cust_base as (
select SAFE_CAST(id AS STRING) AS customer_id,
mobilenumber FROM  `abcd-dataplatform-prod.abcd_mobileapp_transformed.ABCDPRODDB_t_customer` a
)

select a.*, b.mobilenumber FROM `abcd-dataplatform.abcd_data_science_app.cross_sell_pl_base_scoring_mar25` a
left join cust_base b on
SAFE_CAST(a.customer_id AS STRING) = b.customer_id

"""

xsell_pl_df = client.query(QUERY).to_dataframe()
xsell_pl_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1673937 entries, 0 to 1673936
Data columns (total 7 columns):
 #   Column                              Non-Null Count    Dtype         
---  ------                              --------------    -----         
 0   snapshot_month                      1673937 non-null  object        
 1   customer_id                         1673937 non-null  Int64         
 2   install_flag                        1673937 non-null  Int64         
 3   registration_date                   1673937 non-null  datetime64[us]
 4   PL_target                           1673937 non-null  Int64         
 5   personal_details_complete_datetime  54081 non-null    datetime64[us]
 6   mobilenumber                        1673937 non-null  object        
dtypes: Int64(3), datetime64[us](2), object(2)
memory usage: 279.4 MB


In [11]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber
0,Mar-2025,3147505,1,2024-11-19 19:07:10.544626,0,NaT,7795408678
1,Mar-2025,3094942,1,2024-11-18 07:01:53.087430,0,NaT,7763821758
2,Mar-2025,3051618,1,2024-11-16 10:37:18.437698,0,NaT,9348064956
3,Mar-2025,3046097,1,2024-11-16 02:13:49.830905,0,NaT,9799906851
4,Mar-2025,3109445,1,2024-11-18 15:15:44.011751,0,NaT,9125945369


In [12]:
xsell_pl_df.install_flag.value_counts()

install_flag
1    1636841
0      37096
Name: count, dtype: Int64

# Bureau Enquiry Data

In [13]:
ref_month

'2025-02-28'

In [14]:
QUERY = f"""

select * FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_ENQ_DAILY_BASE` a
where
    INQ_PURP_CD in ('13', '03','07')
    AND PARSE_DATE('%d/%m/%Y', INQ_DATE) <= DATE '{ref_month}' 
    AND PARSE_DATE('%d/%m/%Y', INQ_DATE) >= DATE_SUB(DATE '{ref_month}', INTERVAL 365 DAY)
    ;
"""

exp_enq_df = client.query(QUERY).to_dataframe()
exp_enq_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10425244 entries, 0 to 10425243
Data columns (total 7 columns):
 #   Column         Dtype              
---  ------         -----              
 0   CUSTOMER_ID    object             
 1   INQ_PURP_CD    object             
 2   INQ_DATE       object             
 3   M_SUB_ID       object             
 4   AMOUNT         object             
 5   SCRUB_DATE     dbdate             
 6   INGESTION_TMS  datetime64[us, UTC]
dtypes: datetime64[us, UTC](1), dbdate(1), object(5)
memory usage: 3.2 GB


In [15]:
exp_enq_df['INQ_DT'] = pd.to_datetime(exp_enq_df['INQ_DATE'],format = '%d/%m/%Y')

In [16]:
print(exp_enq_df.INQ_DT.min())
print(exp_enq_df.INQ_DT.max())

2024-02-29 00:00:00
2025-02-28 00:00:00


In [17]:
xsell_pl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1673937 entries, 0 to 1673936
Data columns (total 7 columns):
 #   Column                              Non-Null Count    Dtype         
---  ------                              --------------    -----         
 0   snapshot_month                      1673937 non-null  object        
 1   customer_id                         1673937 non-null  Int64         
 2   install_flag                        1673937 non-null  Int64         
 3   registration_date                   1673937 non-null  datetime64[us]
 4   PL_target                           1673937 non-null  Int64         
 5   personal_details_complete_datetime  54081 non-null    datetime64[us]
 6   mobilenumber                        1673937 non-null  object        
dtypes: Int64(3), datetime64[us](2), object(2)
memory usage: 94.2+ MB


In [18]:
exp_enq_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10425244 entries, 0 to 10425243
Data columns (total 8 columns):
 #   Column         Dtype              
---  ------         -----              
 0   CUSTOMER_ID    object             
 1   INQ_PURP_CD    object             
 2   INQ_DATE       object             
 3   M_SUB_ID       object             
 4   AMOUNT         object             
 5   SCRUB_DATE     dbdate             
 6   INGESTION_TMS  datetime64[us, UTC]
 7   INQ_DT         datetime64[ns]     
dtypes: datetime64[ns](1), datetime64[us, UTC](1), dbdate(1), object(5)
memory usage: 636.3+ MB


In [19]:
xsell_pl_df['mobilenumber'] = xsell_pl_df['mobilenumber'].astype('string')
exp_enq_df['CUSTOMER_ID'] = exp_enq_df['CUSTOMER_ID'].astype('string')

In [20]:
exp_enq_df_v1 = exp_enq_df[exp_enq_df['CUSTOMER_ID'].isin(xsell_pl_df['mobilenumber'])]

In [21]:
print(exp_enq_df_v1.shape)
print(exp_enq_df_v1.CUSTOMER_ID.nunique())

(3360068, 8)
182260


In [22]:
exp_enq_df_v1.head()

Unnamed: 0,CUSTOMER_ID,INQ_PURP_CD,INQ_DATE,M_SUB_ID,AMOUNT,SCRUB_DATE,INGESTION_TMS,INQ_DT
2,9004010501,13,02/12/2024,PVT,24999,2025-03-24,2025-03-25 16:07:08.643270+00:00,2024-12-02
3,9594349777,13,02/12/2024,PVT,27999,2025-03-24,2025-03-25 16:07:08.643270+00:00,2024-12-02
21,9529797232,13,05/10/2024,NBF,25500,2025-03-24,2025-03-25 16:07:08.643270+00:00,2024-10-05
22,8878032094,13,06/02/2025,PUB,1200000,2025-03-24,2025-03-25 16:07:08.643270+00:00,2025-02-06
33,8017912416,13,08/04/2024,NBF,19435,2025-03-24,2025-03-25 16:07:08.643270+00:00,2024-04-08


In [23]:
exp_enq_df_v1.drop(['INQ_DATE','INGESTION_TMS','SCRUB_DATE'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_enq_df_v1.drop(['INQ_DATE','INGESTION_TMS','SCRUB_DATE'], axis = 1, inplace = True)


In [24]:
exp_enq_df_v1['REF_MONTH'] = pd.to_datetime(ref_month, format = '%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_enq_df_v1['REF_MONTH'] = pd.to_datetime(ref_month, format = '%Y-%m-%d')


In [25]:
exp_enq_df_v1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3360068 entries, 2 to 10425239
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   CUSTOMER_ID  string        
 1   INQ_PURP_CD  object        
 2   M_SUB_ID     object        
 3   AMOUNT       object        
 4   INQ_DT       datetime64[ns]
 5   REF_MONTH    datetime64[ns]
dtypes: datetime64[ns](2), object(3), string(1)
memory usage: 179.4+ MB


In [26]:
exp_enq_df_v1.head()

Unnamed: 0,CUSTOMER_ID,INQ_PURP_CD,M_SUB_ID,AMOUNT,INQ_DT,REF_MONTH
2,9004010501,13,PVT,24999,2024-12-02,2025-02-28
3,9594349777,13,PVT,27999,2024-12-02,2025-02-28
21,9529797232,13,NBF,25500,2024-10-05,2025-02-28
22,8878032094,13,PUB,1200000,2025-02-06,2025-02-28
33,8017912416,13,NBF,19435,2024-04-08,2025-02-28


In [27]:
exp_enq_df_v1["DAYS_SINCE_LAST_INQ"] = (exp_enq_df_v1["REF_MONTH"] - exp_enq_df_v1["INQ_DT"]).dt.days

exp_enq_df_v1['AMOUNT_INT'] = exp_enq_df_v1['AMOUNT'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_enq_df_v1["DAYS_SINCE_LAST_INQ"] = (exp_enq_df_v1["REF_MONTH"] - exp_enq_df_v1["INQ_DT"]).dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_enq_df_v1['AMOUNT_INT'] = exp_enq_df_v1['AMOUNT'].astype(int)


In [28]:
exp_enq_df_v1.head()

Unnamed: 0,CUSTOMER_ID,INQ_PURP_CD,M_SUB_ID,AMOUNT,INQ_DT,REF_MONTH,DAYS_SINCE_LAST_INQ,AMOUNT_INT
2,9004010501,13,PVT,24999,2024-12-02,2025-02-28,88,24999
3,9594349777,13,PVT,27999,2024-12-02,2025-02-28,88,27999
21,9529797232,13,NBF,25500,2024-10-05,2025-02-28,146,25500
22,8878032094,13,PUB,1200000,2025-02-06,2025-02-28,22,1200000
33,8017912416,13,NBF,19435,2024-04-08,2025-02-28,326,19435


In [29]:
min_max_amt = exp_enq_df_v1.groupby('CUSTOMER_ID')['AMOUNT_INT'].agg(['max','mean']).reset_index()

min_max_amt.rename({'max': 'MAX_AMT_INQ', 'mean' : 'AVG_AMT_INQ'}, axis = 1, inplace = True)

In [30]:
exp_enq_df_v1 = exp_enq_df_v1.merge(min_max_amt, on = "CUSTOMER_ID")

In [31]:
exp_enq_df_v1.head()

Unnamed: 0,CUSTOMER_ID,INQ_PURP_CD,M_SUB_ID,AMOUNT,INQ_DT,REF_MONTH,DAYS_SINCE_LAST_INQ,AMOUNT_INT,MAX_AMT_INQ,AVG_AMT_INQ
0,9004010501,13,PVT,24999,2024-12-02,2025-02-28,88,24999,700000,3.624995e+05
1,9594349777,13,PVT,27999,2024-12-02,2025-02-28,88,27999,27999,2.799900e+04
2,9529797232,13,NBF,25500,2024-10-05,2025-02-28,146,25500,50000,2.683333e+04
3,8878032094,13,PUB,1200000,2025-02-06,2025-02-28,22,1200000,1500000,1.190000e+06
4,8017912416,13,NBF,19435,2024-04-08,2025-02-28,326,19435,19435,1.943500e+04
...,...,...,...,...,...,...,...,...,...,...
3360063,7799317860,13,Aditya_Birla,600000,2024-10-24,2025-02-28,127,600000,600000,6.000000e+05
3360064,7799317860,13,Aditya_Birla,600000,2024-10-24,2025-02-28,127,600000,600000,6.000000e+05
3360065,9387486564,13,Aditya_Birla,600000,2024-10-25,2025-02-28,126,600000,1500000,4.357140e+05
3360066,9387486564,13,Aditya_Birla,600000,2024-10-25,2025-02-28,126,600000,1500000,4.357140e+05


In [32]:
time_windows = {
    "3m": 3 * 30,
    "6m": 6 * 30,
    "12m": 12 * 30
}

# Initialize results dictionary
results = exp_enq_df_v1[["CUSTOMER_ID"]].drop_duplicates().set_index("CUSTOMER_ID")

# Compute total enquiries for each time window
for label, days in time_windows.items():
    mask = exp_enq_df_v1["DAYS_SINCE_LAST_INQ"] <= days  # Filter based on days
    total_enquiries = mask.groupby(exp_enq_df_v1["CUSTOMER_ID"]).sum()
    results[f"total_enquiries_last_{label}"] = total_enquiries

# Compute enquiry counts per purpose & institute using pivot tables
for category in ["INQ_PURP_CD", "M_SUB_ID"]:
    temp_df = exp_enq_df_v1.pivot_table(index="CUSTOMER_ID", columns=category, values="DAYS_SINCE_LAST_INQ", aggfunc="count", fill_value=0)
    
    # Add features for each time window
    for label, days in time_windows.items():
        mask = exp_enq_df_v1["DAYS_SINCE_LAST_INQ"] <= days
        counts = exp_enq_df_v1[mask].pivot_table(index="CUSTOMER_ID", columns=category, values="DAYS_SINCE_LAST_INQ", aggfunc="count", fill_value=0)
        
        # Rename columns
        counts.columns = [f"count_{col}_last_{label}" for col in counts.columns]
        
        # Merge with results
        results = results.join(counts, how="left")

# Fill NaN values with 0
results = results.fillna(0).reset_index()

In [33]:
amt_df = exp_enq_df_v1[['CUSTOMER_ID','MAX_AMT_INQ','AVG_AMT_INQ','REF_MONTH']].drop_duplicates()

In [34]:
results = results.merge(amt_df, on = 'CUSTOMER_ID', how = 'left')

In [35]:
results.head()

Unnamed: 0,CUSTOMER_ID,total_enquiries_last_3m,total_enquiries_last_6m,total_enquiries_last_12m,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_ADITYA_last_3m,count_Aditya_Birla_last_3m,count_COB_last_3m,count_FOR_last_3m,count_NBF_last_3m,count_PUB_last_3m,count_PVT_last_3m,count_RRB_last_3m,count_SFB_last_3m,count_ADITYA_last_6m,count_Aditya_Birla_last_6m,count_COB_last_6m,count_FOR_last_6m,count_NBF_last_6m,count_PUB_last_6m,count_PVT_last_6m,count_RRB_last_6m,count_SFB_last_6m,count_ADITYA_last_12m,count_Aditya_Birla_last_12m,count_COB_last_12m,count_FOR_last_12m,count_NBF_last_12m,count_PUB_last_12m,count_PVT_last_12m,count_RRB_last_12m,count_SFB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,REF_MONTH
0,9004010501,1,1,2,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,700000,362499.5,2025-02-28
1,9594349777,1,1,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,27999,27999.0,2025-02-28
2,9529797232,0,3,3,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,50000,26833.33,2025-02-28
3,8878032094,3,4,5,3.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1500000,1190000.0,2025-02-28
4,8017912416,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19435,19435.0,2025-02-28


In [36]:
results.shape

(182260, 37)

In [37]:
# table_name = 't_dg_xsell_model_exp_enq_aggregated_scoring'
table_name = 't_dg_xsell_model_exp_enq_aggregated_scoring_mar25'
pandas_gbq.to_gbq(
    dataframe=results,
    destination_table=f'abcd_data_science_app.{table_name}',
    project_id='abcd-dataplatform',
    if_exists='replace'
)

100%|██████████| 1/1 [00:00<00:00, 13025.79it/s]


# Bureau Consent Data

In [38]:
ref_month

'2025-02-28'

In [39]:
QUERY = f"""

SELECT 

LAST_DAY(DATE(_PARTITIONTIME), MONTH) as EVENT_MONTH,

COALESCE(CAST(mp_user_id AS INT64), CAST(guestid AS INT64)) AS customer_id,

SUM(CASE

      WHEN mp_event_name = 'hd_mm_section_card_click' AND ctatext IN ('credit_track')

      THEN 1 ELSE 0

  END) AS credit_track_click
 
FROM `abcd-dataplatform-prod.abcd_mixpanel_raw.abcd_mp_master_event` mp

where
-- customerid in 
-- (select customer_id FROM `abcd-dataplatform.abcd_data_science_app.t_dg_xsell_model_base_pl`
-- where snapshot_month = 'Dec-2024')  

-- AND

--_PARTITIONTIME = '2024-11-02'

LAST_DAY(DATE(_PARTITIONTIME), MONTH) =  DATE '{ref_month}'

group by 1,2

"""

bureau_consent_df = client.query(QUERY).to_dataframe()
bureau_consent_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637906 entries, 0 to 637905
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   EVENT_MONTH         637906 non-null  dbdate
 1   customer_id         637905 non-null  Int64 
 2   credit_track_click  637906 non-null  Int64 
dtypes: Int64(2), dbdate(1)
memory usage: 15.8 MB


In [40]:
bureau_consent_df.head()

Unnamed: 0,EVENT_MONTH,customer_id,credit_track_click
0,2025-02-28,2512204,3
1,2025-02-28,527033,2
2,2025-02-28,4389876,0
3,2025-02-28,194612,1
4,2025-02-28,1115749,0


In [41]:
bureau_consent_df['customer_id'] = bureau_consent_df['customer_id'].astype('string')
xsell_pl_df['customer_id'] = xsell_pl_df['customer_id'].astype('string')

In [42]:
bureau_consent_df = bureau_consent_df[bureau_consent_df['customer_id'].isin(xsell_pl_df['customer_id'])]

In [43]:
bureau_consent_df.shape

(283990, 3)

# Bureau Tradeline

In [44]:
ref_month

'2025-02-28'

In [45]:
start_time = time.time()

QUERY = f"""

with exp_base as(
select 
scrub_Date,
SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) as OPEN_DATE,
LAST_DAY(SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT), MONTH) as OPEN_MTH,
customer_id,
acct_key,
acct_type_cd,
SAFE.PARSE_DATE('%d/%m/%Y', CLOSED_DT) as CLOSED_DT,
SAFE.PARSE_DATE('%d/%m/%Y', LAST_PAYMENT_DT) as LAST_PAYMENT_DT,
SAFE.PARSE_DATE('%d/%m/%Y', BALANCE_DT) AS REPORTING_DATE,
balance_am,
credit_limit_am,
orig_loan_am,
emi_amt,
M_SUB_ID
FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_AR_DAILY_BASE_UPDATED`
where 
SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) <= DATE '{ref_month}'
AND
SAFE.PARSE_DATE('%d/%m/%Y', OPEN_DT) >= DATE_SUB(DATE '{ref_month}', INTERVAL 730 DAY)

QUALIFY RANK() OVER (PARTITION BY CUSTOMER_ID, ACCT_KEY ORDER BY SCRUB_DATE DESC, SAFE.PARSE_DATE('%d/%m/%Y', BALANCE_DT) DESC) = 1
)

select * from exp_base

"""

exp_tradeline_df = client.query(QUERY).to_dataframe()
exp_tradeline_df.info(memory_usage='deep')

end_time = time.time()

print(f"Execution time: {end_time - start_time:.4f} seconds")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47247773 entries, 0 to 47247772
Data columns (total 14 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   scrub_Date       object
 1   OPEN_DATE        object
 2   OPEN_MTH         object
 3   customer_id      object
 4   acct_key         object
 5   acct_type_cd     object
 6   CLOSED_DT        object
 7   LAST_PAYMENT_DT  object
 8   REPORTING_DATE   object
 9   balance_am       object
 10  credit_limit_am  object
 11  orig_loan_am     object
 12  emi_amt          object
 13  M_SUB_ID         object
dtypes: object(14)
memory usage: 32.0 GB
Execution time: 248.8765 seconds


In [46]:
exp_tradeline_df.head()

Unnamed: 0,scrub_Date,OPEN_DATE,OPEN_MTH,customer_id,acct_key,acct_type_cd,CLOSED_DT,LAST_PAYMENT_DT,REPORTING_DATE,balance_am,credit_limit_am,orig_loan_am,emi_amt,M_SUB_ID
0,2025-03-08,2022-08-18,2022-08-31,9346569441,4278003708,173,2024-03-20,2024-02-27,2024-05-01,0,90500,90500,-1,NBF
1,2025-03-12,2024-10-08,2024-10-31,9346657020,6552033923,123,2024-10-13,2024-10-13,2024-10-15,0,500,500,518,NBF
2,2025-04-01,2025-01-27,2025-01-31,9346679014,6872049160,242,2025-02-17,2025-02-17,2025-02-23,0,1000,1000,-1,NBF
3,2025-04-01,2023-10-06,2023-10-31,9346686623,5378029285,189,,2024-03-03,2025-03-21,2302,13356,13356,2334,NBF
4,2025-03-11,2025-01-06,2025-01-31,9347007133,6772404575,242,,,2025-01-12,1112,1000,1000,-1,NBF


In [47]:
exp_tradeline_df['onus_offus_flag'] = np.where(exp_tradeline_df['M_SUB_ID'].isin(['ADITYA','Aditya_Birla','ABFL']),'on_us','off_us')
exp_tradeline_df.drop('M_SUB_ID', axis = 1,inplace = True)

In [48]:
exp_tradeline_df['CLOSED_DT'] = pd.to_datetime(exp_tradeline_df['CLOSED_DT'], errors='coerce')
exp_tradeline_df['LAST_PAYMENT_DT'] = pd.to_datetime(exp_tradeline_df['LAST_PAYMENT_DT'], errors='coerce')
exp_tradeline_df['OPEN_DATE'] = pd.to_datetime(exp_tradeline_df['OPEN_DATE'], errors='coerce')
exp_tradeline_df['REPORTING_DATE'] = pd.to_datetime(exp_tradeline_df['REPORTING_DATE'], errors='coerce')

In [50]:
exp_tradeline_df['REF_MONTH'] = pd.to_datetime(ref_month, format = '%Y-%m-%d')

In [52]:
exp_tradeline_df['loan_amt'] = np.where(
    exp_tradeline_df['credit_limit_am'].isnull(), 
    exp_tradeline_df['orig_loan_am'].astype(float), 
    exp_tradeline_df['credit_limit_am'].astype(float)
)

In [None]:
exp_tradeline_df[exp_tradeline_df['CLOSED_DT'] > exp_tradeline_df['REF_MONTH']].head()

In [53]:
exp_tradeline_df.loc[exp_tradeline_df['CLOSED_DT'] > exp_tradeline_df['REF_MONTH'], 'CLOSED_DT'] = pd.NaT
exp_tradeline_df.loc[exp_tradeline_df['LAST_PAYMENT_DT'] > exp_tradeline_df['REF_MONTH'], 'LAST_PAYMENT_DT'] = pd.NaT

In [54]:
exp_tradeline_df['days_since_last_payment'] = (exp_tradeline_df['REF_MONTH'] - exp_tradeline_df['LAST_PAYMENT_DT']).dt.days
exp_tradeline_df['days_since_loan_closure'] = (exp_tradeline_df['REF_MONTH'] - exp_tradeline_df['CLOSED_DT']).dt.days

In [55]:
exp_tradeline_df['loan_active'] = exp_tradeline_df['CLOSED_DT'].isna().astype(int)

In [56]:
acct_type_mapping = {
    '5': 'credit_card',
    '123': 'personal_loan',
    '242': 'short_term_personal_loan',
    '189': 'consumer_loan',
    '191': 'gold_loan'
}

exp_tradeline_df['loan_type'] = exp_tradeline_df['acct_type_cd'].map(acct_type_mapping).fillna('others')

In [57]:
exp_tradeline_df['emi_amt'] = exp_tradeline_df['emi_amt'].astype(int)
exp_tradeline_df['balance_am'] = exp_tradeline_df['balance_am'].astype(int)
exp_tradeline_df['emi_amt'] = exp_tradeline_df["emi_amt"].mask(exp_tradeline_df["emi_amt"] <= 0, 0)

In [58]:
exp_tradeline_df.drop(['credit_limit_am','orig_loan_am'], axis = 1, inplace = True)

In [59]:
# Convert necessary columns to correct types
exp_tradeline_df['loan_active'] = exp_tradeline_df['loan_active'].astype(int)

In [60]:
exp_tradeline_df = exp_tradeline_df[~exp_tradeline_df['loan_type'].isin(['others'])]

In [61]:
exp_tradeline_df.shape

(43343331, 18)

In [62]:
exp_tradeline_df['customer_id'] = exp_tradeline_df['customer_id'].astype('string')
xsell_pl_df['mobilenumber'] = xsell_pl_df['mobilenumber'].astype('string')

In [63]:
exp_tradeline_df = exp_tradeline_df[exp_tradeline_df['customer_id'].isin(xsell_pl_df['mobilenumber'])]

In [64]:
exp_tradeline_df.shape

(12411670, 18)

In [65]:
exp_tradeline_df['rank'] = exp_tradeline_df['customer_id'].rank(method='dense').astype(int)

In [66]:
print(exp_tradeline_df.shape)
print(exp_tradeline_df['rank'].nunique())

(12411670, 19)
723432


In [67]:
exp_tradeline_df.head()

Unnamed: 0,scrub_Date,OPEN_DATE,OPEN_MTH,customer_id,acct_key,acct_type_cd,CLOSED_DT,LAST_PAYMENT_DT,REPORTING_DATE,balance_am,emi_amt,onus_offus_flag,REF_MONTH,loan_amt,days_since_last_payment,days_since_loan_closure,loan_active,loan_type,rank
1,2025-03-12,2024-10-08,2024-10-31,9346657020,6552033923,123,2024-10-13,2024-10-13,2024-10-15,0,518,off_us,2025-02-28,500.0,138.0,138.0,0,personal_loan,448096
4,2025-03-11,2025-01-06,2025-01-31,9347007133,6772404575,242,NaT,NaT,2025-01-12,1112,0,off_us,2025-02-28,1000.0,,,1,short_term_personal_loan,448433
7,2025-03-11,2024-04-28,2024-04-30,9347331844,6043722432,242,2024-05-27,2024-05-27,2024-06-02,0,0,off_us,2025-02-28,2000.0,277.0,277.0,0,short_term_personal_loan,448871
8,2025-03-08,2024-08-02,2024-08-31,9347378893,6401670884,242,2025-01-23,2025-01-23,2025-01-26,0,0,off_us,2025-02-28,1000.0,36.0,36.0,0,short_term_personal_loan,448926
10,2025-04-03,2024-09-01,2024-09-30,9347579245,6410341967,242,2024-09-30,2024-09-30,2024-10-06,0,0,off_us,2025-02-28,1000.0,151.0,151.0,0,short_term_personal_loan,449173


In [68]:
def process_in_chunks(df, chunk_size=10000):
    """
    Processes large dataframe in chunks based on unique_customer_id.
    
    Parameters:
        df (pd.DataFrame): The input dataframe.
        chunk_size (int): Number of unique customer_ids to process in each chunk.

    Returns:
        pd.DataFrame: Aggregated results.
    """
    unique_customers = df['rank'].unique()  # Unique customer list
    result_chunks = []  # To store chunk results

    active_loans = df['loan_active'] == 1
    emi_active_loans = (df['emi_amt'] > 0) & active_loans

    for i in range(0, len(unique_customers), chunk_size):
        chunk_customers = unique_customers[i:i+chunk_size]  # Select a subset of unique customer_ids
        chunk_df = df[df['rank'].isin(chunk_customers)]  # Filter chunk
        
        grouped_chunk = chunk_df.groupby(
            ['customer_id', 'REF_MONTH', 'loan_type', 'onus_offus_flag'], observed=True
        ).agg(
            total_loans=('acct_key', 'count'),
            total_active_loans=('loan_active', 'sum'),
            total_closed_loans=('loan_active', lambda x: len(x) - x.sum()),
            min_loan_amt=('loan_amt', 'min'),
            max_loan_amt=('loan_amt', 'max'),

            min_days_since_last_payment=('days_since_last_payment', lambda x: np.nan if x.isna().all() else x.min()),
            max_days_since_last_payment=('days_since_last_payment', lambda x: np.nan if x.isna().all() else x.max()),

            min_days_since_loan_closure=('days_since_loan_closure', lambda x: np.nan if x.isna().all() else x.min()),
            max_days_since_loan_closure=('days_since_loan_closure', lambda x: np.nan if x.isna().all() else x.max()),

            total_emi_loans=('emi_amt', lambda x: emi_active_loans.loc[x.index].sum())
        ).reset_index()
        
        result_chunks.append(grouped_chunk)  # Store chunk result

        print(f"Processed {min(i+chunk_size, len(unique_customers))} / {len(unique_customers)} unique customers...")

    final_df = pd.concat(result_chunks, ignore_index=True)  # Combine all chunks
    return final_df

In [69]:
processed_df = process_in_chunks(exp_tradeline_df, chunk_size=10000)

Processed 10000 / 723432 unique customers...
Processed 20000 / 723432 unique customers...
Processed 30000 / 723432 unique customers...
Processed 40000 / 723432 unique customers...
Processed 50000 / 723432 unique customers...
Processed 60000 / 723432 unique customers...
Processed 70000 / 723432 unique customers...
Processed 80000 / 723432 unique customers...
Processed 90000 / 723432 unique customers...
Processed 100000 / 723432 unique customers...
Processed 110000 / 723432 unique customers...
Processed 120000 / 723432 unique customers...
Processed 130000 / 723432 unique customers...
Processed 140000 / 723432 unique customers...
Processed 150000 / 723432 unique customers...
Processed 160000 / 723432 unique customers...
Processed 170000 / 723432 unique customers...
Processed 180000 / 723432 unique customers...
Processed 190000 / 723432 unique customers...
Processed 200000 / 723432 unique customers...
Processed 210000 / 723432 unique customers...
Processed 220000 / 723432 unique customers.

In [46]:
processed_df.customer_id.nunique()

842104

In [70]:
processed_df.head()

Unnamed: 0,customer_id,REF_MONTH,loan_type,onus_offus_flag,total_loans,total_active_loans,total_closed_loans,min_loan_amt,max_loan_amt,min_days_since_last_payment,max_days_since_last_payment,min_days_since_loan_closure,max_days_since_loan_closure,total_emi_loans
0,8112207954,2025-02-28,consumer_loan,off_us,2,1,1,1000.0,10000.0,58.0,492.0,18.0,18.0,0
1,8112207954,2025-02-28,credit_card,off_us,3,3,0,25000.0,66000.0,17.0,17.0,,,0
2,8112207954,2025-02-28,personal_loan,off_us,22,3,19,300.0,80000.0,1.0,557.0,1.0,557.0,0
3,8112207954,2025-02-28,short_term_personal_loan,off_us,40,6,34,500.0,6000.0,24.0,337.0,24.0,337.0,0
4,8120871167,2025-02-28,consumer_loan,off_us,3,2,1,8776.0,17999.0,52.0,210.0,177.0,177.0,1


In [71]:
exp_tradeline_df['months_since_open'] = (exp_tradeline_df['REF_MONTH'] - exp_tradeline_df['OPEN_DATE']).dt.days // 30

recent_loans = exp_tradeline_df[exp_tradeline_df['months_since_open'] <= 6] 

exp_data_l3_metrics = recent_loans[recent_loans['months_since_open'] <= 3].groupby(
    ['customer_id', 'REF_MONTH', 'loan_type', 'onus_offus_flag'], observed=True
)['loan_active'].sum().reset_index().rename(columns={'loan_active': 'total_active_loans_last_3m'})

exp_data_l6_metrics = recent_loans.groupby(
    ['customer_id', 'REF_MONTH', 'loan_type', 'onus_offus_flag'], observed=True
)['loan_active'].sum().reset_index().rename(columns={'loan_active': 'total_active_loans_last_6m'})


processed_df = processed_df.merge(exp_data_l3_metrics, on=['customer_id', 'REF_MONTH', 'loan_type', 'onus_offus_flag'], how='left')
processed_df = processed_df.merge(exp_data_l6_metrics, on=['customer_id', 'REF_MONTH', 'loan_type', 'onus_offus_flag'], how='left')

processed_df[['total_active_loans_last_3m','total_active_loans_last_6m']] = processed_df[['total_active_loans_last_3m','total_active_loans_last_6m']].fillna(0)

In [77]:
processed_df[processed_df['customer_id'] == '7845202413']

Unnamed: 0,customer_id,REF_MONTH,loan_type,onus_offus_flag,total_loans,total_active_loans,total_closed_loans,min_loan_amt,max_loan_amt,min_days_since_last_payment,max_days_since_last_payment,min_days_since_loan_closure,max_days_since_loan_closure,total_emi_loans,total_active_loans_last_3m,total_active_loans_last_6m
2,7845202413,2025-03-31,credit_card,off_us,1,1,0,192000.0,192000.0,66.0,66.0,,,0,0.0,0.0
3,7845202413,2025-03-31,personal_loan,off_us,2,1,1,300000.0,700000.0,83.0,83.0,82.0,82.0,1,0.0,0.0
4,7845202413,2025-03-31,short_term_personal_loan,off_us,1,1,0,30000.0,30000.0,58.0,58.0,,,0,0.0,0.0


In [72]:
metrics = [
    'total_loans', 'total_active_loans', 'total_closed_loans', 'min_loan_amt', 'max_loan_amt',
    'min_days_since_last_payment', 'max_days_since_last_payment',
    'min_days_since_loan_closure', 'max_days_since_loan_closure', 'total_emi_loans',
    'total_active_loans_last_3m', 'total_active_loans_last_6m'
]

# Pivot the dataframe to flatten all metrics
bureau_td_pivot_df = processed_df.pivot_table(
    index=['customer_id', 'REF_MONTH'],
    columns=['loan_type', 'onus_offus_flag'],
    values=metrics,
    aggfunc={
        'total_loans': 'sum',
        'total_active_loans': 'sum',
        'total_closed_loans': 'sum',
        'min_loan_amt': 'min',
        'max_loan_amt': 'max',
        'min_days_since_last_payment': 'min',
        'min_days_since_loan_closure': 'min',
        'total_emi_loans': 'sum',
        'total_active_loans_last_3m': 'sum',
        'total_active_loans_last_6m': 'sum'
    }
    # fill_value=0  # Fill NaN values with 0
)

# Flatten MultiIndex columns
bureau_td_pivot_df.columns = ['_'.join(col).strip() for col in bureau_td_pivot_df.columns]

# Reset index to make it a normal dataframe
bureau_td_pivot_df.reset_index(inplace=True)

In [73]:
bureau_td_pivot_df.head()

Unnamed: 0,customer_id,REF_MONTH,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us
0,6000013391,2025-02-28,,,,,,2000.0,,,,,,,424.0,,,,,,,424.0,,,,,,,500.0,,,,,,,5.0,,,,,,,0.0,,,,,,,0.0,,,,,,,14.0,,,,,,,0.0,,,,,,,19.0,
1,6000019141,2025-02-28,,,,,,13000.0,,,,,,,41.0,,,,,,,102.0,,,,,,,700.0,,,,,,,1.0,,,,,,,1.0,,,,,,,1.0,,,,,,,2.0,,,,,,,1.0,,,,,,,3.0,
2,6000020305,2025-02-28,21440.0,,,,,,,,,,,,,,,,,,,,,21440.0,,,,,,,1.0,,,,,,,0.0,,,,,,,1.0,,,,,,,0.0,,,,,,,1.0,,,,,,,1.0,,,,,,
3,6000025460,2025-02-28,,,,8800.0,,2200.0,,,,,,,168.0,,,,,129.0,,265.0,,,,,8800.0,,1000.0,,,,,0.0,,3.0,,,,,0.0,,0.0,,,,,0.0,,0.0,,,,,1.0,,6.0,,,,,0.0,,1.0,,,,,1.0,,9.0,
4,6000025484,2025-02-28,,,,1000.0,,2000.0,,,,,525.0,,20.0,,,,,525.0,,20.0,,,,,1000.0,,500.0,,,,,0.0,,1.0,,,,,0.0,,1.0,,,,,0.0,,1.0,,,,,1.0,,9.0,,,,,0.0,,0.0,,,,,1.0,,10.0,


In [74]:
bureau_td_pivot_df[bureau_td_pivot_df['customer_id'] == '7845202413']

Unnamed: 0,customer_id,REF_MONTH,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us
157306,7845202413,2025-02-28,,192000.0,,700000.0,,30000.0,,,35.0,,52.0,,27.0,,,,,51.0,,,,,192000.0,,300000.0,,30000.0,,,1.0,,1.0,,1.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,1.0,,0.0,,,0.0,,1.0,,0.0,,,1.0,,2.0,,1.0,


In [75]:
# table_name = 't_dg_bureau_tradeline_agg_scoring'
table_name = 't_dg_bureau_tradeline_agg_scoring_mar25'
pandas_gbq.to_gbq(
    dataframe=bureau_td_pivot_df,
    destination_table=f'abcd_data_science_app.{table_name}',
    project_id='abcd-dataplatform',
    if_exists='replace'
)

100%|██████████| 1/1 [00:00<00:00, 12748.64it/s]


In [76]:
del(exp_tradeline_df)

# Bureau Demographics

In [77]:
QUERY = f"""
with base as (
select 
last_day(SCRUB_DATE,MONTH) as REF_MONTH,
CUSTOMER_ID,
max(income) as CUST_INCOME,
max(income_freq) INCOME_FREQUENCY,
max(OCCUP_STATUS_CD) as OCCUPATION,
from `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_EMPLOYMENT_DAILY_BASE`
group by 
REF_MONTH,
CUSTOMER_ID
)

select 
REF_MONTH,CUSTOMER_ID,
CASE WHEN OCCUPATION = '9' THEN "1"
     WHEN OCCUPATION = '10' THEN "2"
     WHEN OCCUPATION = '2' THEN "3"
     WHEN OCCUPATION = '99' THEN "4"
     ELSE null END as OCCUPATION,

  CASE 
    WHEN INCOME_FREQUENCY = '4' THEN CAST(CUST_INCOME as INT) * 3
    WHEN INCOME_FREQUENCY = '6' THEN CAST(CUST_INCOME as INT) * 2
    ELSE CAST(CUST_INCOME as INT)
  END AS Annual_Income

from base
"""

bureau_inc_df = client.query(QUERY).to_dataframe()
bureau_inc_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572348 entries, 0 to 1572347
Data columns (total 4 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   REF_MONTH      1572348 non-null  dbdate
 1   CUSTOMER_ID    1572348 non-null  object
 2   OCCUPATION     1521615 non-null  object
 3   Annual_Income  677053 non-null   Int64 
dtypes: Int64(1), dbdate(1), object(2)
memory usage: 211.3 MB


In [78]:
bureau_inc_df = bureau_inc_df[bureau_inc_df['CUSTOMER_ID'].isin(xsell_pl_df['mobilenumber'])]

In [79]:
bureau_inc_df.rename({'CUSTOMER_ID':'mobilenumber'}, axis=1, inplace=True)
bureau_inc_df['REF_MONTH'] = pd.to_datetime(bureau_inc_df['REF_MONTH'])

In [80]:
QUERY = f"""

select 
a.CUSTOMER_ID,
a.DOB,
a.GENDER
 FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_NAME_DOB_DAILY_BASE` a

"""

bureau_dob_df = client.query(QUERY).to_dataframe()
bureau_dob_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889977 entries, 0 to 2889976
Data columns (total 3 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   CUSTOMER_ID  object
 1   DOB          object
 2   GENDER       object
dtypes: object(3)
memory usage: 526.1 MB


In [81]:
bureau_dob_df.rename({'CUSTOMER_ID':'mobilenumber'}, axis=1, inplace=True)

In [82]:
print(bureau_dob_df.shape)

(2889977, 3)


In [83]:
bureau_dob_df.head()

Unnamed: 0,mobilenumber,DOB,GENDER
0,9866720352,01/01/1965,
1,9198510187,01/01/1977,
2,9963072002,01/01/1983,
3,9035955335,01/01/1983,
4,9953286479,01/01/1985,


# Feature Engineering for Scoring Month

In [84]:
print(xsell_pl_df.shape)
print(xsell_pl_df.head())

(1673937, 7)
  snapshot_month customer_id  install_flag          registration_date  \
0       Mar-2025     3147505             1 2024-11-19 19:07:10.544626   
1       Mar-2025     3094942             1 2024-11-18 07:01:53.087430   
2       Mar-2025     3051618             1 2024-11-16 10:37:18.437698   
3       Mar-2025     3046097             1 2024-11-16 02:13:49.830905   
4       Mar-2025     3109445             1 2024-11-18 15:15:44.011751   

   PL_target personal_details_complete_datetime mobilenumber  
0          0                                NaT   7795408678  
1          0                                NaT   7763821758  
2          0                                NaT   9348064956  
3          0                                NaT   9799906851  
4          0                                NaT   9125945369  


In [85]:
print(snapshot_period)
print(ref_month)

Mar-2025
2025-02-28


In [86]:
ref_month_df = {"snapshot_month": [snapshot_period],
                "REF_MONTH": [ref_month]}

ref_month_df = pd.DataFrame(ref_month_df)
ref_month_df['REF_MONTH'] = pd.to_datetime(ref_month_df['REF_MONTH'])

In [87]:
print(ref_month_df)

  snapshot_month  REF_MONTH
0       Mar-2025 2025-02-28


In [88]:
xsell_pl_df = xsell_pl_df.merge(ref_month_df, on = 'snapshot_month', how = 'left')

In [72]:
# xsell_pl_df.drop(['REF_MONTH_x','REF_MONTH_y'],axis=1,inplace=True)

In [89]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH
0,Mar-2025,3147505,1,2024-11-19 19:07:10.544626,0,NaT,7795408678,2025-02-28
1,Mar-2025,3094942,1,2024-11-18 07:01:53.087430,0,NaT,7763821758,2025-02-28
2,Mar-2025,3051618,1,2024-11-16 10:37:18.437698,0,NaT,9348064956,2025-02-28
3,Mar-2025,3046097,1,2024-11-16 02:13:49.830905,0,NaT,9799906851,2025-02-28
4,Mar-2025,3109445,1,2024-11-18 15:15:44.011751,0,NaT,9125945369,2025-02-28


### Bureau Consent Merged

In [90]:
xsell_pl_df

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH
0,Mar-2025,3147505,1,2024-11-19 19:07:10.544626,0,NaT,7795408678,2025-02-28
1,Mar-2025,3094942,1,2024-11-18 07:01:53.087430,0,NaT,7763821758,2025-02-28
2,Mar-2025,3051618,1,2024-11-16 10:37:18.437698,0,NaT,9348064956,2025-02-28
3,Mar-2025,3046097,1,2024-11-16 02:13:49.830905,0,NaT,9799906851,2025-02-28
4,Mar-2025,3109445,1,2024-11-18 15:15:44.011751,0,NaT,9125945369,2025-02-28
...,...,...,...,...,...,...,...,...
1673932,Mar-2025,1623072,1,2024-09-03 16:41:06.735368,0,NaT,9373361835,2025-02-28
1673933,Mar-2025,3884711,1,2024-12-14 12:41:29.595897,0,NaT,8539854550,2025-02-28
1673934,Mar-2025,3865720,1,2024-12-13 13:39:17.477126,0,NaT,9786224984,2025-02-28
1673935,Mar-2025,3808853,1,2024-12-11 11:10:48.665999,0,NaT,8800832484,2025-02-28


In [91]:
xsell_pl_df['mobilenumber'] = xsell_pl_df['mobilenumber'].astype('string')
xsell_pl_df['REF_MONTH'] = pd.to_datetime(xsell_pl_df['REF_MONTH'], format='%Y-%m').dt.date

In [92]:
bureau_consent_df['cc_consent_flag'] = np.where(bureau_consent_df['credit_track_click'] > 0, 1,0)
bureau_consent_df.rename({'EVENT_MONTH':'REF_MONTH'}, axis=1, inplace = True)

In [93]:
bureau_consent_df.cc_consent_flag.value_counts()

cc_consent_flag
0    266373
1     17617
Name: count, dtype: int64

In [94]:
print(xsell_pl_df.shape)
xsell_pl_df = xsell_pl_df.merge(bureau_consent_df[['REF_MONTH','customer_id','cc_consent_flag']], on = ['REF_MONTH','customer_id'], how = 'left')
print(xsell_pl_df.shape)

(1673937, 8)
(1673937, 9)


### Bureau DOB & Income Merged

In [95]:
print(xsell_pl_df.shape)
xsell_pl_df['REF_MONTH'] = pd.to_datetime(xsell_pl_df['REF_MONTH'])
xsell_pl_df = xsell_pl_df.merge(bureau_dob_df, on = ['mobilenumber'], how='left')
xsell_pl_df = xsell_pl_df.merge(bureau_inc_df, on = ['REF_MONTH','mobilenumber'], how='left')
xsell_pl_df = xsell_pl_df.drop_duplicates()
print(xsell_pl_df.shape)

(1673937, 9)
(1673937, 13)


In [96]:
xsell_pl_df.shape

(1673937, 13)

In [97]:
xsell_pl_df['DOB'] = pd.to_datetime(xsell_pl_df['DOB'], format='%d/%m/%Y')

# Calculate age in years
xsell_pl_df['age_in_years'] = (xsell_pl_df['REF_MONTH'].dt.year - xsell_pl_df['DOB'].dt.year) - (
    (xsell_pl_df['REF_MONTH'].dt.month < xsell_pl_df['DOB'].dt.month) | 
    ((xsell_pl_df['REF_MONTH'].dt.month == xsell_pl_df['DOB'].dt.month) & (xsell_pl_df['DOB'].dt.day < xsell_pl_df['DOB'].dt.day))
)

In [98]:
xsell_pl_df['GENDER'] = pd.to_numeric(xsell_pl_df['GENDER'], errors='coerce')
xsell_pl_df['OCCUPATION'] = pd.to_numeric(xsell_pl_df['OCCUPATION'], errors='coerce')

In [99]:
xsell_pl_df.drop('DOB',axis=1,inplace=True)

In [100]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years
0,Mar-2025,3147505,1,2024-11-19 19:07:10.544626,0,NaT,7795408678,2025-02-28,,,,,
1,Mar-2025,3094942,1,2024-11-18 07:01:53.087430,0,NaT,7763821758,2025-02-28,0.0,1.0,,,40.0
2,Mar-2025,3051618,1,2024-11-16 10:37:18.437698,0,NaT,9348064956,2025-02-28,,2.0,,,21.0
3,Mar-2025,3046097,1,2024-11-16 02:13:49.830905,0,NaT,9799906851,2025-02-28,,2.0,,,48.0
4,Mar-2025,3109445,1,2024-11-18 15:15:44.011751,0,NaT,9125945369,2025-02-28,,2.0,,,32.0


In [101]:
xsell_pl_df.notna().mean()*100

snapshot_month                        100.000000
customer_id                           100.000000
install_flag                          100.000000
registration_date                     100.000000
PL_target                             100.000000
personal_details_complete_datetime      3.230767
mobilenumber                          100.000000
REF_MONTH                             100.000000
cc_consent_flag                        16.965394
GENDER                                 48.684090
OCCUPATION                              1.081164
Annual_Income                           0.510772
age_in_years                           50.218318
dtype: float64

### Bureau Enquiry

In [42]:
# QUERY = f"""

# select * FROM `abcd-dataplatform.abcd_data_science_app.t_dg_xsell_model_exp_enq_aggregated_scoring`

# """

# bureau_inq_df = client.query(QUERY).to_dataframe()
# bureau_inq_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221367 entries, 0 to 221366
Data columns (total 37 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   CUSTOMER_ID                  221367 non-null  object        
 1   total_enquiries_last_3m      221367 non-null  Int64         
 2   total_enquiries_last_6m      221367 non-null  Int64         
 3   total_enquiries_last_12m     221367 non-null  Int64         
 4   count_13_last_3m             221367 non-null  float64       
 5   count_13_last_6m             221367 non-null  float64       
 6   count_13_last_12m            221367 non-null  float64       
 7   count_ADITYA_last_3m         221367 non-null  float64       
 8   count_Aditya_Birla_last_3m   221367 non-null  float64       
 9   count_COB_last_3m            221367 non-null  float64       
 10  count_FOR_last_3m            221367 non-null  float64       
 11  count_NBF_last_3m         

In [103]:
bureau_inq_df = results.copy()

In [104]:
bureau_inq_df.shape

(182260, 37)

In [105]:
bureau_inq_df.rename({'CUSTOMER_ID':'mobilenumber'}, axis =1 ,inplace = True)
bureau_inq_df['mobilenumber'] = bureau_inq_df['mobilenumber'].astype('string')

In [106]:
bureau_inq_df.head()

Unnamed: 0,mobilenumber,total_enquiries_last_3m,total_enquiries_last_6m,total_enquiries_last_12m,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_ADITYA_last_3m,count_Aditya_Birla_last_3m,count_COB_last_3m,count_FOR_last_3m,count_NBF_last_3m,count_PUB_last_3m,count_PVT_last_3m,count_RRB_last_3m,count_SFB_last_3m,count_ADITYA_last_6m,count_Aditya_Birla_last_6m,count_COB_last_6m,count_FOR_last_6m,count_NBF_last_6m,count_PUB_last_6m,count_PVT_last_6m,count_RRB_last_6m,count_SFB_last_6m,count_ADITYA_last_12m,count_Aditya_Birla_last_12m,count_COB_last_12m,count_FOR_last_12m,count_NBF_last_12m,count_PUB_last_12m,count_PVT_last_12m,count_RRB_last_12m,count_SFB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,REF_MONTH
0,9004010501,1,1,2,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,700000,362499.5,2025-02-28
1,9594349777,1,1,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,27999,27999.0,2025-02-28
2,9529797232,0,3,3,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,50000,26833.33,2025-02-28
3,8878032094,3,4,5,3.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1500000,1190000.0,2025-02-28
4,8017912416,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19435,19435.0,2025-02-28


In [107]:
bureau_inq_df['REF_MONTH'] = pd.to_datetime(bureau_inq_df['REF_MONTH'], format='%Y-%m').dt.date
xsell_pl_df['REF_MONTH'] = pd.to_datetime(xsell_pl_df['REF_MONTH'], format='%Y-%m').dt.date

In [108]:
print(xsell_pl_df.shape)
xsell_pl_df = xsell_pl_df.merge(bureau_inq_df, on = ['REF_MONTH','mobilenumber'], how = 'left')
print(xsell_pl_df.shape)

(1673937, 13)
(1673937, 48)


In [109]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,total_enquiries_last_3m,total_enquiries_last_6m,total_enquiries_last_12m,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_ADITYA_last_3m,count_Aditya_Birla_last_3m,count_COB_last_3m,count_FOR_last_3m,count_NBF_last_3m,count_PUB_last_3m,count_PVT_last_3m,count_RRB_last_3m,count_SFB_last_3m,count_ADITYA_last_6m,count_Aditya_Birla_last_6m,count_COB_last_6m,count_FOR_last_6m,count_NBF_last_6m,count_PUB_last_6m,count_PVT_last_6m,count_RRB_last_6m,count_SFB_last_6m,count_ADITYA_last_12m,count_Aditya_Birla_last_12m,count_COB_last_12m,count_FOR_last_12m,count_NBF_last_12m,count_PUB_last_12m,count_PVT_last_12m,count_RRB_last_12m,count_SFB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ
0,Mar-2025,3147505,1,2024-11-19 19:07:10.544626,0,NaT,7795408678,2025-02-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Mar-2025,3094942,1,2024-11-18 07:01:53.087430,0,NaT,7763821758,2025-02-28,0.0,1.0,,,40.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Mar-2025,3051618,1,2024-11-16 10:37:18.437698,0,NaT,9348064956,2025-02-28,,2.0,,,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Mar-2025,3046097,1,2024-11-16 02:13:49.830905,0,NaT,9799906851,2025-02-28,,2.0,,,48.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Mar-2025,3109445,1,2024-11-18 15:15:44.011751,0,NaT,9125945369,2025-02-28,,2.0,,,32.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [110]:
xsell_pl_df.notna().mean()*100

snapshot_month                        100.000000
customer_id                           100.000000
install_flag                          100.000000
registration_date                     100.000000
PL_target                             100.000000
personal_details_complete_datetime      3.230767
mobilenumber                          100.000000
REF_MONTH                             100.000000
cc_consent_flag                        16.965394
GENDER                                 48.684090
OCCUPATION                              1.081164
Annual_Income                           0.510772
age_in_years                           50.218318
total_enquiries_last_3m                10.888104
total_enquiries_last_6m                10.888104
total_enquiries_last_12m               10.888104
count_13_last_3m                       10.888104
count_13_last_6m                       10.888104
count_13_last_12m                      10.888104
count_ADITYA_last_3m                   10.888104
count_Aditya_Birla_l

### Bureau Tradeline Merged

In [50]:
# start_time = time.time()

# QUERY = f"""

# select * FROM
# `abcd-dataplatform.abcd_data_science_app.t_dg_bureau_tradeline_agg_scoring`

# """

# bureau_td_pivot_df = client.query(QUERY).to_dataframe()
# bureau_td_pivot_df.info(memory_usage='deep')

# end_time = time.time()

# print(f"Execution time: {end_time - start_time:.4f} seconds")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 956527 entries, 0 to 956526
Data columns (total 72 columns):
 #   Column                                                       Non-Null Count   Dtype         
---  ------                                                       --------------   -----         
 0   customer_id                                                  956527 non-null  object        
 1   REF_MONTH                                                    956527 non-null  datetime64[us]
 2   max_loan_amt_consumer_loan_off_us                            613674 non-null  float64       
 3   max_loan_amt_credit_card_off_us                              328473 non-null  float64       
 4   max_loan_amt_gold_loan_off_us                                245748 non-null  float64       
 5   max_loan_amt_personal_loan_off_us                            682359 non-null  float64       
 6   max_loan_amt_personal_loan_on_us                             84487 non-null   float64       
 7   ma

In [113]:
bureau_td_pivot_df.head()

Unnamed: 0,customer_id,REF_MONTH,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us
0,6000013391,2025-02-28,,,,,,2000.0,,,,,,,424.0,,,,,,,424.0,,,,,,,500.0,,,,,,,5.0,,,,,,,0.0,,,,,,,0.0,,,,,,,14.0,,,,,,,0.0,,,,,,,19.0,
1,6000019141,2025-02-28,,,,,,13000.0,,,,,,,41.0,,,,,,,102.0,,,,,,,700.0,,,,,,,1.0,,,,,,,1.0,,,,,,,1.0,,,,,,,2.0,,,,,,,1.0,,,,,,,3.0,
2,6000020305,2025-02-28,21440.0,,,,,,,,,,,,,,,,,,,,,21440.0,,,,,,,1.0,,,,,,,0.0,,,,,,,1.0,,,,,,,0.0,,,,,,,1.0,,,,,,,1.0,,,,,,
3,6000025460,2025-02-28,,,,8800.0,,2200.0,,,,,,,168.0,,,,,129.0,,265.0,,,,,8800.0,,1000.0,,,,,0.0,,3.0,,,,,0.0,,0.0,,,,,0.0,,0.0,,,,,1.0,,6.0,,,,,0.0,,1.0,,,,,1.0,,9.0,
4,6000025484,2025-02-28,,,,1000.0,,2000.0,,,,,525.0,,20.0,,,,,525.0,,20.0,,,,,1000.0,,500.0,,,,,0.0,,1.0,,,,,0.0,,1.0,,,,,0.0,,1.0,,,,,1.0,,9.0,,,,,0.0,,0.0,,,,,1.0,,10.0,


In [114]:
bureau_td_pivot_df.shape

(723432, 72)

In [115]:
bureau_td_pivot_df.rename({'customer_id': 'mobilenumber'}, axis=1, inplace = True)

In [116]:
bureau_td_pivot_df['REF_MONTH'] = pd.to_datetime(bureau_td_pivot_df['REF_MONTH'], format='%Y-%m').dt.date
bureau_td_pivot_df['mobilenumber'] = bureau_td_pivot_df['mobilenumber'].astype('string')

In [117]:
print(xsell_pl_df.shape)
xsell_pl_df = xsell_pl_df.merge(bureau_td_pivot_df, on = ['REF_MONTH','mobilenumber'], how = 'left')
print(xsell_pl_df.shape)

(1673937, 48)
(1673937, 118)


In [124]:
xsell_pl_df.columns.to_list()

['snapshot_month',
 'customer_id',
 'install_flag',
 'registration_date',
 'PL_target',
 'personal_details_complete_datetime',
 'mobilenumber',
 'REF_MONTH',
 'cc_consent_flag',
 'GENDER',
 'OCCUPATION',
 'Annual_Income',
 'age_in_years',
 'total_enquiries_last_3m',
 'total_enquiries_last_6m',
 'total_enquiries_last_12m',
 'count_13_last_3m',
 'count_13_last_6m',
 'count_13_last_12m',
 'count_ADITYA_last_3m',
 'count_Aditya_Birla_last_3m',
 'count_COB_last_3m',
 'count_FOR_last_3m',
 'count_NBF_last_3m',
 'count_PUB_last_3m',
 'count_PVT_last_3m',
 'count_RRB_last_3m',
 'count_SFB_last_3m',
 'count_ADITYA_last_6m',
 'count_Aditya_Birla_last_6m',
 'count_COB_last_6m',
 'count_FOR_last_6m',
 'count_NBF_last_6m',
 'count_PUB_last_6m',
 'count_PVT_last_6m',
 'count_RRB_last_6m',
 'count_SFB_last_6m',
 'count_ADITYA_last_12m',
 'count_Aditya_Birla_last_12m',
 'count_COB_last_12m',
 'count_FOR_last_12m',
 'count_NBF_last_12m',
 'count_PUB_last_12m',
 'count_PVT_last_12m',
 'count_RRB_last_12

## Features

In [118]:
xsell_pl_df.drop(['total_enquiries_last_3m','total_enquiries_last_6m','total_enquiries_last_12m'], axis =1, inplace= True)

In [143]:
# xsell_pl_df['count_inq_on_us_3m'] = xsell_pl_df['count_A_last_3m'] + xsell_pl_df['count_ADITYA_last_3m'] + xsell_pl_df['count_Aditya_Birla_last_3m']
# xsell_pl_df['count_inq_on_us_6m'] = xsell_pl_df['count_A_last_6m'] + xsell_pl_df['count_ADITYA_last_6m'] + xsell_pl_df['count_Aditya_Birla_last_6m']
# xsell_pl_df['count_inq_on_us_12m'] = xsell_pl_df['count_A_last_12m'] + xsell_pl_df['count_ADITYA_last_12m'] + xsell_pl_df['count_Aditya_Birla_last_12m']

# xsell_pl_df['count_inq_off_us_3m'] = xsell_pl_df['count_FOR_last_3m'] + xsell_pl_df['count_NBF_last_3m'] + xsell_pl_df['count_PUB_last_3m'] + xsell_pl_df['count_PVT_last_3m'] + xsell_pl_df['count_SFB_last_3m']
# xsell_pl_df['count_inq_off_us_6m'] = xsell_pl_df['count_FOR_last_6m'] + xsell_pl_df['count_NBF_last_6m'] + xsell_pl_df['count_PUB_last_6m'] + xsell_pl_df['count_PVT_last_6m'] + xsell_pl_df['count_SFB_last_6m']
# xsell_pl_df['count_inq_off_us_12m'] = xsell_pl_df['count_FOR_last_12m'] + xsell_pl_df['count_NBF_last_12m'] + xsell_pl_df['count_PUB_last_12m'] + xsell_pl_df['count_PVT_last_12m'] + xsell_pl_df['count_SFB_last_12m']

KeyError: 'count_A_last_3m'

In [119]:
xsell_pl_df['count_inq_on_us_3m'] = xsell_pl_df['count_ADITYA_last_3m'] + xsell_pl_df['count_Aditya_Birla_last_3m']
xsell_pl_df['count_inq_on_us_6m'] = xsell_pl_df['count_ADITYA_last_6m'] + xsell_pl_df['count_Aditya_Birla_last_6m']
xsell_pl_df['count_inq_on_us_12m'] = xsell_pl_df['count_ADITYA_last_12m'] + xsell_pl_df['count_Aditya_Birla_last_12m']

xsell_pl_df['count_inq_off_us_3m'] = xsell_pl_df['count_FOR_last_3m'] + xsell_pl_df['count_NBF_last_3m'] + xsell_pl_df['count_PUB_last_3m'] + xsell_pl_df['count_PVT_last_3m'] + xsell_pl_df['count_SFB_last_3m']
xsell_pl_df['count_inq_off_us_6m'] = xsell_pl_df['count_FOR_last_6m'] + xsell_pl_df['count_NBF_last_6m'] + xsell_pl_df['count_PUB_last_6m'] + xsell_pl_df['count_PVT_last_6m'] + xsell_pl_df['count_SFB_last_6m']
xsell_pl_df['count_inq_off_us_12m'] = xsell_pl_df['count_FOR_last_12m'] + xsell_pl_df['count_NBF_last_12m'] + xsell_pl_df['count_PUB_last_12m'] + xsell_pl_df['count_PVT_last_12m'] + xsell_pl_df['count_SFB_last_12m']

In [120]:
xsell_pl_df.drop(['count_ADITYA_last_3m','count_ADITYA_last_6m','count_ADITYA_last_12m',
                  'count_Aditya_Birla_last_3m','count_Aditya_Birla_last_6m','count_Aditya_Birla_last_12m',
                  'count_FOR_last_3m','count_FOR_last_6m','count_FOR_last_12m',
                 'count_NBF_last_3m','count_NBF_last_6m','count_NBF_last_12m',
                 'count_PUB_last_3m','count_PUB_last_6m','count_PUB_last_12m',
                 'count_PVT_last_3m','count_PVT_last_6m','count_PVT_last_12m',
                 'count_SFB_last_3m','count_SFB_last_6m','count_SFB_last_12m'
                 ], axis = 1, inplace = True)

In [121]:
xsell_pl_df['registration_date'] = xsell_pl_df['registration_date'].dt.date

In [122]:
xsell_pl_df.groupby('registration_date').count().to_csv('march_df.csv')

In [123]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m
0,Mar-2025,3147505,1,2024-11-19,0,NaT,7795408678,2025-02-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Mar-2025,3094942,1,2024-11-18,0,NaT,7763821758,2025-02-28,0.0,1.0,,,40.0,,,,,,,,,,,,24714.0,,,,,,,25.0,,,,,,,,,,,,,,24714.0,,,,,,,1.0,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,1.0,,,,,,,1.0,,,,,,,,,,,,
2,Mar-2025,3051618,1,2024-11-16,0,NaT,9348064956,2025-02-28,,2.0,,,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Mar-2025,3046097,1,2024-11-16,0,NaT,9799906851,2025-02-28,,2.0,,,48.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Mar-2025,3109445,1,2024-11-18,0,NaT,9125945369,2025-02-28,,2.0,,,32.0,,,,,,,,,,,,13500.0,,,9000.0,,1240.0,,423.0,,,363.0,,241.0,,386.0,,,363.0,,241.0,,12499.0,,,1215.0,,1000.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,,,,,,


In [124]:
from datetime import datetime

xsell_pl_df['registration_date'] = pd.to_datetime(xsell_pl_df['registration_date'], errors='coerce')

In [125]:
xsell_pl_df.shape

(1673937, 100)

In [62]:
xsell_pl_df

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m
0,Apr-2025,4072175,1,2024-12-26,0,NaT,9398641378,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Apr-2025,4083452,1,2024-12-27,0,NaT,9927476735,2025-03-31,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Apr-2025,3830259,1,2024-12-12,0,NaT,9010019028,2025-03-31,,1.0,4.0,,48.0,,,,,,,,,,,,,,72000.0,200000.0,,,,,,93.0,550.0,,,,,,90.0,654.0,,,,,,0.0,100000.0,,,,,,1.0,1.0,,,,,,1.0,0.0,,,,,,1.0,0.0,,,,,,5.0,1.0,,,,,,0.0,1.0,,,,,,6.0,2.0,,,,,,,,,
3,Apr-2025,4112295,1,2024-12-29,0,NaT,8779449781,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Apr-2025,4092267,1,2024-12-28,0,NaT,9166512107,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784154,Apr-2025,1327145,1,2024-08-25,0,NaT,7386228768,2025-03-31,,2.0,,,30.0,,,,,,,,,,,,14499.0,,,25000.0,,,,302.0,,,112.0,,,,269.0,,,,,,,14499.0,,,25000.0,,,,0.0,,,1.0,,,,0.0,,,0.0,,,,0.0,,,0.0,,,,1.0,,,0.0,,,,0.0,,,1.0,,,,1.0,,,1.0,,,,,,,,,
1784155,Apr-2025,1246937,1,2024-08-23,0,NaT,9263894861,2025-03-31,,2.0,,,29.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,50000,50000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,6.0,6.0,0.0,0.0,0.0
1784156,Apr-2025,4558674,1,2025-02-26,0,NaT,6299235312,2025-03-31,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1784157,Apr-2025,4532523,1,2025-02-25,0,NaT,9654189260,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [63]:
# start_date = '2025-03-28'
# end_date = '2025-03-29'

# scoring_df = xsell_pl_df[~(xsell_pl_df['registration_date'] >= start_date) & (xsell_pl_df['registration_date'] <= end_date)]

In [127]:
scoring_df.shape

(1673937, 93)

In [65]:
xsell_pl_df.shape

(1784159, 100)

In [126]:
scoring_df = xsell_pl_df.drop(['snapshot_month',
                               'customer_id',
                               'install_flag',
                               'registration_date',
                               'PL_target',
                               'personal_details_complete_datetime',
                               'mobilenumber'], axis = 1)

In [128]:
cols_on_us = ['count_inq_on_us_3m', 'count_inq_on_us_6m', 'count_inq_on_us_12m']
scoring_df.loc[(scoring_df['total_active_loans_personal_loan_on_us'].notna()) & 
       (scoring_df['total_active_loans_personal_loan_on_us'] > 0), cols_on_us] = \
    scoring_df.loc[(scoring_df['total_active_loans_personal_loan_on_us'].notna()) & 
           (scoring_df['total_active_loans_personal_loan_on_us'] > 0), cols_on_us].fillna(0)

# Fill NA with 0 where total_active_loans_personal_loan_off_us is NOT NULL and > 0
cols_off_us = ['count_inq_off_us_3m', 'count_inq_off_us_6m', 'count_inq_off_us_12m']
scoring_df.loc[(scoring_df['total_active_loans_personal_loan_off_us'].notna()) & 
       (scoring_df['total_active_loans_personal_loan_off_us'] > 0), cols_off_us] = \
    scoring_df.loc[(scoring_df['total_active_loans_personal_loan_off_us'].notna()) & 
           (scoring_df['total_active_loans_personal_loan_off_us'] > 0), cols_off_us].fillna(0)

In [134]:
xsell_pl_df.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m
0,Mar-2025,3147505,1,2024-11-19,0,NaT,7795408678,2025-02-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Mar-2025,3094942,1,2024-11-18,0,NaT,7763821758,2025-02-28,0.0,1.0,,,40.0,,,,,,,,,,,,24714.0,,,,,,,25.0,,,,,,,,,,,,,,24714.0,,,,,,,1.0,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,1.0,,,,,,,1.0,,,,,,,,,,,,
2,Mar-2025,3051618,1,2024-11-16,0,NaT,9348064956,2025-02-28,,2.0,,,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Mar-2025,3046097,1,2024-11-16,0,NaT,9799906851,2025-02-28,,2.0,,,48.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Mar-2025,3109445,1,2024-11-18,0,NaT,9125945369,2025-02-28,,2.0,,,32.0,,,,,,,,,,,,13500.0,,,9000.0,,1240.0,,423.0,,,363.0,,241.0,,386.0,,,363.0,,241.0,,12499.0,,,1215.0,,1000.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,,,,,,


In [144]:
# scoring_df = xsell_pl_df[['snapshot_month',
#                                'customer_id',
#                                'install_flag',
#                                'registration_date',
#                                'PL_target',
#                                'personal_details_complete_datetime',
#                                'mobilenumber',
#                                'REF_MONTH',
#               'min_days_since_loan_closure_personal_loan_off_us',
# 'min_days_since_last_payment_personal_loan_off_us',
# 'total_active_loans_short_term_personal_loan_on_us',
# 'min_days_since_loan_closure_consumer_loan_off_us',
# 'total_active_loans_last_3m_personal_loan_off_us',
# 'total_closed_loans_personal_loan_off_us',
# 'total_loans_consumer_loan_off_us',
# 'total_loans_personal_loan_off_us',
# 'AVG_AMT_INQ',
# 'cc_consent_flag',
# 'GENDER',
# 'age_in_years',
# 'OCCUPATION',
# 'Annual_Income'
#              ]]

In [146]:
scoring_df.shape

(1673937, 22)

In [145]:
# scoring_df.to_parquet('revamped_pl_march_data.parquet')

In [129]:
X = scoring_df[[
'min_days_since_loan_closure_personal_loan_off_us',
'min_days_since_last_payment_personal_loan_off_us',
'total_active_loans_short_term_personal_loan_on_us',
'min_days_since_loan_closure_consumer_loan_off_us',
'total_active_loans_last_3m_personal_loan_off_us',
'total_closed_loans_personal_loan_off_us',
'total_loans_consumer_loan_off_us',
'total_loans_personal_loan_off_us',
'AVG_AMT_INQ',
'cc_consent_flag',
'GENDER',
'age_in_years',
'OCCUPATION',
'Annual_Income']]

In [133]:
scoring_df.head()

Unnamed: 0,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m
0,2025-02-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2025-02-28,0.0,1.0,,,40.0,,,,,,,,,,,,24714.0,,,,,,,25.0,,,,,,,,,,,,,,24714.0,,,,,,,1.0,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,1.0,,,,,,,1.0,,,,,,,,,,,,
2,2025-02-28,,2.0,,,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2025-02-28,,2.0,,,48.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2025-02-28,,2.0,,,32.0,,,,,,,,,,,,13500.0,,,9000.0,,1240.0,,423.0,,,363.0,,241.0,,386.0,,,363.0,,241.0,,12499.0,,,1215.0,,1000.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,0.0,,,0.0,,0.0,,2.0,,,4.0,,2.0,,,,,,,


In [None]:
scoring_df[[
'min_days_since_loan_closure_personal_loan_off_us',
'min_days_since_last_payment_personal_loan_off_us',
'total_active_loans_short_term_personal_loan_on_us',
'min_days_since_loan_closure_consumer_loan_off_us',
'total_active_loans_last_3m_personal_loan_off_us',
'total_closed_loans_personal_loan_off_us',
'total_loans_consumer_loan_off_us',
'total_loans_personal_loan_off_us',
'AVG_AMT_INQ',
'cc_consent_flag',
'GENDER',
'age_in_years',
'OCCUPATION',
'Annual_Income']]

In [130]:
X.notna().mean()*100

min_days_since_loan_closure_personal_loan_off_us     24.926326
min_days_since_last_payment_personal_loan_off_us     28.464632
total_active_loans_short_term_personal_loan_on_us     5.363284
min_days_since_loan_closure_consumer_loan_off_us     20.597609
total_active_loans_last_3m_personal_loan_off_us      30.987546
total_closed_loans_personal_loan_off_us              30.987546
total_loans_consumer_loan_off_us                     28.100819
total_loans_personal_loan_off_us                     30.987546
AVG_AMT_INQ                                          10.888104
cc_consent_flag                                      16.965394
GENDER                                               48.684090
age_in_years                                         50.218318
OCCUPATION                                            1.081164
Annual_Income                                         0.510772
dtype: float64

In [70]:
with open('xsell_pl_xgboost_model_with_demo_features.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

In [71]:
y_predict_score =xgb_model.predict_proba(X)
y_predict_score=y_predict_score[:,1]

In [72]:
score_comb = pd.concat([xsell_pl_df, pd.Series(y_predict_score, index=xsell_pl_df.index, name="pred_prob1")], axis=1)

In [73]:
score_comb.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m,pred_prob1
0,Apr-2025,4072175,1,2024-12-26,0,NaT,9398641378,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.258372
1,Apr-2025,4083452,1,2024-12-27,0,NaT,9927476735,2025-03-31,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.314552
2,Apr-2025,3830259,1,2024-12-12,0,NaT,9010019028,2025-03-31,,1.0,4.0,,48.0,,,,,,,,,,,,,,72000.0,200000.0,,,,,,93.0,550.0,,,,,,90.0,654.0,,,,,,0.0,100000.0,,,,,,1.0,1.0,,,,,,1.0,0.0,,,,,,1.0,0.0,,,,,,5.0,1.0,,,,,,0.0,1.0,,,,,,6.0,2.0,,,,,,,,,,0.202787
3,Apr-2025,4112295,1,2024-12-29,0,NaT,8779449781,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.258372
4,Apr-2025,4092267,1,2024-12-28,0,NaT,9166512107,2025-03-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.258372


In [74]:
score_comb = score_comb.sort_values('pred_prob1', ascending = False)

In [76]:
# Rank the predictions by probability
score_comb["rank"] = score_comb["pred_prob1"].rank(method="first")
score_comb["Decile"] = pd.qcut(score_comb["rank"], 10, labels=False) + 1

In [77]:
gain = score_comb.groupby('Decile').agg(
    min_prob=('pred_prob1', 'min'),
    max_prob=('pred_prob1', 'max')).reset_index()

In [78]:
gain

Unnamed: 0,Decile,min_prob,max_prob
0,1,8e-06,0.126535
1,2,0.126535,0.233158
2,3,0.233158,0.258372
3,4,0.258372,0.258372
4,5,0.258372,0.258372
5,6,0.258372,0.314552
6,7,0.314552,0.336795
7,8,0.336795,0.42231
8,9,0.422311,0.552907
9,10,0.552907,0.993112


In [79]:
score_comb_v1 = score_comb[['snapshot_month','customer_id',
            'install_flag',
            'registration_date',
            'PL_target',
            'personal_details_complete_datetime',
            'mobilenumber','REF_MONTH',
            'min_days_since_loan_closure_personal_loan_off_us',
            'min_days_since_last_payment_personal_loan_off_us',
            'total_active_loans_short_term_personal_loan_on_us',
            'min_days_since_loan_closure_consumer_loan_off_us',
            'total_active_loans_last_3m_personal_loan_off_us',
            'total_closed_loans_personal_loan_off_us',
            'total_loans_consumer_loan_off_us',
            'total_loans_personal_loan_off_us',
            'AVG_AMT_INQ',
            'cc_consent_flag',
            'GENDER',
            'age_in_years',
            'OCCUPATION',
            'Annual_Income',
           'pred_prob1','rank','Decile']]

score_comb_v1["mobilenumber"] = score_comb_v1["mobilenumber"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_comb_v1["mobilenumber"] = score_comb_v1["mobilenumber"].astype(str)


In [80]:
score_comb_v1.shape

(1784159, 25)

In [81]:
table_name = 't_dg_xsell_model_pl_scoring'
pandas_gbq.to_gbq(
    dataframe=score_comb_v1,
    destination_table=f'abcd_data_science_app.{table_name}',
    project_id='abcd-dataplatform',
    if_exists='append'
)

100%|██████████| 1/1 [00:00<00:00, 8473.34it/s]


# Filtering out customers with Score >= 700 & Salaried

In [46]:
QUERY = f"""

select * FROM `abcd-dataplatform.abcd_data_science_app.t_dg_xsell_model_pl_scoring`

"""

score_comb_v1 = client.query(QUERY).to_dataframe()
score_comb_v1.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1784159 entries, 0 to 1784158
Columns: 103 entries, snapshot_month to Decile
dtypes: Int64(5), datetime64[us](2), dbdate(1), float64(92), object(3)
memory usage: 1.7 GB


In [47]:
score_comb_v1.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,cc_consent_flag,GENDER,OCCUPATION,Annual_Income,age_in_years,count_13_last_3m,count_13_last_6m,count_13_last_12m,count_COB_last_3m,count_RRB_last_3m,count_COB_last_6m,count_RRB_last_6m,count_COB_last_12m,count_RRB_last_12m,MAX_AMT_INQ,AVG_AMT_INQ,max_loan_amt_consumer_loan_off_us,max_loan_amt_credit_card_off_us,max_loan_amt_gold_loan_off_us,max_loan_amt_personal_loan_off_us,max_loan_amt_personal_loan_on_us,max_loan_amt_short_term_personal_loan_off_us,max_loan_amt_short_term_personal_loan_on_us,min_days_since_last_payment_consumer_loan_off_us,min_days_since_last_payment_credit_card_off_us,min_days_since_last_payment_gold_loan_off_us,min_days_since_last_payment_personal_loan_off_us,min_days_since_last_payment_personal_loan_on_us,min_days_since_last_payment_short_term_personal_loan_off_us,min_days_since_last_payment_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,min_days_since_loan_closure_credit_card_off_us,min_days_since_loan_closure_gold_loan_off_us,min_days_since_loan_closure_personal_loan_off_us,min_days_since_loan_closure_personal_loan_on_us,min_days_since_loan_closure_short_term_personal_loan_off_us,min_days_since_loan_closure_short_term_personal_loan_on_us,min_loan_amt_consumer_loan_off_us,min_loan_amt_credit_card_off_us,min_loan_amt_gold_loan_off_us,min_loan_amt_personal_loan_off_us,min_loan_amt_personal_loan_on_us,min_loan_amt_short_term_personal_loan_off_us,min_loan_amt_short_term_personal_loan_on_us,total_active_loans_consumer_loan_off_us,total_active_loans_credit_card_off_us,total_active_loans_gold_loan_off_us,total_active_loans_personal_loan_off_us,total_active_loans_personal_loan_on_us,total_active_loans_short_term_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,total_active_loans_last_3m_consumer_loan_off_us,total_active_loans_last_3m_credit_card_off_us,total_active_loans_last_3m_gold_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_active_loans_last_3m_personal_loan_on_us,total_active_loans_last_3m_short_term_personal_loan_off_us,total_active_loans_last_3m_short_term_personal_loan_on_us,total_active_loans_last_6m_consumer_loan_off_us,total_active_loans_last_6m_credit_card_off_us,total_active_loans_last_6m_gold_loan_off_us,total_active_loans_last_6m_personal_loan_off_us,total_active_loans_last_6m_personal_loan_on_us,total_active_loans_last_6m_short_term_personal_loan_off_us,total_active_loans_last_6m_short_term_personal_loan_on_us,total_closed_loans_consumer_loan_off_us,total_closed_loans_credit_card_off_us,total_closed_loans_gold_loan_off_us,total_closed_loans_personal_loan_off_us,total_closed_loans_personal_loan_on_us,total_closed_loans_short_term_personal_loan_off_us,total_closed_loans_short_term_personal_loan_on_us,total_emi_loans_consumer_loan_off_us,total_emi_loans_credit_card_off_us,total_emi_loans_gold_loan_off_us,total_emi_loans_personal_loan_off_us,total_emi_loans_personal_loan_on_us,total_emi_loans_short_term_personal_loan_off_us,total_emi_loans_short_term_personal_loan_on_us,total_loans_consumer_loan_off_us,total_loans_credit_card_off_us,total_loans_gold_loan_off_us,total_loans_personal_loan_off_us,total_loans_personal_loan_on_us,total_loans_short_term_personal_loan_off_us,total_loans_short_term_personal_loan_on_us,count_inq_on_us_3m,count_inq_on_us_6m,count_inq_on_us_12m,count_inq_off_us_3m,count_inq_off_us_6m,count_inq_off_us_12m,pred_prob1,rank,Decile
0,Apr-2025,3639051,1,2024-12-04,0,NaT,9814324221,2025-03-31,0.0,2.0,1.0,15500.0,31.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,50000,50000.0,,,0.0,,,,,,,275.0,,,,,,,275.0,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,1.0,,,,,,,0.0,,,,,,,1.0,,,,,0.0,1.0,1.0,0.0,0.0,0.0,0.983334,1784140.0,10
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,0.0,1.0,1.0,,32.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,100000,75000.0,8000.0,145000.0,,539999.0,,22500.0,,,31.0,,25.0,,420.0,,,497.0,,26.0,,118.0,,2000.0,0.0,,500.0,,2000.0,,2.0,5.0,,17.0,,1.0,,0.0,1.0,,12.0,,1.0,,0.0,1.0,,13.0,,1.0,,0.0,1.0,,21.0,,1.0,,0.0,0.0,,7.0,,0.0,,2.0,6.0,,38.0,,2.0,,0.0,1.0,1.0,0.0,0.0,1.0,0.97941,1784127.0,10
2,Apr-2025,2055230,1,2024-09-13,0,NaT,9468702165,2025-03-31,0.0,2.0,1.0,30000.0,29.0,0.0,54.0,108.0,0.0,0.0,0.0,0.0,0.0,0.0,50000,35333.333333,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.0,108.0,0.0,0.0,0.0,0.976947,1784110.0,10
3,Apr-2025,2393668,1,2024-10-08,1,2025-04-02 13:38:38.932,9818491107,2025-03-31,0.0,1.0,,,41.0,0.0,64.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,600000,325000.0,,220000.0,,,300000.0,,,,52.0,,,,,,,,,,,,,,20000.0,,,300000.0,,,,5.0,,,1.0,,,,0.0,,,0.0,,,,0.0,,,0.0,,,,0.0,,,0.0,,,,0.0,,,1.0,,,,5.0,,,1.0,,,0.0,64.0,64.0,0.0,0.0,0.0,0.976107,1784103.0,10
4,Apr-2025,4981131,1,2025-03-20,0,NaT,9447968349,2025-03-31,0.0,2.0,4.0,100000.0,49.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,50000,50000.0,,,486210.0,,,,,,,37.0,,,,,,,37.0,,,,,,,16743.0,,,,,,,4.0,,,,,,,0.0,,,,,,,4.0,,,,,,,63.0,,,,,,,0.0,,,,,,,67.0,,,,,0.0,1.0,1.0,0.0,0.0,0.0,0.972492,1784057.0,10


In [48]:
start_date = '2025-03-28'
end_date = '2025-03-29'

score_comb_v1 = score_comb_v1[~(score_comb_v1['registration_date'] >= start_date) & (score_comb_v1['registration_date'] <= end_date)]

In [49]:
score_comb_v1.shape

(1744790, 103)

In [50]:
score_comb_v1 = score_comb_v1[['snapshot_month','customer_id',
            'install_flag',
            'registration_date',
            'PL_target',
            'personal_details_complete_datetime',
            'mobilenumber','REF_MONTH',
            'min_days_since_loan_closure_personal_loan_off_us',
            'min_days_since_last_payment_personal_loan_off_us',
            'total_active_loans_short_term_personal_loan_on_us',
            'min_days_since_loan_closure_consumer_loan_off_us',
            'total_active_loans_last_3m_personal_loan_off_us',
            'total_closed_loans_personal_loan_off_us',
            'total_loans_consumer_loan_off_us',
            'total_loans_personal_loan_off_us',
            'AVG_AMT_INQ',
            'cc_consent_flag',
            'GENDER',
            'age_in_years',
            'OCCUPATION',
            'Annual_Income',
           'pred_prob1','rank','Decile']]

score_comb_v1["mobilenumber"] = score_comb_v1["mobilenumber"].astype(str)

In [51]:
QUERY = f"""

select * FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_SCORE_DAILY_BASE`

"""

exp_score = client.query(QUERY).to_dataframe()
exp_score.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3076991 entries, 0 to 3076990
Data columns (total 4 columns):
 #   Column         Dtype              
---  ------         -----              
 0   CUST_ID        object             
 1   Score_V3       object             
 2   SCRUB_DATE     dbdate             
 3   INGESTION_TMS  datetime64[us, UTC]
dtypes: datetime64[us, UTC](1), dbdate(1), object(2)
memory usage: 419.6 MB


In [52]:
QUERY = f"""

select 

a.CUST_ID,
a.Score_V3,
a.SCRUB_DATE
FROM `abffsl-dataplatform-uat.abfssl_central_analytics.EXPERIAN_RPT_SCORE_ADHOC_BASE` a
where upper(remarks) like '%APP%'
QUALIFY RANK() OVER (PARTITION BY CUST_ID ORDER BY SCRUB_DATE DESC) = 1;

"""

exp_score_adhoc = client.query(QUERY).to_dataframe()
exp_score_adhoc.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3058319 entries, 0 to 3058318
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   CUST_ID     object
 1   Score_V3    object
 2   SCRUB_DATE  dbdate
dtypes: dbdate(1), object(2)
memory usage: 393.7 MB


In [53]:
print(exp_score.shape)
print(exp_score_adhoc.shape)

(3076991, 4)
(3058319, 3)


In [54]:
exp_score.rename({'CUST_ID':'mobilenumber'}, axis=1,inplace = True)
exp_score_adhoc.rename({'CUST_ID':'mobilenumber'}, axis=1,inplace = True)

In [55]:
exp_score_adhoc['mobilenumber'] = exp_score_adhoc['mobilenumber'].astype('string')
exp_score['mobilenumber'] = exp_score['mobilenumber'].astype('string')
score_comb_v1['mobilenumber'] = score_comb_v1['mobilenumber'].astype('string')

In [56]:
score_comb_v1.shape

(1744790, 25)

In [57]:
score_comb_v1 = score_comb_v1.merge(exp_score, on ='mobilenumber', how = 'left')

In [58]:
score_comb_v1.shape

(1744790, 28)

In [59]:
exp_score_adhoc.rename({'Score_V3':'Score_V3_AD', 'SCRUB_DATE':'SCRUB_DATE_AD'}, axis=1,inplace=True)

In [60]:
score_comb_v1 = score_comb_v1.merge(exp_score_adhoc, on ='mobilenumber', how = 'left')

In [61]:
score_comb_v1.shape
score_comb_v1 = score_comb_v1.drop_duplicates()

In [62]:
score_comb_v1.shape

(1744790, 30)

In [63]:
QUERY = f"""

select distinct a.customer_id,1 as occupation
from `abcd-dataplatform.abcd_data_model.Salary_Tags` a
where a.concat != '0000'

"""

sms_salary_df = client.query(QUERY).to_dataframe()
sms_salary_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850085 entries, 0 to 850084
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   customer_id  850085 non-null  Int64
 1   occupation   850085 non-null  Int64
dtypes: Int64(2)
memory usage: 14.6 MB


In [64]:
sms_salary_df.head()

Unnamed: 0,customer_id,occupation
0,35591,1
1,54381,1
2,4916855,1
3,1490077,1
4,1360183,1


In [65]:
sms_salary_df.rename({"occupation":"occupation_sms"}, axis=1,inplace=True)

In [66]:
score_comb_v1['customer_id'] = score_comb_v1['customer_id'].astype('string')
sms_salary_df['customer_id'] = sms_salary_df['customer_id'].astype('string')

In [67]:
score_comb_v1 = score_comb_v1.merge(sms_salary_df, on ='customer_id', how = 'left')

In [68]:
score_comb_v1.shape

(1744790, 31)

In [69]:
score_comb_v1.rename({'occupation_sms_x':'occupation_sms'},axis=1,inplace=True)

In [70]:
score_comb_v1[score_comb_v1['occupation_sms'].notnull()].head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,pred_prob1,rank,Decile,Score_V3,SCRUB_DATE,INGESTION_TMS,Score_V3_AD,SCRUB_DATE_AD,occupation_sms
0,Apr-2025,3639051,1,2024-12-04,0,NaT,9814324221,2025-03-31,,,,,,,,,50000.0,0.0,2.0,31.0,1.0,15500.0,0.983334,1784140.0,10,482,2025-03-07,2025-03-10 09:23:54.692860+00:00,482.0,2025-01-27,1
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.0,0.0,1.0,32.0,1.0,,0.97941,1784127.0,10,740,2025-03-28,2025-03-29 16:08:00.332966+00:00,,NaT,1
5,Apr-2025,1965655,1,2024-09-11,0,NaT,9035089796,2025-03-31,,,,,,,,,40000.0,0.0,1.0,43.0,1.0,36000.0,0.971499,1784040.0,10,469,2025-03-15,2025-03-18 10:10:31.829390+00:00,477.0,2025-01-27,1
6,Apr-2025,5126359,1,2025-03-26,0,NaT,9052122529,2025-03-31,,,,,,,,,310000.0,0.0,1.0,29.0,,,0.971275,1784031.0,10,743,2025-03-27,2025-04-01 06:38:21.850220+00:00,,NaT,1
7,Apr-2025,4723045,1,2025-03-07,0,NaT,9840094621,2025-03-31,49.0,46.0,,,7.0,22.0,,43.0,50000.0,0.0,2.0,46.0,1.0,122000.0,0.970935,1784026.0,10,739,2025-03-08,2025-03-18 07:24:32.309849+00:00,,NaT,1


In [71]:
score_comb_v1['Final_Occupation'] = (
    (score_comb_v1['occupation_sms'].fillna(0) == 1) | 
    (score_comb_v1['OCCUPATION'].fillna(0) == 1.0)
).astype(int)

In [72]:
score_comb_v1['Final_Score'] = score_comb_v1['Score_V3'].fillna(score_comb_v1['Score_V3_AD'])

In [73]:
score_comb_v1.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,pred_prob1,rank,Decile,Score_V3,SCRUB_DATE,INGESTION_TMS,Score_V3_AD,SCRUB_DATE_AD,occupation_sms,Final_Occupation,Final_Score
0,Apr-2025,3639051,1,2024-12-04,0,NaT,9814324221,2025-03-31,,,,,,,,,50000.0,0.0,2.0,31.0,1.0,15500.0,0.983334,1784140.0,10,482,2025-03-07,2025-03-10 09:23:54.692860+00:00,482.0,2025-01-27,1.0,1,482
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.0,0.0,1.0,32.0,1.0,,0.97941,1784127.0,10,740,2025-03-28,2025-03-29 16:08:00.332966+00:00,,NaT,1.0,1,740
2,Apr-2025,2055230,1,2024-09-13,0,NaT,9468702165,2025-03-31,,,,,,,,,35333.333333,0.0,2.0,29.0,1.0,30000.0,0.976947,1784110.0,10,497,2025-03-17,2025-03-26 06:25:25.419488+00:00,497.0,2025-03-17,,1,497
3,Apr-2025,2393668,1,2024-10-08,1,2025-04-02 13:38:38.932,9818491107,2025-03-31,,,,,,,,,325000.0,0.0,1.0,41.0,,,0.976107,1784103.0,10,776,2025-02-24,2025-03-03 11:32:35.367252+00:00,776.0,2025-01-27,,0,776
4,Apr-2025,4981131,1,2025-03-20,0,NaT,9447968349,2025-03-31,,,,,,,,,50000.0,0.0,2.0,49.0,4.0,100000.0,0.972492,1784057.0,10,513,2025-03-21,2025-03-31 05:53:14.530531+00:00,,NaT,,0,513


In [74]:
QUERY = f"""

with registration_inJFM as
(Select mobilenumber from abcd-dataplatform-prod.abcd_mobileapp_transformed.ABCDPRODDB_t_customer
where format_timestamp('%Y-%m',createddate) in('2025-01','2025-03','2025-02'))
 
,login_inJFM as
(
  Select mdn from
  (
  Select mdn,date(max(login_time)) llogindate from `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer_login_details`
  group by 1
)
  where format_timestamp('%Y-%m',llogindate) in('2025-01','2025-03','2025-02'))
 
  ,base as
  (
    Select * from registration_inJFM
    union distinct
    Select * from login_inJFM
  )
 
 select * FROM base
"""

jfm_df = client.query(QUERY).to_dataframe()
jfm_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1589785 entries, 0 to 1589784
Data columns (total 1 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   mobilenumber  1589785 non-null  object
dtypes: object(1)
memory usage: 101.6 MB


In [75]:
score_comb_v1.shape

(1744790, 33)

In [76]:
jfm_df.head()

Unnamed: 0,mobilenumber
0,8935906363
1,6399638264
2,8887762401
3,9354635183
4,9633894813


In [77]:
score_comb_v1.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,pred_prob1,rank,Decile,Score_V3,SCRUB_DATE,INGESTION_TMS,Score_V3_AD,SCRUB_DATE_AD,occupation_sms,Final_Occupation,Final_Score
0,Apr-2025,3639051,1,2024-12-04,0,NaT,9814324221,2025-03-31,,,,,,,,,50000.0,0.0,2.0,31.0,1.0,15500.0,0.983334,1784140.0,10,482,2025-03-07,2025-03-10 09:23:54.692860+00:00,482.0,2025-01-27,1.0,1,482
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.0,0.0,1.0,32.0,1.0,,0.97941,1784127.0,10,740,2025-03-28,2025-03-29 16:08:00.332966+00:00,,NaT,1.0,1,740
2,Apr-2025,2055230,1,2024-09-13,0,NaT,9468702165,2025-03-31,,,,,,,,,35333.333333,0.0,2.0,29.0,1.0,30000.0,0.976947,1784110.0,10,497,2025-03-17,2025-03-26 06:25:25.419488+00:00,497.0,2025-03-17,,1,497
3,Apr-2025,2393668,1,2024-10-08,1,2025-04-02 13:38:38.932,9818491107,2025-03-31,,,,,,,,,325000.0,0.0,1.0,41.0,,,0.976107,1784103.0,10,776,2025-02-24,2025-03-03 11:32:35.367252+00:00,776.0,2025-01-27,,0,776
4,Apr-2025,4981131,1,2025-03-20,0,NaT,9447968349,2025-03-31,,,,,,,,,50000.0,0.0,2.0,49.0,4.0,100000.0,0.972492,1784057.0,10,513,2025-03-21,2025-03-31 05:53:14.530531+00:00,,NaT,,0,513


In [78]:
score_comb_v1['mobilenumber'] = score_comb_v1['mobilenumber'].astype('string')
jfm_df['mobilenumber'] = jfm_df['mobilenumber'].astype('string')

In [79]:
print(score_comb_v1.shape)
top_decile_cust = score_comb_v1[score_comb_v1['mobilenumber'].isin(jfm_df['mobilenumber'])]
print(top_decile_cust.shape)

(1744790, 33)
(474453, 33)


In [80]:
top_decile_cust.groupby('Decile')['customer_id'].count()

Decile
1      45330
2      38992
3      24383
4      16660
5      16621
6      37630
7     100549
8      45367
9      72030
10     76891
Name: customer_id, dtype: int64

In [81]:
top_decile_cust = top_decile_cust[(score_comb_v1['Decile'].isin([10,9,8])) & (top_decile_cust['Final_Score'].fillna(0).astype(int) >= 700) & (top_decile_cust['Final_Occupation'] == 1)]

  top_decile_cust = top_decile_cust[(score_comb_v1['Decile'].isin([10,9,8])) & (top_decile_cust['Final_Score'].fillna(0).astype(int) >= 700) & (top_decile_cust['Final_Occupation'] == 1)]
  top_decile_cust = top_decile_cust[(score_comb_v1['Decile'].isin([10,9,8])) & (top_decile_cust['Final_Score'].fillna(0).astype(int) >= 700) & (top_decile_cust['Final_Occupation'] == 1)]


In [82]:
top_decile_cust.shape

(54730, 33)

In [83]:
top_decile_cust.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,pred_prob1,rank,Decile,Score_V3,SCRUB_DATE,INGESTION_TMS,Score_V3_AD,SCRUB_DATE_AD,occupation_sms,Final_Occupation,Final_Score
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.0,0.0,1.0,32.0,1.0,,0.97941,1784127.0,10,740,2025-03-28,2025-03-29 16:08:00.332966+00:00,,NaT,1,1,740
6,Apr-2025,5126359,1,2025-03-26,0,NaT,9052122529,2025-03-31,,,,,,,,,310000.0,0.0,1.0,29.0,,,0.971275,1784031.0,10,743,2025-03-27,2025-04-01 06:38:21.850220+00:00,,NaT,1,1,743
7,Apr-2025,4723045,1,2025-03-07,0,NaT,9840094621,2025-03-31,49.0,46.0,,,7.0,22.0,,43.0,50000.0,0.0,2.0,46.0,1.0,122000.0,0.970935,1784026.0,10,739,2025-03-08,2025-03-18 07:24:32.309849+00:00,,NaT,1,1,739
10,Apr-2025,3082127,1,2024-11-17,0,NaT,9773077230,2025-03-31,,,,,,,,,283333.333333,0.0,1.0,52.0,,,0.969951,1784004.0,10,764,2025-02-18,2025-03-03 08:35:13.822667+00:00,764.0,2025-01-27,1,1,764
25,Apr-2025,2715908,1,2024-10-30,0,NaT,9822393546,2025-03-31,,,,,,,,,600000.0,0.0,2.0,36.0,4.0,,0.967119,1783857.0,10,775,2025-03-18,2025-03-19 16:05:25.189763+00:00,775.0,2025-03-17,1,1,775


In [84]:
top_decile_cust.groupby('Decile')['customer_id'].count()

Decile
8     12172
9     19465
10    23093
Name: customer_id, dtype: int64

In [85]:
top_decile_cust[['snapshot_month','customer_id',
            'install_flag',
            'registration_date',
            'PL_target',
            'personal_details_complete_datetime',
            'mobilenumber','REF_MONTH',
            'min_days_since_loan_closure_personal_loan_off_us',
            'min_days_since_last_payment_personal_loan_off_us',
            'total_active_loans_short_term_personal_loan_on_us',
            'min_days_since_loan_closure_consumer_loan_off_us',
            'total_active_loans_last_3m_personal_loan_off_us',
            'total_closed_loans_personal_loan_off_us',
            'total_loans_consumer_loan_off_us',
            'total_loans_personal_loan_off_us',
            'AVG_AMT_INQ',
            'cc_consent_flag',
            'GENDER',
            'age_in_years',
            'OCCUPATION',
            'Annual_Income',
            'Final_Score',        
           'pred_prob1',
            'Decile']]

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,Final_Score,pred_prob1,Decile
1,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.000000,0.0,1.0,32.0,1.0,,740,0.979410,10
6,Apr-2025,5126359,1,2025-03-26,0,NaT,9052122529,2025-03-31,,,,,,,,,310000.000000,0.0,1.0,29.0,,,743,0.971275,10
7,Apr-2025,4723045,1,2025-03-07,0,NaT,9840094621,2025-03-31,49.0,46.0,,,7.0,22.0,,43.0,50000.000000,0.0,2.0,46.0,1.0,122000,739,0.970935,10
10,Apr-2025,3082127,1,2024-11-17,0,NaT,9773077230,2025-03-31,,,,,,,,,283333.333333,0.0,1.0,52.0,,,764,0.969951,10
25,Apr-2025,2715908,1,2024-10-30,0,NaT,9822393546,2025-03-31,,,,,,,,,600000.000000,0.0,2.0,36.0,4.0,,775,0.967119,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556908,Apr-2025,4174259,1,2025-01-03,0,NaT,9975991673,2025-03-31,,,,,,,,,,,2.0,32.0,,,842,0.336795,8
558424,Apr-2025,783015,1,2024-07-27,0,NaT,7756005611,2025-03-31,,,,,,,,,,,2.0,32.0,,,776,0.336795,8
558438,Apr-2025,1164827,1,2024-08-20,0,NaT,8303116768,2025-03-31,,,,,,,,,,,2.0,32.0,,,762,0.336795,8
558468,Apr-2025,3467359,1,2024-11-29,0,NaT,8319250939,2025-03-31,,,,,,,,,,,2.0,32.0,,,743,0.336795,8


In [86]:
top_decile_cust.pivot_table(index='Decile', columns='PL_target', values='customer_id', aggfunc='count')

PL_target,0,1
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1
8,12127,45
9,19401,64
10,22952,141


In [87]:
QUERY = f"""

SELECT
    mdn as mobilenumber,
    CASE
      WHEN ac.installation_type = "uninstalled" THEN FALSE
      ELSE TRUE
  END
    AS active_flag
  FROM
 (
    SELECT
      mdn,
      CASE
        WHEN installation_type IS NULL THEN 'NA'
        ELSE installation_type
    END
      AS installation_type,
      created_date
    FROM (
      SELECT
        mdn,
        installation_type,
        created_date,
        ROW_NUMBER() OVER (PARTITION BY mdn ORDER BY created_date DESC) AS row_number
      FROM
        `abcd-dataplatform-prod.abcd_mobileapp_raw.ABCDPRODDB_t_customer_activation`)
    WHERE
      row_number = 1) ac

"""

active_inactive_df = client.query(QUERY).to_dataframe()
active_inactive_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4764262 entries, 0 to 4764261
Data columns (total 2 columns):
 #   Column        Dtype  
---  ------        -----  
 0   mobilenumber  object 
 1   active_flag   boolean
dtypes: boolean(1), object(1)
memory usage: 313.5 MB


In [91]:
active_inactive_df.head()

Unnamed: 0,mobilenumber,active_flag
0,6000231782,False
1,6000399519,False
2,6000541005,False
3,6000832007,False
4,6000862340,False


In [92]:
active_inactive_df['mobilenumber'] = active_inactive_df['mobilenumber'].astype('string')
top_decile_cust['mobilenumber'] = top_decile_cust['mobilenumber'].astype('string')

In [99]:
top_decile_cust = top_decile_cust.merge(active_inactive_df, on = 'mobilenumber', how='left')

In [101]:
top_decile_cust['active_flag'] = top_decile_cust['active_flag'].fillna(True)

In [103]:
top_decile_cust.head()

Unnamed: 0,snapshot_month,customer_id,install_flag,registration_date,PL_target,personal_details_complete_datetime,mobilenumber,REF_MONTH,min_days_since_loan_closure_personal_loan_off_us,min_days_since_last_payment_personal_loan_off_us,total_active_loans_short_term_personal_loan_on_us,min_days_since_loan_closure_consumer_loan_off_us,total_active_loans_last_3m_personal_loan_off_us,total_closed_loans_personal_loan_off_us,total_loans_consumer_loan_off_us,total_loans_personal_loan_off_us,AVG_AMT_INQ,cc_consent_flag,GENDER,age_in_years,OCCUPATION,Annual_Income,pred_prob1,rank,Decile,Score_V3,SCRUB_DATE,INGESTION_TMS,Score_V3_AD,SCRUB_DATE_AD,occupation_sms,Final_Occupation,Final_Score,active_flag
0,Apr-2025,5128979,1,2025-03-27,0,NaT,7489310001,2025-03-31,26.0,25.0,,,12.0,21.0,2.0,38.0,75000.0,0.0,1.0,32.0,1.0,,0.97941,1784127.0,10,740,2025-03-28,2025-03-29 16:08:00.332966+00:00,,NaT,1,1,740,True
1,Apr-2025,5126359,1,2025-03-26,0,NaT,9052122529,2025-03-31,,,,,,,,,310000.0,0.0,1.0,29.0,,,0.971275,1784031.0,10,743,2025-03-27,2025-04-01 06:38:21.850220+00:00,,NaT,1,1,743,False
2,Apr-2025,4723045,1,2025-03-07,0,NaT,9840094621,2025-03-31,49.0,46.0,,,7.0,22.0,,43.0,50000.0,0.0,2.0,46.0,1.0,122000.0,0.970935,1784026.0,10,739,2025-03-08,2025-03-18 07:24:32.309849+00:00,,NaT,1,1,739,True
3,Apr-2025,3082127,1,2024-11-17,0,NaT,9773077230,2025-03-31,,,,,,,,,283333.333333,0.0,1.0,52.0,,,0.969951,1784004.0,10,764,2025-02-18,2025-03-03 08:35:13.822667+00:00,764.0,2025-01-27,1,1,764,True
4,Apr-2025,2715908,1,2024-10-30,0,NaT,9822393546,2025-03-31,,,,,,,,,600000.0,0.0,2.0,36.0,4.0,,0.967119,1783857.0,10,775,2025-03-18,2025-03-19 16:05:25.189763+00:00,775.0,2025-03-17,1,1,775,True


In [104]:
top_decile_cust['product'] = 'PL'
top_decile_cust['cohort'] = 'CO_0'

In [107]:
top_decile_cust = top_decile_cust[['customer_id','active_flag','product','cohort']]

In [111]:
top_decile_cust.shape

(54730, 4)

In [110]:
top_decile_cust.active_flag.value_counts()

active_flag
True     54171
False      559
Name: count, dtype: Int64

In [108]:
table_name = 't_dg_pl_xsell_apr2025_scores_final'
pandas_gbq.to_gbq(
    dataframe=top_decile_cust,
    destination_table=f'abcd_data_science_app.{table_name}',
    project_id='abcd-dataplatform',
    if_exists='append'
)

100%|██████████| 1/1 [00:00<00:00, 8035.07it/s]


In [None]:
# table_name = 't_dg_pl_xell_march2025_scores_final'
# pandas_gbq.to_gbq(
#     dataframe=top_decile_cust,
#     destination_table=f'abcd_data_science_app.{table_name}',
#     project_id='abcd-dataplatform',
#     if_exists='append'
# )