In [8]:
# ===== Step 1: Setup (run this) =====

import os
from pathlib import Path
import pandas as pd
import numpy as np
from statistics import mode
from dateutil.relativedelta import relativedelta
import datetime as dt

# --- Paths ---
# Everything hangs off one project root. Set the PDM_ROOT environment variable to
# point the notebook at another machine's folders instead of editing the code;
# without it the defaults resolve to the same D:\PDM locations as before.
PDM_ROOT = Path(os.environ.get("PDM_ROOT", r"D:\PDM"))
_V2_DIR = PDM_ROOT / "ILCV_2DD_PDM" / "V2_ALL_DD"

TRAIN_FILE = _V2_DIR / "ILCV_ALL_DD_MERGED.csv"
ALLOC_PATH = PDM_ROOT / "ALLOCATION_X"  # where data_YYYYMM.xlsx live
OUTPUT_PATH = _V2_DIR / "TARGET_VAR_CREATED_DATA"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# --- Month list: most recent first, AUG25 → ... → JAN24 ---
month_variable1 = [
    'AUG25','JUL25','JUN25','MAY25','APR25','MAR25','FEB25','JAN25',
    'DEC24','NOV24','OCT24','SEP24','AUG24','JUL24',
    'JUN24','MAY24','APR24','MAR24','FEB24','JAN24'
]

# --- Master bounce timeline (most-recent first, then going back) ---
bounce_timeline_master = [
    '202507','202506','202505','202504','202503','202502','202501','202412','202411','202410',
    '202409','202408','202407','202406','202405','202404','202403','202402',
    '202401','202312','202311','202310','202309','202308','202307','202306',
    '202305','202304','202303','202302','202301','202212','202211','202210',
    '202209','202208','202207','202206','202205','202204','202203','202202',
    '202201','202112','202111','202110','202109','202108','202107','202106',
    '202105','202104','202103','202102','202101','202012','202011','202010',
    '202009','202008','202007','202006','202005','202004','202003','202002',
    '202001','201912','201911','201910'
]

# Number of allocation months in each bounce-history window.
BOUNCE_WINDOW_MONTHS = 24

# --- Map each month_variable to its bounce window (shifted by index) ---
# The pairing is purely positional: the month at position k in month_variable1 gets
# bounce_timeline_master[k : k + BOUNCE_WINDOW_MONTHS].
# NOTE(review): with the current lists every window starts one calendar month before
# its label (AUG25 → 202507). Adding a month to only one of the two lists silently
# shifts every window — confirm the intended offset whenever either list is edited.
if len(set(month_variable1)) != len(month_variable1):
    # A repeated label would overwrite its own entry and misalign the positions.
    raise ValueError("month_variable1 contains duplicate month labels.")

# Dictionary to store the bounce window for each month
results = {}
for idx, m in enumerate(month_variable1):
    window = bounce_timeline_master[idx: idx + BOUNCE_WINDOW_MONTHS]
    if len(window) < BOUNCE_WINDOW_MONTHS:
        raise ValueError(
            f"Bounce window for {m} is shorter than {BOUNCE_WINDOW_MONTHS} months. "
            "Check bounce_timeline_master length."
        )
    results[m] = window

# Month abbreviations accepted in labels such as 'JUN25', in calendar order.
_MONTH_ABBRS = ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")


# --- month_variable ('JUN25') → target_variable ('data_202507') i.e., +1 month ---
def month_to_target(month_variable: str) -> str:
    """Return the allocation-file stem for the month after ``month_variable``.

    Args:
        month_variable: Month label made of a three-letter English month
            abbreviation (any case) and a two-digit year, e.g. ``'JUN25'``.

    Returns:
        ``'data_YYYYMM'`` for the following calendar month, e.g. ``'data_202507'``.

    Raises:
        ValueError: If the label is not of the form ``MMMYY``.
    """
    # Parsed by hand rather than with strptime("%b%y"): %b follows the process
    # locale, so English abbreviations can fail to parse under a non-English locale.
    label = str(month_variable)
    abbr, yy = label[:3].upper(), label[3:]
    if abbr not in _MONTH_ABBRS or len(yy) != 2 or not yy.isdigit():
        raise ValueError(
            f"Invalid month label {month_variable!r}: expected e.g. 'JUN25' (MMMYY)."
        )

    # Same two-digit-year pivot as strptime's %y: 00-68 → 20xx, 69-99 → 19xx.
    year = int(yy)
    year += 2000 if year < 69 else 1900
    month = _MONTH_ABBRS.index(abbr) + 1

    # +1 month, rolling December over into January of the next year.
    if month == 12:
        year, month = year + 1, 1
    else:
        month += 1
    return f"data_{year:04d}{month:02d}"

# --- Quick visual checks on the first three months (prints only) ---
preview_months = month_variable1[:3]

# Each label should map to the allocation file of the following month.
print("First 3 month mappings to target_variable (+1M):")
for label in preview_months:
    target_name = month_to_target(label)
    print(f"  {label} ‚Üí {target_name}")

# Show both ends of every bounce window so a shifted window is easy to spot.
print("\nBounce window sanity check (show first & last 3 months in each window):")
for label in preview_months:
    newest, oldest = results[label][:3], results[label][-3:]
    print(f"  {label}: {newest} ... {oldest}")


First 3 month mappings to target_variable (+1M):
  AUG25 → data_202509
  JUL25 → data_202508
  JUN25 → data_202507

Bounce window sanity check (show first & last 3 months in each window):
  AUG25: ['202507', '202506', '202505'] ... ['202310', '202309', '202308']
  JUL25: ['202506', '202505', '202504'] ... ['202309', '202308', '202307']
  JUN25: ['202505', '202504', '202503'] ... ['202308', '202307', '202306']


# Final working code

In [12]:
# Allocation label that sets a flag to 1; any other label, or a contract that is
# absent from the file, counts as 0.
_BUCKET_X_LABEL = "Bucket X"

# Parsed allocation workbooks: path -> (mtime_ns, flags). The bounce windows of
# consecutive months overlap almost entirely, so without this each workbook would be
# re-read from disk once per month processed.
_ALLOC_FLAG_CACHE = {}


def _clean_conno(series: pd.Series) -> pd.Series:
    """Cast contract numbers to str and remove a trailing '.0' if present."""
    return series.astype(str).str.replace(r"\.0$", "", regex=True)


def _load_bucket_x_flags(file: Path) -> pd.Series:
    """Read one allocation workbook into a 0/1 'Bucket X' flag per contract.

    Returns an int Series named ``bkt_x`` indexed by cleaned CONNO. The index is
    always unique: rows without a contract_code are dropped (they would otherwise
    become the literal key "nan"), and a contract listed more than once is flagged
    1 if any of its rows is in Bucket X.
    NOTE(review): "any row in Bucket X" is an assumed tie-break for duplicated
    contracts — confirm against how the allocation files are produced.

    The result is cached per file and reloaded when the file's mtime changes;
    callers must not mutate the returned Series.
    """
    stamp = file.stat().st_mtime_ns
    cached = _ALLOC_FLAG_CACHE.get(file)
    if cached is not None and cached[0] == stamp:
        return cached[1]

    raw = pd.read_excel(file, dtype=str)[["contract_code", "allocation_bucket"]]
    raw = raw.dropna(subset=["contract_code"])
    flags = pd.Series(
        # A missing allocation_bucket compares unequal, so it maps to 0.
        (raw["allocation_bucket"] == _BUCKET_X_LABEL).astype(int).to_numpy(),
        index=pd.Index(_clean_conno(raw["contract_code"]), name="CONNO"),
        name="bkt_x",
    )
    # sort=False keeps first-appearance order so the debug samples stay readable.
    flags = flags.groupby(level=0, sort=False).max()

    _ALLOC_FLAG_CACHE[file] = (stamp, flags)
    return flags


def process_month(month_variable: str, train_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the target flag and the bounce-history string to one month of data.

    Relies on the notebook-level ``ALLOC_PATH``, ``results`` and ``month_to_target``.

    Args:
        month_variable: Month label (e.g. ``'JUL25'``); must be a key of ``results``.
        train_df: Training data with at least ``MONTH`` and ``CONNO`` columns.

    Returns:
        The rows of ``train_df`` for ``month_variable`` (fresh RangeIndex, CONNO
        cleaned to str) with two extra columns:
        ``target_variable`` — 1 if the contract is in "Bucket X" in the following
        month's allocation file, else 0;
        ``XBktString_L24M`` — comma-joined 0/1 "Bucket X" flags, one per month of
        ``results[month_variable]`` in that order. A month whose file is missing
        contributes 0 so that positions always line up across contracts and months.

    Raises:
        FileNotFoundError: If the following month's allocation file does not exist.
        KeyError: If ``month_variable`` is not in ``results`` or a required column
            is missing.
    """
    print("=" * 70)
    print(f"🚀 Starting processing for: {month_variable}")

    # 1. Slice training data for the current month
    print("🔹 Step 1: Filtering training data...")
    data_month = train_df.loc[train_df["MONTH"] == month_variable].reset_index(drop=True)
    print(f"   → Found {data_month.shape[0]:,} rows and {data_month.shape[1]} columns for {month_variable}")
    data_month["CONNO"] = _clean_conno(data_month["CONNO"])

    # 2. Load target variable (+1 month)
    target_file = ALLOC_PATH / f"{month_to_target(month_variable)}.xlsx"
    print(f"🔹 Step 2: Loading target variable from {target_file.name} ...")
    if not target_file.exists():
        raise FileNotFoundError(f"❌ Target file not found: {target_file}")
    target_flags = _load_bucket_x_flags(target_file)
    print(f"   → Loaded {len(target_flags):,} target contracts")

    # 🔎 Debug prints for CONNO
    print("   Train CONNO sample:", data_month["CONNO"].head().tolist())
    print("   Target CONNO sample:", target_flags.index[:5].tolist())
    print("   Train CONNO dtype:", data_month["CONNO"].dtype)
    print("   Target CONNO dtype:", target_flags.index.dtype)
    print("   Unique CONNO counts → Train:", data_month["CONNO"].nunique(),
          "Target:", len(target_flags))

    # Look the flag up per row instead of merging: the lookup index is unique, so
    # the row count cannot change. NaN here means "contract not in the target file".
    matched = data_month["CONNO"].map(target_flags)
    data_month["target_variable"] = matched.fillna(0).astype(int)
    print(f"   → After merging target: {data_month.shape}")
    print(f"   Matched target rows (sum=1s): {data_month['target_variable'].sum()}")
    # Counted before the fill — the filled column itself never contains NaN.
    print(f"   Unmatched target rows (NaN): {int(matched.isna().sum())}")

    # 3. Build bounce history
    bounce_timeline = results[month_variable]
    print(f"🔹 Step 3: Building bounce history ({len(bounce_timeline)} months) for {month_variable}")
    unique_connos = pd.Index(data_month["CONNO"].unique(), name="CONNO")
    print(f"   → Starting base with {len(unique_connos):,} unique CONNOs")

    bounce_columns = {}
    for period in bounce_timeline:
        file = ALLOC_PATH / f"data_{period}.xlsx"
        if file.exists():
            flags = _load_bucket_x_flags(file)
            # 🔎 Debug sample
            print(f"      Bounce {period} CONNO sample:", flags.index[:5].tolist())
            column = flags.reindex(unique_connos).fillna(0).astype(int).to_numpy()
            print(f"      ✔ Added bounce month {period}")
        else:
            # Keep the slot (as zeros) rather than skipping it, otherwise every
            # later position in the string would shift by one.
            print(f"   ⚠ Missing bounce file: {file.name} → treating month {period} as all zeros")
            column = np.zeros(len(unique_connos), dtype=int)
        bounce_columns[f"bkt_{period}"] = column

    history = pd.DataFrame(bounce_columns, index=unique_connos)
    print(f"   → Bounce history complete: {history.shape}")

    # Collapse all bounce cols → XBktString_L24M (one string per unique CONNO)
    bkt_strings = pd.Series(
        [",".join(row) for row in history.astype(str).to_numpy()],
        index=unique_connos,
        dtype=object,
    )
    print(f"   → XBktString_L24M created for {len(bkt_strings):,} CONNOs")

    # 4. Attach the string to every row of the month
    data_month["XBktString_L24M"] = data_month["CONNO"].map(bkt_strings)
    print(f"🔹 Step 4: Final merged dataset shape: {data_month.shape}")

    print(f"✅ Finished processing for {month_variable}")
    print("=" * 70)
    return data_month


In [18]:
def main(train_df=None):
    """Run ``process_month`` for every month in ``results`` and stack the outputs.

    Args:
        train_df: Training data to slice per month. When omitted, a notebook-level
            ``train_df`` is used if one exists; otherwise the data is read from
            ``TRAIN_FILE``.
            NOTE(review): the cell that originally loaded ``train_df`` is not part
            of this notebook, so the plain ``pd.read_csv`` fallback is an
            assumption — confirm the read options (dtypes, index column).

    Returns:
        One DataFrame with the rows of all successfully processed months (fresh
        RangeIndex), or ``None`` if no month could be processed. A month that
        fails is reported and skipped; it does not stop the run.
    """
    # Resolve the input once, up front. Otherwise a missing `train_df` surfaces as
    # a NameError inside the loop, where it is swallowed once per month.
    if train_df is None:
        train_df = globals().get("train_df")
    if train_df is None:
        print(f"🔹 Loading training data from {TRAIN_FILE} ...")
        train_df = pd.read_csv(TRAIN_FILE)

    all_outputs = []
    failed_months = []

    for month in results:   # runs in the order of the dict
        try:
            all_outputs.append(process_month(month, train_df))
        except Exception as e:
            # Best effort: e.g. the newest month has no target file yet.
            print(f"❌ Error processing {month}: {type(e).__name__}: {e}")
            failed_months.append(month)

    if failed_months:
        print(f"\n⚠ Skipped {len(failed_months)} month(s): {', '.join(failed_months)}")

    if not all_outputs:
        print("❌ No dataframes created, something went wrong.")
        return None

    # Concatenate all months into one big DataFrame
    out = pd.concat(all_outputs, ignore_index=True)
    print(f"\n🚀 Step 2 completed for {len(all_outputs)} of {len(results)} months")
    print(f"✅ Final concatenated dataset shape: {out.shape}")
    print(f"   Total months processed: {len(all_outputs)}")
    return out


In [19]:
# Run the pipeline for every month in `results`; `out` is the concatenated
# DataFrame, or None when no month could be processed.
out = main()


🚀 Starting processing for: AUG25
🔹 Step 1: Filtering training data...
   → Found 17,731 rows and 61 columns for AUG25
🔹 Step 2: Loading target variable from data_202509.xlsx ...
❌ Error processing AUG25: ❌ Target file not found: D:\PDM\ALLOCATION_X\data_202509.xlsx
🚀 Starting processing for: JUL25
🔹 Step 1: Filtering training data...
   → Found 17,999 rows and 61 columns for JUL25
🔹 Step 2: Loading target variable from data_202508.xlsx ...
   → Loaded 50,863 target rows
   Train CONNO sample: ['5002223826', '5002227349', '5002228725', '5002235182', '5002246083']
   Target CONNO sample: ['5002474518', '5002481467', '5002482387', '5002486088', '5002486118']
   Train CONNO dtype: object
   Target CONNO dtype: object
   Unique CONNO counts → Train: 17999 Target: 50863
   → After merging target: (17999, 62)
   Matched target rows (sum=1s): 3246
   Unmatched target rows (NaN): 0
🔹 Step 3: Building bounce history (24 months) for JUL25
   ‚Üí Starting base wi

In [20]:
# Display the combined result (the notebook truncates the middle rows and columns).
out

Unnamed: 0,CONNO,TENURE,SCHEME_FINPROD,IRR_CUSTOMER,ASSET_COST,PDC_FLAG,COMPANY_CODE,COLLECTION_TILL_DATE,SOHP,BPNO,...,REGION,CUSTOMER_SEGMENT,CUST_SEG,LTV,COMPANY_INDIVIDUAL,DSA_DLR,SOURCING,ASSET_GROUP,target_variable,XBktString_L24M
0,5002223826,59,LOAN,12.901,999000.0,NACH,5000,1290169.0,0.00,2002881244,...,EAST-1,FTU-PROPERTY,FTU,0.849850,INDIVIDUAL,80145,DDSA,LCV,0,000000000000000000000000
1,5002227349,67,LOAN,13.883,1496644.0,NACH,5000,1881164.0,0.00,2002884020,...,SOUTH-1,FTU-EXP,FTU,0.801794,INDIVIDUAL,DNA,DIRECT,ICV TRK & TIP,0,000000000000000000000000
2,5002228725,68,LOAN,12.909,1835000.0,Auto Deb,5000,2363931.0,0.00,2002884258,...,SOUTH-1,RTL CAPTIVE,RTL & CAP,0.844687,COMPANY,DNA,DIRECT,ICV BUS,0,000000000000000000000000
3,5002235182,54,LOAN,12.453,1265141.0,NACH,5000,1476082.0,0.00,2002890107,...,EAST-2,SUB RTL-RTR,SUB RETAIL,0.849708,INDIVIDUAL,82071,DDSA,ICV TRK & TIP,0,000000000000000000000000
4,5002246083,47,LOAN,16.125,658282.0,NACH,5000,795813.0,0.00,2002898654,...,EAST-2,FTU-PROPERTY,FTU,0.789935,INDIVIDUAL,0001004780,DEALER,LCV,0,000000000000000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426766,5004637991,46,LOAN,9.243,2400000.0,NPDC,5000,0.0,2199999.98,2001253415,...,EAST-1,SUPER STR,STRATEGIC,0.916667,COMPANY,DNA,DIRECT,ICV TRK & TIP,0,000000000000000000000000
426767,5004638150,72,LOAN,11.630,3023000.0,NPDC,5000,0.0,2592510.00,2004168869,...,NORTH-2,SUB RTL-NTR,SUB RETAIL,0.826993,INDIVIDUAL,82061,DDSA,ICV BUS,0,000000000000000000000000
426768,5004638359,60,LOAN,12.512,2115000.0,NPDC,5000,0.0,1797750.04,2004563459,...,WEST-2,FTU-PROPERTY,FTU,0.850000,INDIVIDUAL,DNA,DIRECT,ICV BUS,0,000000000000000000000000
426769,5004638477,35,LOAN,10.486,1635303.0,NPDC,5000,0.0,1389999.97,2004563155,...,EAST-1,RETAIL,RTL & CAP,0.849995,INDIVIDUAL,DNA,DIRECT,LCV,0,000000000000000000000000


In [21]:
# Persist the combined dataset into the configured output folder.
if out is None:
    # main() returns None when no month was processed; fail with a clear message
    # instead of an AttributeError on `None.to_csv`.
    raise RuntimeError("Nothing to save: main() returned None (no month was processed).")

# index=False: the RangeIndex carries no information and would come back as an
# extra "Unnamed: 0" column the next time the file is read.
out.to_csv(OUTPUT_PATH / "PDM_2DD_TARGET_&_X_BKT_STRING_DATA_V2.csv", index=False)