In [7]:
# Home_Credit_Production_Notebook.ipynb - COMPLETE PRODUCTION SCORING PIPELINE

import pandas as pd
import numpy as np
import joblib                         
import gc
import os # <--- NEW: Import os module for file system operations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Removed unused imports: SGDClassifier, LogisticRegression, roc_auc_score, train_test_split

# --- 1. CONFIGURATION ---
DATA_ROOT = './data/'
MAIN_FILE = DATA_ROOT + 'application_test.csv'  # CRITICAL: Using test data for scoring
MODEL_SAVE_PATH = './models/full_pipeline.joblib'
TARGET_COLUMN = 'TARGET' 
RANDOM_SEED = 42
OUTPUT_PATH = './submission/credit_risk_scores.csv' # Output file for final scores

print("--- Starting Production Scoring Pipeline ---")

# ====================================================================
# 2. UTILITY AND FEATURE ENGINEERING FUNCTIONS (REMAIN UNCHANGED)
# ====================================================================

def downcast_dtypes(df):
    """Memory optimization: downcast numerical columns to smaller types."""
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > -128 and c_max < 127:
                    df[col] = df[col].astype(np.int8)
                elif c_min > -32768 and c_max < 32767:
                    df[col] = df[col].astype(np.int16)
                elif c_min > -2147483648 and c_max < 2147483647:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory reduced from {start_mem:.2f} MB to {end_mem:.2f} MB')
    return df

def feature_engineer_application_data(df):
    """Applies cleaning and feature engineering to the main application data."""
    
    cols_to_drop = ['FLAG_MOBIL', 'FLAG_DOCUMENT_2']
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # FIX: DAYS_EMPLOYED Outlier Treatment (CRITICAL)
    df['DAYS_EMPLOYED_ANOM'] = (df['DAYS_EMPLOYED'] == 365243).astype(np.int8)
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
    
    time_cols_to_abs = [c for c in df.columns if c.startswith('DAYS_')]
    df[time_cols_to_abs] = df[time_cols_to_abs].abs()

    df['AGE_YEARS'] = df['DAYS_BIRTH'] / 365.25
    df['EMPLOYED_TO_AGE_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    
    redundant_suffixes = ['_MEDI', '_MODE']
    cols_to_drop = [c for c in df.columns if any(c.endswith(s) and not c.endswith('_AVG') for s in redundant_suffixes)]
    cols_to_drop.extend(['YEARS_BEGINEXPLUATATION_MEDI'])
    df = df.drop(columns=list(set(cols_to_drop)), errors='ignore')
    
    bureau_req_cols = [col for col in df.columns if col.startswith('AMT_REQ_CREDIT_BUREAU')]
    df[bureau_req_cols] = df[bureau_req_cols].fillna(0)
    
    return df

# ====================================================================
# 3. FEATURE AGGREGATION FUNCTIONS (REMAIN UNCHANGED)
# ====================================================================

def merge_bureau_data(df_main):
    """Processes bureau data and merges."""
    print("\nProcessing Bureau data (bureau.csv + bureau_balance.csv) with advanced features...")
    bb = pd.read_csv(DATA_ROOT + 'bureau_balance.csv'); bb = downcast_dtypes(bb)
    bb_cat = pd.get_dummies(bb, columns=['STATUS'], dummy_na=True)
    bb_agg_month = bb_cat.groupby('SK_ID_BUREAU')[
        ['MONTHS_BALANCE', 'STATUS_0', 'STATUS_1', 'STATUS_C', 'STATUS_X']
    ].agg(['min', 'max', 'mean', 'sum', 'count'])
    bb_agg_month.columns = pd.Index(['BB_' + e[0] + "_" + e[1].upper() for e in bb_agg_month.columns.tolist()])
    bb_agg_month = bb_agg_month.reset_index()
    del bb, bb_cat; gc.collect()

    bureau = pd.read_csv(DATA_ROOT + 'bureau.csv'); bureau = downcast_dtypes(bureau)
    bureau = bureau.merge(bb_agg_month, on='SK_ID_BUREAU', how='left')
    del bb_agg_month; gc.collect()
    
    bureau_cat = pd.get_dummies(bureau, columns=['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'], dummy_na=True)
    num_agg = {
        'DAYS_CREDIT': ['mean', 'max'], 'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'], 'AMT_CREDIT_SUM': ['sum', 'mean']
    }
    cat_agg = {
        'CREDIT_ACTIVE_Active': ['mean', 'sum'], 'CREDIT_ACTIVE_Closed': ['mean', 'sum'], 
        'CREDIT_TYPE_Consumer credit': ['mean'], 'CREDIT_TYPE_Credit card': ['mean'], 
        'BB_MONTHS_BALANCE_COUNT': ['mean'] 
    }
    bureau_agg = bureau_cat.groupby('SK_ID_CURR').agg({**num_agg, **cat_agg})
    bureau_agg.columns = pd.Index(['BUREAU_' + '_'.join(col).upper() for col in bureau_agg.columns.ravel()])
    bureau_agg = bureau_agg.reset_index()

    df_main = df_main.merge(bureau_agg, on='SK_ID_CURR', how='left')
    del bureau, bureau_cat, bureau_agg; gc.collect()
    
    print(f"Bureau merge complete. Final Main Dataframe shape: {df_main.shape}")
    return df_main


def get_prev_application_features(df_main):
    """Processes previous_application data and merges."""
    print("\nProcessing Previous Applications data (with high-value ratios and status)...")
    prev = pd.read_csv(DATA_ROOT + 'previous_application.csv'); prev = downcast_dtypes(prev)
    prev['CREDIT_TO_APP_RATIO'] = prev['AMT_CREDIT'] / prev['AMT_APPLICATION']
    prev['ANNUITY_TO_CREDIT_RATIO'] = prev['AMT_ANNUITY'] / prev['AMT_CREDIT']
    prev = prev.replace([np.inf, -np.inf], np.nan) 
    
    prev_cat = pd.get_dummies(prev, columns=['NAME_CONTRACT_STATUS', 'NAME_YIELD_GROUP'], dummy_na=True)
    num_agg = {
        'AMT_CREDIT': ['mean', 'sum', 'max'], 'AMT_ANNUITY': ['mean', 'sum'],
        'AMT_APPLICATION': ['mean', 'sum'], 'RATE_DOWN_PAYMENT': ['mean'],
        'DAYS_DECISION': ['mean', 'min', 'max'], 'CREDIT_TO_APP_RATIO': ['mean'],
    }
    cat_agg = {
        'NAME_CONTRACT_STATUS_Approved': ['mean', 'sum'], 'NAME_CONTRACT_STATUS_Refused': ['mean', 'sum'],
        'NAME_YIELD_GROUP_low_action': ['mean'], 'NAME_YIELD_GROUP_high': ['mean']
    }
    prev_agg = prev_cat.groupby('SK_ID_CURR').agg({**num_agg, **cat_agg})
    prev_agg.columns = pd.Index(['PREV_' + '_'.join(col).upper() for col in prev_agg.columns.ravel()])
    prev_agg = prev_agg.reset_index()

    del prev, prev_cat; gc.collect()
    df_main = df_main.merge(prev_agg, on='SK_ID_CURR', how='left')
    return df_main

def get_temporal_features(df_main):
    """Processes the three temporal balance files and merges."""
    print("\nProcessing Temporal Balance data (Installments, POS_CASH, Credit Card)...")
    
    # Installments Payments
    install = pd.read_csv(DATA_ROOT + 'installments_payments.csv'); install = downcast_dtypes(install)
    install['DPD'] = install['DAYS_ENTRY_PAYMENT'] - install['DAYS_INSTALMENT']
    install['DPD'] = install['DPD'].apply(lambda x: x if x > 0 else 0)
    install_agg = install.groupby('SK_ID_CURR')[['DPD', 'AMT_PAYMENT']].agg(['mean', 'sum']).reset_index() 
    install_agg.columns = ['SK_ID_CURR', 'INST_DPD_MEAN', 'INST_DPD_SUM', 'INST_PAYMENT_MEAN', 'INST_PAYMENT_SUM']
    df_main = df_main.merge(install_agg, on='SK_ID_CURR', how='left')
    del install, install_agg; gc.collect()
    
    # POS_CASH Balance
    pos = pd.read_csv(DATA_ROOT + 'POS_CASH_balance.csv'); pos = downcast_dtypes(pos)
    pos_agg = pos.groupby('SK_ID_CURR')[['CNT_INSTALMENT_FUTURE']].agg(['min', 'max', 'mean']).reset_index() 
    pos_agg.columns = ['SK_ID_CURR', 'POS_INST_FUTURE_MIN', 'POS_INST_FUTURE_MAX', 'POS_INST_FUTURE_MEAN']
    df_main = df_main.merge(pos_agg, on='SK_ID_CURR', how='left')
    del pos, pos_agg; gc.collect()

    # Credit Card Balance
    cc = pd.read_csv(DATA_ROOT + 'credit_card_balance.csv'); cc = downcast_dtypes(cc)
    cc_agg = cc.groupby('SK_ID_CURR')[['AMT_BALANCE']].agg(['mean', 'max']).reset_index()
    cc_agg.columns = ['SK_ID_CURR', 'CC_BALANCE_MEAN', 'CC_BALANCE_MAX']
    df_main = df_main.merge(cc_agg, on='SK_ID_CURR', how='left')
    del cc, cc_agg; gc.collect()
    
    return df_main

# ====================================================================
# 4. SCORING PIPELINE EXECUTION BLOCK (Runs Automatically)
# ====================================================================

try:
    # --- STEP 1: LOAD TEST DATA & INITIAL CLEANING ---
    df_full = pd.read_csv(MAIN_FILE)
    df_full = downcast_dtypes(df_full)
    df_full = feature_engineer_application_data(df_full)
    df_full = pd.get_dummies(df_full)
    
    X_raw = df_full 
    del df_full; gc.collect()

    # --- STEP 2: GENERATE FULL FEATURE SET ---
    print("\n--- STEP 2: GENERATE FULL FEATURE SET FOR SCORING ---")
    
    df_current = X_raw.copy()
    
    # Execute all feature engineering functions in sequence
    df_current = merge_bureau_data(df_current) 
    df_current = get_prev_application_features(df_current)
    df_current = get_temporal_features(df_current)

    # --- STEP 3: LOAD MODEL AND SCORE DATA (FEATURE RECONCILIATION) ---
    print("\n--- STEP 3: LOAD MODEL AND SCORE DATA ---")
    
    # 1. Load the saved pipeline
    print(f"\nLoading final production model from {MODEL_SAVE_PATH}...")
    final_pipeline = joblib.load(MODEL_SAVE_PATH)
    
    # CRITICAL FIX 1: Retrieve the 241 feature names from the saved model artifact
    TRAIN_FEATURES_241 = list(final_pipeline['preprocessor'].named_steps['imputer'].feature_names_in_)
    
    # 2. Final data preparation: Drop SK_ID_CURR and REINDEX against the 241 list
    X_test_final = df_current.drop(columns='SK_ID_CURR', errors='ignore')
    
    # CRITICAL FIX 2: Reindex against the 241 features from training. 
    X_test_final = X_test_final.reindex(columns=TRAIN_FEATURES_241, fill_value=0)
    
    # 3. Predict probabilities (the final risk score)
    y_pred_proba = final_pipeline.predict_proba(X_test_final.values)[:, 1] 

    # 4. Save the final output
    
    # CRITICAL FIX 3: Create the submission directory if it doesn't exist
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) 
    
    output_df = pd.DataFrame({
        'SK_ID_CURR': df_current['SK_ID_CURR'],
        'TARGET': y_pred_proba
    })

    output_df.to_csv(OUTPUT_PATH, index=False)
    
    print(f"\nâœ… Production Scoring Complete. {len(output_df)} risk scores generated.")
    print(f"Results saved to {OUTPUT_PATH} (File: credit_risk_scores.csv)")
    del df_current, X_test_final; gc.collect()

except Exception as e:
    print(f"\nðŸ›‘ FATAL ERROR DURING SCORING PIPELINE EXECUTION: {e}")

--- Starting Production Scoring Pipeline ---
Memory reduced from 45.00 MB to 14.60 MB


  df['DAYS_EMPLOYED_ANOM'] = (df['DAYS_EMPLOYED'] == 365243).astype(np.int8)
  df['AGE_YEARS'] = df['DAYS_BIRTH'] / 365.25
  df['EMPLOYED_TO_AGE_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
  df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']



--- STEP 2: GENERATE FULL FEATURE SET FOR SCORING ---

Processing Bureau data (bureau.csv + bureau_balance.csv) with advanced features...
Memory reduced from 624.85 MB to 338.46 MB
Memory reduced from 222.62 MB to 112.95 MB
Bureau merge complete. Final Main Dataframe shape: (48744, 212)

Processing Previous Applications data (with high-value ratios and status)...
Memory reduced from 471.48 MB to 309.01 MB

Processing Temporal Balance data (Installments, POS_CASH, Credit Card)...
Memory reduced from 830.41 MB to 311.40 MB
Memory reduced from 610.43 MB to 238.45 MB
Memory reduced from 673.88 MB to 289.33 MB

--- STEP 3: LOAD MODEL AND SCORE DATA ---

Loading final production model from ./models/full_pipeline.joblib...





âœ… Production Scoring Complete. 48744 risk scores generated.
Results saved to ./submission/credit_risk_scores.csv (File: credit_risk_scores.csv)
