In [17]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib
import os

# === Load and preprocess training data ===
df = pd.read_csv('./training_recovery_data.csv')
df['txn_date'] = pd.to_datetime(df['txn_date'])
df['success_flag'] = (df['transaction_status'] == 'success').astype(int)

# Label Encoding
le_card = LabelEncoder()
le_brand = LabelEncoder()
le_currency = LabelEncoder()
df['card_number_enc'] = le_card.fit_transform(df['card_number'])
df['card_brand_enc'] = le_brand.fit_transform(df['card_brand'])
df['currency_enc'] = le_currency.fit_transform(df['currency_code'])

# Sort for time-based features
df.sort_values(by=['card_number', 'txn_date'], inplace=True)
df['txn_index'] = df.groupby('card_number').cumcount()

# Rolling stats per card
df['rolling_success_7d'] = (
    df.groupby('card_number')['success_flag']
    .rolling(7, min_periods=1).mean().reset_index(level=0, drop=True)
)

# Build date-based features
df['weekday'] = df['txn_date'].dt.weekday
df['month'] = df['txn_date'].dt.month
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
df['is_month_start'] = (df['txn_date'].dt.day <= 3).astype(int)
df['is_month_end'] = (df['txn_date'].dt.day >= 28).astype(int)

# Aggregate historical features
agg = df.groupby('card_number').agg({
    'success_flag': ['sum', 'mean'],
    'transaction_amount': 'mean',
    'txn_date': 'max'
})
agg.columns = ['total_success', 'avg_success_rate', 'avg_amount', 'last_txn_date']

card_stats = agg.reset_index()

# === Prepare final training set ===
df = df.merge(card_stats, on='card_number', how='left')
df['days_since_last_txn'] = (df['txn_date'] - df['last_txn_date']).dt.days.abs()

features = [
    'card_number_enc', 'card_brand_enc', 'currency_enc', 'transaction_amount',
    'weekday', 'month', 'is_weekend', 'is_month_start', 'is_month_end',
    'txn_index', 'rolling_success_7d', 'avg_success_rate', 'days_since_last_txn'
]
target = 'success_flag'

X = df[features]
y = df[target]

# Split train/test for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Remove existing pkl files if they exist
model_path = './enterprise_retry_model.pkl'
enc_path = './enterprise_encoders.pkl'

if os.path.exists(model_path):
    os.remove(model_path)
if os.path.exists(enc_path):
    os.remove(enc_path)

joblib.dump(model, model_path)
joblib.dump((le_card, le_brand, le_currency), enc_path)
print("Enterprise-level retry model saved.")

# === Evaluation ===
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))
auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC Score: {auc:.4f}")

# === RUNTIME: Recommend top retry dates with confidence ===
def get_top_retry_dates_after(card_number, card_brand, currency_code, amount, after_date, days_ahead=30, top_k=4):
    model = joblib.load(model_path)
    le_card, le_brand, le_currency = joblib.load(enc_path)

    if card_number not in le_card.classes_:
        return ["Unknown card"]

    card_id = le_card.transform([card_number])[0]
    brand_id = le_brand.transform([card_brand])[0]
    currency_id = le_currency.transform([currency_code])[0]

    start_date = pd.to_datetime(after_date).date()
    rows = []
    for i in range(1, days_ahead + 1):
        date = start_date + timedelta(days=i)
        weekday = date.weekday()
        month = date.month
        is_weekend = int(weekday in [5, 6])
        is_start = int(date.day <= 3)
        is_end = int(date.day >= 28)

        row = {
            'card_number_enc': card_id,
            'card_brand_enc': brand_id,
            'currency_enc': currency_id,
            'transaction_amount': amount,
            'weekday': weekday,
            'month': month,
            'is_weekend': is_weekend,
            'is_month_start': is_start,
            'is_month_end': is_end,
            'txn_index': 500 + i,
            'rolling_success_7d': 0.5,
            'avg_success_rate': 0.5,
            'days_since_last_txn': i
        }
        rows.append((date, row))

    df_future = pd.DataFrame([r[1] for r in rows])
    probs = model.predict_proba(df_future)[:, 1]
    ranked = sorted(zip([r[0] for r in rows], probs), key=lambda x: -x[1])
    return [(d.strftime('%Y-%m-%d'), round(p * 100, 2)) for d, p in ranked[:top_k]]

# === Test the model with a real card from training data ===
sample = df[df['success_flag'] == 0].iloc[0]
sample_card = sample['card_number']
sample_brand = sample['card_brand']
sample_currency = sample['currency_code']
sample_amount = sample['transaction_amount']

print("\nTesting retry recommendation after today for sample card:")
print("Card:", sample_card, "Brand:", sample_brand, "Currency:", sample_currency, "Amount:", sample_amount)
best_dates = get_top_retry_dates_after(sample_card, sample_brand, sample_currency, sample_amount, datetime.today())
print("Recommended retry dates with confidence:")
for date, confidence in best_dates:
    print(f"{date} - Confidence: {confidence}%")

Enterprise-level retry model saved.

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.59      0.22      0.32      1998
           1       0.83      0.96      0.89      8002

    accuracy                           0.81     10000
   macro avg       0.71      0.59      0.61     10000
weighted avg       0.78      0.81      0.78     10000

ROC-AUC Score: 0.7879

Testing retry recommendation after today for sample card:
Card: card_0 Brand: VISA Currency: EUR Amount: 110.01
Recommended retry dates with confidence:
2025-03-24 - Confidence: 60.619998931884766%
2025-04-03 - Confidence: 57.15999984741211%
2025-04-21 - Confidence: 56.90999984741211%
2025-04-07 - Confidence: 56.77000045776367%
