# Baseline: Задача №2 — мониторинг и переобучение по батчам

Цель: модель обучается на ранних батчах, тестируется на поздних; считаем сдвиги и правило переобучения.

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 140)


In [2]:
df = pd.read_csv('financial_loan.csv')
for c in ['issue_date','last_credit_pull_date','last_payment_date','next_payment_date']:
    df[c] = pd.to_datetime(df[c], dayfirst=True, errors='coerce')

print(df.shape)
df.head()


(38576, 24)


Unnamed: 0,id,address_state,application_type,emp_length,emp_title,grade,home_ownership,issue_date,last_credit_pull_date,last_payment_date,loan_status,next_payment_date,member_id,purpose,sub_grade,term,verification_status,annual_income,dti,installment,int_rate,loan_amount,total_acc,total_payment
0,1077430,GA,INDIVIDUAL,< 1 year,Ryder,C,RENT,2021-02-11,2021-09-13,2021-04-13,Charged Off,2021-05-13,1314167,car,C4,60 months,Source Verified,30000.0,0.01,59.83,0.1527,2500,4,1009
1,1072053,CA,INDIVIDUAL,9 years,MKC Accounting,E,RENT,2021-01-01,2021-12-14,2021-01-15,Fully Paid,2021-02-15,1288686,car,E1,36 months,Source Verified,48000.0,0.0535,109.43,0.1864,3000,4,3939
2,1069243,CA,INDIVIDUAL,4 years,Chemat Technology Inc,C,RENT,2021-01-05,2021-12-12,2021-01-09,Charged Off,2021-02-09,1304116,car,C5,36 months,Not Verified,50000.0,0.2088,421.65,0.1596,12000,11,3522
3,1041756,TX,INDIVIDUAL,< 1 year,barnes distribution,B,MORTGAGE,2021-02-25,2021-12-12,2021-03-12,Fully Paid,2021-04-12,1272024,car,B2,60 months,Source Verified,42000.0,0.054,97.06,0.1065,4500,9,4911
4,1068350,IL,INDIVIDUAL,10+ years,J&J Steel Inc,A,MORTGAGE,2021-01-01,2021-12-14,2021-01-15,Fully Paid,2021-02-15,1302971,car,A1,36 months,Verified,83000.0,0.0231,106.53,0.0603,3500,28,3835


## Данные, таргет, признаки

In [3]:
work = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()
work['target'] = (work['loan_status'] == 'Charged Off').astype(int)

features = [
    'annual_income', 'dti', 'loan_amount', 'term', 'purpose', 'home_ownership',
    'verification_status', 'address_state', 'application_type', 'emp_length', 'emp_title'
]

work['term_months'] = work['term'].str.extract(r'(\d+)').astype(float)
features = [c for c in features if c != 'term'] + ['term_months']

X = work[features].copy()
y = work['target'].copy()
work['issue_month'] = work['issue_date'].dt.to_period('M')

print('months:', work['issue_month'].nunique())
print('target rate:', y.mean())


months: 12
target rate: 0.14229681413095682


## Батчи: ранние/поздние

In [4]:
months_sorted = work['issue_month'].dropna().sort_values().unique()
cut = int(len(months_sorted) * 0.75)
train_months = months_sorted[:cut]
test_months = months_sorted[cut:]

train_idx = work['issue_month'].isin(train_months)
test_idx = work['issue_month'].isin(test_months)

X_train = X[train_idx].copy()
y_train = y[train_idx].copy()
X_test = X[test_idx].copy()
y_test = y[test_idx].copy()

print('train months:', train_months.min(), '->', train_months.max())
print('test months:', test_months.min(), '->', test_months.max())
print('train size:', len(X_train), 'test size:', len(X_test))


train months: 2021-01 -> 2021-09
test months: 2021-10 -> 2021-12
train size: 25864 test size: 11614


## emp_title: группировка редких

In [5]:
N = 25
vc = X_train['emp_title'].fillna('MISSING').value_counts()
keep = set(vc.head(N).index)

for part in [X_train, X_test]:
    part['emp_title'] = part['emp_title'].fillna('MISSING')
    part['emp_title'] = part['emp_title'].where(part['emp_title'].isin(keep), 'Other')


## Модель

In [6]:
cat_cols = [
    'purpose','home_ownership','verification_status','address_state',
    'application_type','emp_length','emp_title'
]
num_cols = ['annual_income','dti','loan_amount','term_months']

preprocess = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols)
    ]
)

model = LogisticRegression(max_iter=1000, class_weight='balanced')
clf = Pipeline(steps=[('prep', preprocess), ('model', model)])

clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, proba))
print('PR AUC:', average_precision_score(y_test, proba))


ROC AUC: 0.6670766455420963
PR AUC: 0.26783897446365496


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Дрейф признаков (PSI)

In [7]:
def psi_numeric(expected, actual, bins=10):
    expected = expected.replace([np.inf, -np.inf], np.nan).dropna()
    actual = actual.replace([np.inf, -np.inf], np.nan).dropna()
    if expected.nunique() < 2:
        return 0.0
    quantiles = np.linspace(0, 1, bins + 1)
    breaks = expected.quantile(quantiles).values
    breaks = np.unique(breaks)
    if len(breaks) <= 2:
        return 0.0
    exp_counts = pd.cut(expected, bins=breaks, include_lowest=True).value_counts(normalize=True)
    act_counts = pd.cut(actual, bins=breaks, include_lowest=True).value_counts(normalize=True)
    exp_counts, act_counts = exp_counts.align(act_counts, fill_value=1e-6)
    return float(np.sum((act_counts - exp_counts) * np.log(act_counts / exp_counts)))


def psi_categorical(expected, actual):
    exp_counts = expected.fillna('MISSING').value_counts(normalize=True)
    act_counts = actual.fillna('MISSING').value_counts(normalize=True)
    exp_counts, act_counts = exp_counts.align(act_counts, fill_value=1e-6)
    return float(np.sum((act_counts - exp_counts) * np.log(act_counts / exp_counts)))

psi_rows = []
for c in num_cols:
    psi_rows.append((c, psi_numeric(X_train[c], X_test[c])))
for c in cat_cols:
    psi_rows.append((c, psi_categorical(X_train[c], X_test[c])))

psi_df = pd.DataFrame(psi_rows, columns=['feature','psi']).sort_values('psi', ascending=False)
psi_df


Unnamed: 0,feature,psi
4,purpose,0.025669
2,loan_amount,0.020009
7,address_state,0.010982
10,emp_title,0.006098
5,home_ownership,0.003902
9,emp_length,0.003073
6,verification_status,0.00209
1,dti,0.001298
0,annual_income,0.000849
3,term_months,0.0


## Правило переобучения (на текущем батче)

In [8]:
psi_threshold = 0.2
auc_threshold = 0.65

auc_test = roc_auc_score(y_test, proba)
retrain_needed = (psi_df['psi'].max() > psi_threshold) or (auc_test < auc_threshold)

print('max PSI:', psi_df['psi'].max())
print('AUC test:', auc_test)
print('retrain_needed:', retrain_needed)


max PSI: 0.025668864893368767
AUC test: 0.6670766455420963
retrain_needed: False
