# Baseline: Задача №1 — предсказание `loan_status` до выдачи

Цель: базовая модель для предсказания `Charged Off` vs `Fully Paid` **только по заявочным признакам** (без утечек).

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score,
    precision_score, recall_score, f1_score, confusion_matrix
)

# Widen the notebook's DataFrame rendering: show up to 200 columns and
# allow 140 characters per printed line.
pd.options.display.max_columns = 200
pd.options.display.width = 140


In [None]:
# Load the raw loan dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='financial_loan.csv')
print(df.shape)  # (n_rows, n_columns)
df.head(n=5)


## Подготовка данных и таргета

In [None]:
# Keep only loans that reached a final status; any other status is dropped.
final_statuses = ['Fully Paid', 'Charged Off']
is_final = df['loan_status'].isin(final_statuses)
work = df.loc[is_final].copy()

# Binary target: 1 = Charged Off, 0 = Fully Paid
work['target'] = work['loan_status'].eq('Charged Off').astype(int)

# Class counts, then the share of the positive (Charged Off) class
print(work['loan_status'].value_counts())
print(work['target'].mean())


## Признаки (заявочные) и исключения

In [None]:
# Application-time features (as listed in the README)
features = [
    'annual_income', 'dti', 'loan_amount', 'term', 'purpose', 'home_ownership',
    'verification_status', 'address_state', 'application_type', 'emp_length', 'emp_title'
]

# Columns excluded as leakage / after-the-fact information
leak_cols = [
    'loan_status', 'total_payment', 'last_payment_date', 'next_payment_date',
    'last_credit_pull_date', 'issue_date', 'id', 'member_id'
]

# Convert term to a number of months. expand=False makes str.extract return
# a Series (not a one-column DataFrame), so the column assignment is unambiguous.
work['term_months'] = work['term'].str.extract(r'(\d+)', expand=False).astype(float)
features = [c for c in features if c != 'term'] + ['term_months']

# Enforce the exclusion list: fail loudly if a leaky column ever ends up
# in the feature list instead of silently training on it.
leaked = sorted(set(features) & set(leak_cols))
if leaked:
    raise ValueError(f'Leaky columns present in features: {leaked}')

# Copies, so later edits to X / y do not touch `work`
X = work[features].copy()
y = work['target'].copy()

print('features:', features)
X.head()


## Обработка `emp_title` (высокая кардинальность)

In [None]:
# Collapse rare emp_title values into 'Other'. The set of kept titles is
# derived from the train split only, so the test split does not influence it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies: assigning a column on the frames returned by
# train_test_split can raise SettingWithCopyWarning in pandas versions
# without copy-on-write.
X_train = X_train.copy()
X_test = X_test.copy()

# Keep the N most frequent titles; missing values count as their own
# 'MISSING' category when ranking.
N = 25
vc = X_train['emp_title'].fillna('MISSING').value_counts()
keep = set(vc.head(N).index)

for part in (X_train, X_test):
    filled = part['emp_title'].fillna('MISSING')
    # Anything outside the kept set (including titles unseen in train) -> 'Other'
    part['emp_title'] = filled.where(filled.isin(keep), 'Other')

print('emp_title unique (train after grouping):', X_train['emp_title'].nunique())


## Модель (baseline)

In [None]:
# Split the features into categorical and numeric groups
cat_cols = [
    'purpose','home_ownership','verification_status','address_state',
    'application_type','emp_length','emp_title'
]
num_cols = ['annual_income','dti','loan_amount','term_months']

preprocess = ColumnTransformer(
    transformers=[
        # Numeric: median imputation, then standardization. The numeric
        # columns go into a regularized linear model, so they are put on a
        # common scale; otherwise the penalty depends on each column's units
        # and the solver converges more slowly.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        # Categorical: fill gaps with the most frequent value, then one-hot
        # encode; categories unseen during fit are encoded as all zeros.
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols)
    ]
)

# class_weight='balanced' reweights the classes inversely to their frequency
model = LogisticRegression(max_iter=1000, class_weight='balanced')

clf = Pipeline(steps=[('prep', preprocess), ('model', model)])

clf


## Обучение и оценка

In [None]:
# Fit the full pipeline (preprocessing + model) on the train split
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (Charged Off)
proba = clf.predict_proba(X_test)[:, 1]
# Hard labels at the default 0.5 cut-off
pred = (proba >= 0.5).astype(int)

# Ranking metrics are computed from the probabilities,
# label metrics from the thresholded predictions.
score_based = {
    'ROC AUC': roc_auc_score,
    'PR AUC': average_precision_score,
}
label_based = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
}

metrics = {name: scorer(y_test, proba) for name, scorer in score_based.items()}
metrics.update({name: scorer(y_test, pred) for name, scorer in label_based.items()})

metrics


In [None]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels (class order 0 = Fully Paid, 1 = Charged Off).
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm


## Быстрые выводы

- Это baseline без тонкой настройки порога и без сложного feature engineering.
- Следующий шаг — подбор порога классификации (например, по бизнес‑метрике) и сравнение с другими моделями.
- Можно добавить инженерные признаки: loan_to_income, installment_to_income (если допустимо), нормализация income, бининг.
