### Задача B — антифрод через proxy-label / anomaly detection
**Цель:** выявлять *подозрительные* кредиты/заявки, похожие на схему “early default”.

**Вариант B1 (proxy-label):**
- Формируем прокси-метку мошенничества:
  - `fraud_proxy = 1`, если `loan_status=Charged Off` и `payment_ratio < τ` (например, τ=0.1),
  - иначе `fraud_proxy = 0`.
- Обучаем модель, которая по **заявочным** признакам (без leakage) предсказывает вероятность `fraud_proxy`.

In [24]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, 
    precision_recall_curve, f1_score, recall_score,
    precision_score, confusion_matrix, classification_report,
    PrecisionRecallDisplay, RocCurveDisplay
)
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Для разреженных матриц
from scipy.sparse import hstack, csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
df = pd.read_csv('financial_loan.csv')

print(f"Всего записей: {len(df)}")
print(f"\nРаспределение loan_status:")
print(df['loan_status'].value_counts())


Всего записей: 38576

Распределение loan_status:
loan_status
Fully Paid     32145
Charged Off     5333
Current         1098
Name: count, dtype: int64


Target

In [26]:

dfB1 = df[df["loan_status"].isin(["Fully Paid", "Charged Off"])].copy()
dfB1["payment_ratio"] = dfB1["total_payment"] / dfB1["loan_amount"]
dfB1["y"] = ((dfB1["loan_status"] == "Charged Off") & (dfB1["payment_ratio"] < 0.1)).astype(int)
fraud_count = dfB1["y"].sum()
total_charged_off = (dfB1['loan_status'] == 'Charged Off').sum()

print(f"\n{'='*50}")
print("СТАТИСТИКА ПО МЕТКАМ:")
print(f"Charged Off: {total_charged_off} ({total_charged_off/len(dfB1)*100:.1f}%)")
print(f"Fraud proxy (Charged Off + payment_ratio < 0.1): {fraud_count} ({fraud_count/len(dfB1)*100:.2f}%)")
print(f"Из Charged Off как fraud: {fraud_count/total_charged_off*100:.1f}%")
print(f"{'='*50}")


СТАТИСТИКА ПО МЕТКАМ:
Charged Off: 5333 (14.2%)
Fraud proxy (Charged Off + payment_ratio < 0.1): 206 (0.55%)
Из Charged Off как fraud: 3.9%


In [27]:

leak_cols_drop = [
    "total_payment", "last_payment_date", "next_payment_date",
    "last_credit_pull_date", "id", "member_id", "application_type",
    "loan_status", "payment_ratio"  
]
dfB1 = dfB1.drop(columns=leak_cols_drop, errors='ignore')


dfB1["issue_date"] = pd.to_datetime(dfB1["issue_date"], dayfirst=True)


dates_to_drop = [
    '2021-01-01', '2021-01-05', '2021-02-25', '2021-07-17',
    '2021-11-19', '2021-09-02', '2021-07-22', '2021-12-02', '2021-12-12', '2021-02-02'
]
dates_to_drop = pd.to_datetime(dates_to_drop)
dfB1 = dfB1[~dfB1["issue_date"].isin(dates_to_drop)]


dfB1["issue_month"] = dfB1["issue_date"].dt.month.astype("int16")
dfB1 = dfB1.drop(columns=["issue_date"], axis=1)


In [28]:
dfB1["term_months"] = dfB1["term"].astype(str).str.extract(r"(\d+)").astype("int16")
dfB1 = dfB1.drop(columns=["term"], errors='ignore')


p99_income = dfB1["annual_income"].quantile(0.99)
dfB1["annual_income_cap"] = dfB1["annual_income"].clip(upper=p99_income)
dfB1["log_income"] = np.log10(dfB1["annual_income_cap"].replace(0, 1))
dfB1 = dfB1.drop(columns=["annual_income", "annual_income_cap"], errors='ignore')

большие значения у среднего кридитного рейтинга

In [29]:
grade_map = {g: i+1 for i, g in enumerate(list("ABCDEFG"))}
dfB1["sub_grade_num"] = dfB1["sub_grade"].astype(str).apply(
    lambda s: 5*(grade_map.get(s[0], np.nan)-1) + int(s[1]) if len(s) >= 2 and s[0] in grade_map else np.nan
)

CENTER = 13     
MAX_SCORE = 20  

dfB1["sub_grade_bell"] = (
    MAX_SCORE - (dfB1["sub_grade_num"] - CENTER).abs()
)


dfB1["sub_grade_bell"] = dfB1["sub_grade_bell"].clip(lower=0)
dfB1["sub_grade_num"] = dfB1["sub_grade_bell"].copy()
dfB1 = dfB1.drop(columns=["grade", "sub_grade", "sub_grade_bell"], errors='ignore')

In [30]:

def annuity_payment(L, annual_rate, n_months):
    r = annual_rate / 12.0
    if r == 0:
        return L / n_months
    return L * (r * (1 + r)**n_months) / ((1 + r)**n_months - 1)

dfB1["installment_expected"] = dfB1.apply(
    lambda row: annuity_payment(row["loan_amount"], row["int_rate"], row["term_months"]),
    axis=1
)
dfB1["installment_rel_err"] = (dfB1["installment"] - dfB1["installment_expected"]) / dfB1["installment_expected"].replace(0, np.nan)
dfB1 = dfB1.drop(columns=["installment_expected", "installment"], errors='ignore')

In [31]:
dfB1["loan_to_income"] = dfB1["loan_amount"] / df["annual_income"].replace(0, np.nan)
dfB1 = dfB1.drop(columns=["loan_amount"], errors='ignore')

In [32]:
def clean_title(x):
    if pd.isna(x):
        return "unknown"
    x = str(x).lower()
    x = re.sub(r"[^a-z\s]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x if x else "unknown"

dfB1["emp_title_clean"] = dfB1["emp_title"].apply(clean_title)
dfB1["emp_title_is_unknown"] = (dfB1["emp_title_clean"] == "unknown").astype(int)
dfB1["emp_title_len"] = dfB1["emp_title_clean"].str.len()
dfB1 = dfB1.drop(columns=["emp_title"], errors='ignore')

In [33]:
def parse_emp_length(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip().lower()
    if s in ["n/a", "na", "none", "null", "", "unknown"]:
        return np.nan
    if "<" in s:
        return 0.5
    if "10" in s:
        return 10
    m = re.search(r"(\d+)", s)
    return float(m.group(1)) if m else np.nan

dfB1["emp_length_years"] = dfB1["emp_length"].apply(parse_emp_length)
dfB1 = dfB1.drop(columns=["emp_length"], errors='ignore')

# Удаляем int_rate (использовали для расчёта)
dfB1 = dfB1.drop(columns=["int_rate"], errors='ignore')

print(f"\nРазмерность после feature engineering: {dfB1.shape}")
print(f"Колонки: {list(dfB1.columns)}")


Размерность после feature engineering: (37467, 17)
Колонки: ['address_state', 'home_ownership', 'purpose', 'verification_status', 'dti', 'total_acc', 'y', 'issue_month', 'term_months', 'log_income', 'sub_grade_num', 'installment_rel_err', 'loan_to_income', 'emp_title_clean', 'emp_title_is_unknown', 'emp_title_len', 'emp_length_years']


In [34]:
train_months = [1, 2, 3, 4, 5, 6, 7, 8]
val_months = [9, 10]
test_months = [11, 12]

train_df = dfB1[dfB1["issue_month"].isin(train_months)].copy()
val_df = dfB1[dfB1["issue_month"].isin(val_months)].copy()
test_df = dfB1[dfB1["issue_month"].isin(test_months)].copy()

print(f"\nРазмеры выборок:")
print(f"Train: {train_df.shape}, Fraud rate: {train_df['y'].mean()*100:.2f}%")
print(f"Val: {val_df.shape}, Fraud rate: {val_df['y'].mean()*100:.2f}%")
print(f"Test: {test_df.shape}, Fraud rate: {test_df['y'].mean()*100:.2f}%")


Размеры выборок:
Train: (22469, 17), Fraud rate: 0.58%
Val: (7018, 17), Fraud rate: 0.58%
Test: (7980, 17), Fraud rate: 0.43%


In [35]:

ver_map = {"Not Verified": 0, "Source Verified": 2, "Verified": 1}
for d in [train_df, val_df, test_df]:
    d["verif_ord"] = d["verification_status"].map(ver_map).fillna(-1).astype(int)
    d.drop("verification_status", axis=1, inplace=True, errors='ignore')


In [36]:

def map_home(x):
    x = str(x)
    if x in ["RENT"]: return 2
    if x in ["MORTGAGE"]: return 3
    if x in ["OWN"]: return 1
    return 0

for d in [train_df, val_df, test_df]:
    d["home_grp"] = d["home_ownership"].apply(map_home)
    d.drop("home_ownership", axis=1, inplace=True, errors='ignore')

In [37]:
topK = 8
top_p = train_df["purpose"].value_counts().head(topK).index

for d in [train_df, val_df, test_df]:
    d["purpose_grp"] = d["purpose"].where(d["purpose"].isin(top_p), "OTHER")
    
grp2id = {g: i for i, g in enumerate(list(top_p) + ["OTHER"], start=1)}

for d in [train_df, val_df, test_df]:
    d["purpose_grp_id"] = d["purpose_grp"].map(grp2id).fillna(grp2id["OTHER"])
    d.drop(["purpose", "purpose_grp"], axis=1, inplace=True, errors='ignore')

In [38]:
NORTHEAST = set(["CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"])
MIDWEST = set(["IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD"])
SOUTH = set(["DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", "AL", "KY", "MS", "TN", "AR", "LA", "OK", "TX"])
WEST = set(["AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", "AK", "CA", "HI", "OR", "WA"])

def state_region(s):
    s = str(s)
    if s in NORTHEAST: return "NE"
    if s in MIDWEST: return "MW"
    if s in SOUTH: return "S"
    if s in WEST: return "W"
    return "UNK"

for d in [train_df, val_df, test_df]:
    d["state_region"] = d["address_state"].apply(state_region)
    
reg_map = {"NE": 0, "MW": 1, "S": 2, "W": 3, "UNK": 4}
for d in [train_df, val_df, test_df]:
    d["state_region_ord"] = d["state_region"].map(reg_map).fillna(4).astype(int)
    d.drop(columns=["state_region", "address_state"], axis=1, inplace=True, errors='ignore')


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    min_df=20,
    max_features=20000,
    ngram_range=(1,2)
)

X_title_train = tfidf.fit_transform(train_df["emp_title_clean"])
X_title_val   = tfidf.transform(val_df["emp_title_clean"])
X_title_test  = tfidf.transform(test_df["emp_title_clean"])

train_df = train_df.drop(columns=["emp_title_clean"], axis = 1)
val_df   = val_df.drop(columns=["emp_title_clean"], axis = 1)
test_df  = test_df.drop(columns=["emp_title_clean"], axis = 1)


In [40]:
dfB1.columns

Index(['address_state', 'home_ownership', 'purpose', 'verification_status',
       'dti', 'total_acc', 'y', 'issue_month', 'term_months', 'log_income',
       'sub_grade_num', 'installment_rel_err', 'loan_to_income',
       'emp_title_clean', 'emp_title_is_unknown', 'emp_title_len',
       'emp_length_years'],
      dtype='object')

In [41]:
print(train_df.dtypes[train_df.dtypes == "object"])

Series([], dtype: object)


In [42]:
y_train = train_df["y"].values
y_val   = val_df["y"].values
y_test  = test_df["y"].values


X_train = csr_matrix(train_df.drop(columns=["y"]).values)
X_val   = csr_matrix(val_df.drop(columns=["y"]).values)
X_test  = csr_matrix(test_df.drop(columns=["y"]).values)

In [43]:
y_train = train_df["y"].values
y_val = val_df["y"].values
y_test = test_df["y"].values


train_df = train_df.drop(columns=["y"])
val_df = val_df.drop(columns=["y"])
test_df = test_df.drop(columns=["y"])


for col in train_df.columns:
    if train_df[col].dtype in ['float64', 'int64', 'int16']:
        median_val = train_df[col].median()
        train_df[col] = train_df[col].fillna(median_val)
        val_df[col] = val_df[col].fillna(median_val)
        test_df[col] = test_df[col].fillna(median_val)


X_train_num = csr_matrix(train_df.values)
X_val_num = csr_matrix(val_df.values)
X_test_num = csr_matrix(test_df.values)

X_train = hstack([X_train_num, X_title_train]).tocsr()
X_val = hstack([X_val_num, X_title_val]).tocsr()
X_test = hstack([X_test_num, X_title_test]).tocsr()

print(f"\nФинальные размерности:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, fraud: {y_train.sum()}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}, fraud: {y_val.sum()}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}, fraud: {y_test.sum()}")


Финальные размерности:
X_train: (22469, 490), y_train: (22469,), fraud: 131
X_val: (7018, 490), y_val: (7018,), fraud: 41
X_test: (7980, 490), y_test: (7980,), fraud: 34


In [87]:
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, 
    precision_recall_curve, confusion_matrix, precision_score, recall_score
)

def evaluate_model(model, X, y, dataset_name="Test", threshold=0.01):
    """Оценка модели с выводом метрик и интерпретацией Precision/Recall"""
    
    # Предсказанные вероятности и классы
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X)[:, 1]
    else:  # для моделей типа SVM
        y_pred_proba = model.decision_function(X)
    
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    # Основные метрики
    pr_auc = average_precision_score(y, y_pred_proba)
    roc_auc = roc_auc_score(y, y_pred_proba)
    f1 = f1_score(y, y_pred, zero_division=0)
    
    # Precision и Recall при текущем threshold
    precision = precision_score(y, y_pred, zero_division=0)
    recall = recall_score(y, y_pred, zero_division=0)

    
    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    
    # Вывод
    print(f"\n{'='*60}")
    print(f"МЕТРИКИ: {dataset_name}")
    print(f"{'='*60}")
    print(f"PR-AUC (Average Precision): {pr_auc:.4f} ⭐ КЛЮЧЕВАЯ")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {precision}  → доля предсказанных fraud, которые реально fraud")
    print(f"Recall: {recall}  → доля реальных fraud, которые модель поймала")
    print(f"\nConfusion Matrix:")
    print(f"                 Pred 0   Pred 1")
    print(f"Actual 0:      {cm[0,0]:6d}   {cm[0,1]:6d}  (Specificity: {cm[0,0]/(cm[0,0]+cm[0,1]):.3f})")
    print(f"Actual 1:      {cm[1,0]:6d}   {cm[1,1]:6d}  (Recall: {cm[1,1]/(cm[1,0]+cm[1,1]) if (cm[1,0]+cm[1,1])>0 else 0:.3f})")
    
    return {
        'pr_auc': pr_auc,
        'roc_auc': roc_auc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'y_pred_proba': y_pred_proba,
        'y_pred': y_pred,
        'cm': cm
    }

In [88]:
# Словарь для хранения результатов
all_results = {}

# БЕЗ БАЛАНСИРОВКИ (baseline)

In [89]:
# 10.1.1 Логистическая регрессия
print("\n--- Logistic Regression (no balancing) ---")
lr_baseline = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
lr_baseline.fit(X_train, y_train)
res_lr_base = evaluate_model(lr_baseline, X_test, y_test, "LR Baseline")
res_lr_base['y_true'] = y_test
all_results['LR_Baseline'] = res_lr_base


--- Logistic Regression (no balancing) ---

МЕТРИКИ: LR Baseline
PR-AUC (Average Precision): 0.0121 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.7009
F1-score: 0.0218
Precision: 0.011138613861386138  → доля предсказанных fraud, которые реально fraud
Recall: 0.5294117647058824  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:        6348     1598  (Specificity: 0.799)
Actual 1:          16       18  (Recall: 0.529)


In [90]:
# 10.1.2 Decision Tree
print("\n--- Decision Tree (no balancing) ---")
dt_baseline = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_leaf=50)
dt_baseline.fit(X_train, y_train)
res_dt_base = evaluate_model(dt_baseline, X_test, y_test, "DT Baseline")
res_dt_base['y_true'] = y_test
all_results['DT_Baseline'] = res_dt_base


--- Decision Tree (no balancing) ---

МЕТРИКИ: DT Baseline
PR-AUC (Average Precision): 0.0054 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.4948
F1-score: 0.0109
Precision: 0.005623242736644799  → доля предсказанных fraud, которые реально fraud
Recall: 0.17647058823529413  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:        6885     1061  (Specificity: 0.866)
Actual 1:          28        6  (Recall: 0.176)


In [91]:

# 10.1.3 HistGradientBoosting
print("\n--- HistGradientBoosting (no balancing) ---")
hgb_baseline = HistGradientBoostingClassifier(random_state=42, max_iter=100, early_stopping=True, 
                                               validation_fraction=0.1, n_iter_no_change=10)
hgb_baseline.fit(X_train.toarray(), y_train)
res_hgb_base = evaluate_model(hgb_baseline, X_test.toarray(), y_test, "HGB Baseline")
res_hgb_base['y_true'] = y_test
all_results['HGB_Baseline'] = res_hgb_base



--- HistGradientBoosting (no balancing) ---

МЕТРИКИ: HGB Baseline
PR-AUC (Average Precision): 0.0062 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.6222
F1-score: 0.0145
Precision: 0.007598784194528876  → доля предсказанных fraud, которые реально fraud
Recall: 0.14705882352941177  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:        7293      653  (Specificity: 0.918)
Actual 1:          29        5  (Recall: 0.147)


балансировка + oversampling

In [92]:
ros = RandomOverSampler(sampling_strategy=0.03, random_state=42)  # fraud = 3% от датасета
X_train_ros, y_train_ros = ros.fit_resample(X_train.toarray(), y_train)

print(f"Размер после ROS: {X_train_ros.shape}, распределение: {np.bincount(y_train_ros)}")


Размер после ROS: (23008, 490), распределение: [22338   670]


In [93]:
print("\n--- Logistic Regression (балансировка + oversampling) ---")

lr_balanced = LogisticRegression(
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
    class_weight={0: 1, 1: 20}
)

# ✅ обучение — oversampled train
lr_balanced.fit(X_train_ros, y_train_ros)


# ✅ оценка — ЧИСТЫЙ TEST
res_lr_bal = evaluate_model(
    lr_balanced,
    X_test,
    y_test,
    "LR Balanced + Oversampling (TEST)"
)

all_results['LR_Balanced + Oversampling'] = res_lr_bal


--- Logistic Regression (балансировка + oversampling) ---

МЕТРИКИ: LR Balanced + Oversampling (TEST)
PR-AUC (Average Precision): 0.0064 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5661
F1-score: 0.0089
Precision: 0.0044601862783680965  → доля предсказанных fraud, которые реально fraud
Recall: 1.0  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:         357     7589  (Specificity: 0.045)
Actual 1:           0       34  (Recall: 1.000)


In [94]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

pos_weights = [5, 10, 15, 20, 30]
sampling_strategies = [0.05, 0.1, 0.2, 0.3]

for ratio in sampling_strategies:
    print(f"\n{'='*70}")
    print(f"OVERSAMPLING ratio = {ratio}")
    print(f"{'='*70}")

    ros = RandomOverSampler(
        random_state=42,
        sampling_strategy=ratio
    )

    X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

    print(f"Train size after ROS: {X_train_ros.shape}")
    print(f"Class distribution: {np.bincount(y_train_ros)}")

    for w in pos_weights:
        model_name = f"LR_ros{ratio}_w{w}"

        print(f"\n--- {model_name} ---")

        lr = LogisticRegression(
            max_iter=1000,
            random_state=42,
            n_jobs=-1,
            class_weight={0: 1, 1: w}
        )

        lr.fit(X_train_ros, y_train_ros)

        res = evaluate_model(
            lr,
            X_test,        # ❗ test, не train
            y_test,
            dataset_name=model_name
        )

        res["oversampling_ratio"] = ratio
        res["pos_weight"] = w

        all_results[model_name] = res


OVERSAMPLING ratio = 0.05
Train size after ROS: (23454, 490)
Class distribution: [22338  1116]

--- LR_ros0.05_w5 ---

МЕТРИКИ: LR_ros0.05_w5
PR-AUC (Average Precision): 0.0077 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5971
F1-score: 0.0088
Precision: 0.004442702208284333  → доля предсказанных fraud, которые реально fraud
Recall: 1.0  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:         327     7619  (Specificity: 0.041)
Actual 1:           0       34  (Recall: 1.000)

--- LR_ros0.05_w10 ---

МЕТРИКИ: LR_ros0.05_w10
PR-AUC (Average Precision): 0.0068 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5728
F1-score: 0.0088
Precision: 0.004435746901500326  → доля предсказанных fraud, которые реально fraud
Recall: 1.0  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:         315     7631  (Specificity: 0.040)
Actual 1:           0       34  (Recall: 1.000)

--- LR_ros0.05_w15 ---

МЕТРИКИ: LR_ros0.05_w15
PR-AUC (Ave

In [95]:
print("\n--- Decision Tree (балансировка + oversampling) ---")
weights = {0:1, 1:15}
dt_balanced = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_leaf=50,
    class_weight=weights
)
dt_balanced.fit(X_train_ros, y_train_ros)
res_dt_bal = evaluate_model(dt_balanced, X_test, y_test, "DT Balanced")
res_dt_bal['y_true'] = y_test
all_results['DT_Balanced + Oversampling'] = res_dt_bal


--- Decision Tree (балансировка + oversampling) ---

МЕТРИКИ: DT Balanced
PR-AUC (Average Precision): 0.0061 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5586
F1-score: 0.0105
Precision: 0.005323505323505323  → доля предсказанных fraud, которые реально fraud
Recall: 0.38235294117647056  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:        5517     2429  (Specificity: 0.694)
Actual 1:          21       13  (Recall: 0.382)


HistGradientBoosting class_weight

In [96]:
print("\n--- HistGradientBoosting (балансировка) ---")
hgb_balanced = HistGradientBoostingClassifier(
    max_iter=200,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42,
    class_weight={0:1, 1:50}  
)
hgb_balanced.fit(X_train.toarray(), y_train)
res_hgb_balanced = evaluate_model(hgb_balanced, X_test.toarray(), y_test, "HGB + ROS")
res_hgb_balanced['y_true'] = y_test
all_results['HGB_balanced'] = res_hgb_balanced


--- HistGradientBoosting (балансировка) ---

МЕТРИКИ: HGB + ROS
PR-AUC (Average Precision): 0.0078 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.6538
F1-score: 0.0085
Precision: 0.004260651629072682  → доля предсказанных fraud, которые реально fraud
Recall: 1.0  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:           0     7946  (Specificity: 0.000)
Actual 1:           0       34  (Recall: 1.000)


In [97]:
# 10.3.3 HistGradientBoosting
print("\n--- HistGradientBoosting (Random Oversampling) ---")
hgb_ros = HistGradientBoostingClassifier(random_state=42, max_iter=100)
hgb_ros.fit(X_train_ros.toarray(), y_train_ros)
res_hgb_ros = evaluate_model(hgb_ros, X_test.toarray(), y_test, "HGB + ROS")
res_hgb_ros['y_true'] = y_test
all_results['HGB_ROS'] = res_hgb_ros


--- HistGradientBoosting (Random Oversampling) ---

МЕТРИКИ: HGB + ROS
PR-AUC (Average Precision): 0.0058 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5497
F1-score: 0.0098
Precision: 0.0049504950495049506  → доля предсказанных fraud, которые реально fraud
Recall: 0.47058823529411764  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:        4730     3216  (Specificity: 0.595)
Actual 1:          18       16  (Recall: 0.471)


 SMOTE

In [98]:
smote = SMOTE(sampling_strategy=0.03, random_state=42, k_neighbors=3)
X_train_smote, y_train_smote = smote.fit_resample(X_train.toarray(), y_train)

In [99]:
lr_balanced_smote = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, class_weight={0:1, 1:15})
lr_balanced_smote.fit(X_train_smote, y_train_smote)
res_lr_bal_smote = evaluate_model(lr_balanced_smote, X_test, y_test, "LR Balanced + SMOTE")
res_lr_bal_smote['y_true'] = y_test
all_results['LR_Balanced + SMOTE'] = res_lr_bal_smote


МЕТРИКИ: LR Balanced + SMOTE
PR-AUC (Average Precision): 0.0073 ⭐ КЛЮЧЕВАЯ
ROC-AUC: 0.5842
F1-score: 0.0088
Precision: 0.004418453541260559  → доля предсказанных fraud, которые реально fraud
Recall: 1.0  → доля реальных fraud, которые модель поймала

Confusion Matrix:
                 Pred 0   Pred 1
Actual 0:         285     7661  (Specificity: 0.036)
Actual 1:           0       34  (Recall: 1.000)


In [100]:
results_df = pd.DataFrame([
    {
        'Model': name,
        'PR-AUC': res['pr_auc'],
        'ROC-AUC': res['roc_auc'],
        'F1': res['f1'],
        'Recall': res['recall'],
        'Precision:': res['precision']
        
    }
    for name, res in all_results.items()
        if name != 'LR_VAL'
])

results_df = results_df.sort_values('PR-AUC', ascending=False)
print("\n" + results_df.to_string(index=False))


                     Model   PR-AUC  ROC-AUC       F1   Recall  Precision:
               LR_Baseline 0.012066 0.700937 0.021818 0.529412    0.011139
              LR_ros0.1_w5 0.007939 0.586614 0.008703 1.000000    0.004371
              HGB_balanced 0.007826 0.653751 0.008485 1.000000    0.004261
             LR_ros0.05_w5 0.007693 0.597100 0.008846 1.000000    0.004443
       LR_Balanced + SMOTE 0.007317 0.584234 0.008798 1.000000    0.004418
             LR_ros0.2_w30 0.007281 0.581795 0.008647 1.000000    0.004342
             LR_ros0.2_w10 0.007174 0.580281 0.008645 1.000000    0.004341
              LR_ros0.2_w5 0.006935 0.564550 0.009033 1.000000    0.004537
             LR_ros0.3_w10 0.006934 0.570568 0.008787 1.000000    0.004413
              LR_ros0.3_w5 0.006910 0.570813 0.008852 1.000000    0.004446
            LR_ros0.05_w15 0.006900 0.566230 0.008799 1.000000    0.004419
             LR_ros0.1_w15 0.006890 0.574344 0.008833 1.000000    0.004436
             LR_ros0.3_w

In [None]:
ros = RandomOverSampler(sampling_strategy=0.03, random_state=42)  # fraud = 3% от датасета
X_train_ros, y_train_ros = ros.fit_resample(X_train.toarray(), y_train)

print("\n--- Logistic Regression (балансировка + oversampling) ---")
lr_balanced = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, class_weight={0:1, 1:100})
lr_balanced.fit(X_train_ros, y_train_ros)
res_lr_bal = evaluate_model(lr_balanced, X_train_ros, y_train_ros, "LR Balanced + Oversampling")
res_lr_bal['y_true'] = y_test
all_results['LR_Balanced + Oversampling'] = res_lr_bal


In [103]:
y_val_proba = lr_baseline.predict_proba(X_val)[:, 1]

In [104]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)

In [105]:
import numpy as np

f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

print("Best threshold:", best_threshold)
print("Precision:", precision[best_idx])
print("Recall:", recall[best_idx])
print("F1:", f1_scores[best_idx])

Best threshold: 0.013674828836965747
Precision: 0.016270337922403004
Recall: 0.3170731707317073
F1: 0.030952380023837895


In [106]:
target_precision = 0.2

valid_idxs = np.where(precision[:-1] >= target_precision)[0]

if len(valid_idxs) > 0:
    idx = valid_idxs[np.argmax(recall[valid_idxs])]
    print("Threshold:", thresholds[idx])
    print("Precision:", precision[idx])
    print("Recall:", recall[idx])
else:
    print("Нет threshold с таким precision")

Нет threshold с таким precision
