In [None]:

import pandas as pd
import numpy as np
# from pathlib import Path

# 数据不平衡处理：SMOTE过采样+随机欠采样
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# ML相关
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


# ---- Paths ----
pwd = '../data/data(processed)/'            # directory the train/test CSVs are read from
output = '../data/output/'                   # directory the submission CSV is written to
output_filename = 'baseline_submission.csv'

# ---- Submission ----
submission_column_names = ['id', 'prob']     # [identifier column, predicted-probability column]
isout = 0  # set to a truthy value to write the submission file to disk

# ---- Training options ----
balance_method = 'UCO'  # one of 'None', 'UCO' (under-sampling then SMOTE) or 'SMOTE'
target_list_name = 'target'
unused_features = ['id']
id_feature = 'id'
# Must match a key of the `models` dict exactly ('LightGBM', 'CatBoost', 'XGBoost');
# the previous value 'CatBOOST' matched none of them and raised KeyError on lookup.
use_model = 'CatBoost'



In [16]:

# Candidate gradient-boosting classifiers, keyed by the name that `use_model` selects.
# All three share the same learning rate (0.05), number of boosting rounds (600) and seed (42).
models = dict(
    LightGBM=LGBMClassifier(
        learning_rate=0.05,
        n_estimators=600,
        max_depth=-1,
        colsample_bytree=0.8,
        subsample=0.8,
        n_jobs=-1,
        random_state=42,
    ),
    CatBoost=CatBoostClassifier(
        learning_rate=0.05,
        iterations=600,
        depth=6,
        eval_metric='AUC',
        loss_function='Logloss',
        verbose=0,
        random_seed=42,
    ),
    XGBoost=XGBClassifier(
        learning_rate=0.05,
        n_estimators=600,
        max_depth=6,
        colsample_bytree=0.8,
        subsample=0.8,
        reg_lambda=1.0,
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1,
        random_state=42,
    ),
)

In [None]:
# Read the train and test splits from the configured data directory.
train_df = pd.read_csv(f'{pwd}train.csv')
test_df = pd.read_csv(f'{pwd}test.csv')

# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,id,amount,length,housing,income,purpose,overdue_times,default_times,total_default_number,last_overdue_months,...,mortage_number,account_number,loan_history,recent_loan_number,recent_account_months,credict_used_amount,credict_limit,half_used_credict_card,total_credict_card_number,last_credict_card_months
0,501,5000,1,1,1600,2,0,0,0,31,...,0,1,0,0,21,1378,25000,0,0,22
1,502,37000,9,3,10200,2,0,0,0,18,...,1,2,2,0,18,2812,47000,0,1,20
2,503,11000,7,3,0,8,0,0,0,68,...,3,4,4,1,9,3488,36500,0,3,41
3,504,9000,10,3,4553,2,1,0,0,4,...,1,3,1,0,21,1614,25500,1,2,80
4,505,12000,10,2,9500,2,1,0,0,3,...,0,3,2,0,21,1026,31500,0,3,3


In [None]:
# Inspect the class distribution of the training labels.
train_df.loc[:, target_list_name].value_counts()

target
0    490
1     10
Name: count, dtype: int64

In [None]:
# Separate the label column from the feature columns of the training set.
y_train = train_df[target_list_name]
X_train = train_df.drop(columns=[target_list_name] + unused_features)

# y_test = test_df[target_list_name]
# The test split is presumably unlabeled (it is what the submission predicts for), so the
# target column is dropped only if present; a strict drop raises KeyError when it is absent.
# `unused_features` are still dropped strictly so a missing id column is reported early.
X_test = (
    test_df
    .drop(columns=unused_features)
    .drop(columns=[target_list_name], errors='ignore')
)

y_train.value_counts()

In [None]:
# Feature engineering (placeholder: no new features are created yet)
##

In [None]:
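# Class-imbalance handling: random under-sampling combined with SMOTE over-sampling
# over_sampler = SMOTE(sampling_strategy=0.3, random_state=42)


# UCO (under-sampling + over-sampling) imbalance handling
# Step A: under-sample the majority class (random under-sampling here; other strategies can be substituted)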
def uco_resample(X_train, y_train, under_strategy='auto', over_strategy='auto',
                 k_neighbors=5, random_state=42):
    """Re-balance a training set by random under-sampling followed by SMOTE.

    Parameters
    ----------
    X_train, y_train
        Features and labels to resample; the inputs are not modified.
    under_strategy
        ``sampling_strategy`` passed to ``RandomUnderSampler``.
    over_strategy
        ``sampling_strategy`` passed to ``SMOTE``.
    k_neighbors : int
        Requested number of SMOTE neighbours; lowered automatically when the
        smallest class left after under-sampling has too few samples for it.
    random_state : int
        Seed used for both samplers.

    Returns
    -------
    (X_balanced, y_balanced)
        The resampled features and labels.

    Notes
    -----
    NOTE(review): per the imbalanced-learn documentation, the default
    ``under_strategy='auto'`` shrinks every non-minority class down to the
    minority count, so the classes are already equal when SMOTE runs and SMOTE
    generates no synthetic rows. To actually combine both steps, pass a partial
    ratio (e.g. ``under_strategy=0.1``) so SMOTE has a gap left to fill.
    """
    # Step A: randomly under-sample the majority class.
    under_sampler = RandomUnderSampler(sampling_strategy=under_strategy,
                                       random_state=random_state)
    X_under, y_under = under_sampler.fit_resample(X_train, y_train)

    # SMOTE looks up k neighbours inside a class, so k must stay below the size of the
    # smallest class; clamp it (never below 1) instead of failing on tiny minority classes.
    smallest_class = min(Counter(y_under).values())
    effective_k = max(1, min(k_neighbors, smallest_class - 1))

    # Step B: SMOTE over-sampling on the under-sampled data.
    smote = SMOTE(sampling_strategy=over_strategy, random_state=random_state,
                  k_neighbors=effective_k)
    X_balanced, y_balanced = smote.fit_resample(X_under, y_under)

    print("Balanced training distribution:", Counter(y_balanced))
    return X_balanced, y_balanced
# Apply the class-balancing strategy selected by `balance_method`.
# NOTE: this rebinds X_train / y_train, so re-running the cell resamples already-resampled data.
if balance_method == 'None':
    print("No balancing applied. Training distribution:", Counter(y_train))
elif balance_method == 'UCO':
    X_train, y_train = uco_resample(X_train, y_train)
elif balance_method == 'SMOTE':
    smote = SMOTE(random_state=42, k_neighbors=5)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print("Balanced training distribution:", Counter(y_train))
else:
    # A mistyped option used to fall through silently and train on unbalanced data.
    raise ValueError(
        f"Unknown balance_method {balance_method!r}; expected 'None', 'UCO' or 'SMOTE'."
    )

# # ---------- Optional: compute class weights for cost-sensitive training ----------
# classes = np.unique(y_train)
# class_weights = compute_class_weight(class_weight='balanced',
#                                      classes=classes,
#                                      y=y_train)
# # Convert to a dict so it can be passed to a model later
# class_weight_dict = dict(zip(classes, class_weights))
# print("Computed class weights:", class_weight_dict)



In [None]:

# Resolve the configured model name case-insensitively so a casing slip in `use_model`
# (e.g. 'CatBOOST' vs the 'CatBoost' key) does not end in a bare KeyError.
model_key = next((name for name in models if name.lower() == use_model.lower()), None)
if model_key is None:
    raise KeyError(f"use_model={use_model!r} is not one of {list(models)}")
model = models[model_key]

# The estimators in `models` are created unfitted, so train on the (optionally re-balanced)
# training data before asking for probabilities.
model.fit(X_train, y_train)

# Second column of predict_proba, used as the submitted probability.
y_test_pred = model.predict_proba(X_test)[:, 1]

output_df = pd.DataFrame({
    submission_column_names[0]: test_df[id_feature],
    submission_column_names[1]: y_test_pred
})

# Write the submission file only when exporting is enabled.
if isout:
    output_df.to_csv(output + output_filename, index=False)