# 基于stacking融合模型的老年人健康状况预测—代码部分

# 1.数据预处理

In [None]:
#导入所需包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
import seaborn as sns
import gc
import re as re
from collections import Counter

from tqdm.auto import tqdm
import math
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.preprocessing import LabelEncoder,StandardScaler

import warnings
warnings.filterwarnings('ignore')

from glob import glob
from pathlib import Path
import joblib
import pickle
import os
import random

import time
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
%matplotlib inline
import optuna
from optuna.samplers import TPESampler
import pickle

In [None]:
# Load the data: training set, test set, supplemental metadata and the
# sample submission, all read from the working directory.
train, test, greeks, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'greeks.csv', 'sample_submission.csv')
)

### 1.1数据可视化

In [None]:
# Number of distinct values per column of the training set, drawn as a bar
# chart sorted from most to fewest distinct values.
feature_names = [col for col in train.columns if col != "null_count"]
data = {
    "Feature": feature_names,
    "Counts": [train[col].nunique() for col in feature_names],
}

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))

sns.set_style('darkgrid')
n_shades = len(train.columns)
cmap = sns.color_palette(sns.light_palette("#6ecdff", n_colors=n_shades, reverse=True))
sns.set_palette(cmap)

counts = pd.DataFrame(data)
bar_order = counts.sort_values('Counts', ascending=False).Feature
_ = sns.barplot(x=counts.Feature, y=counts.Counts, ax=ax, order=bar_order)

# Print the count on top of every bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.text(
        x=bar.get_x() + (bar.get_width() / 2),
        y=bar_top,
        s="{:,d}".format(round(bar_top)),
        ha="center",
    )

ax.set_title("Unique Values by Feature", fontsize=15)
ax.set_ylabel("Number of Unique Values", fontsize=15)
ax.set_xlabel("Feature", fontsize=15)

# Column names are drawn vertically so they do not overlap.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

In [None]:
# Distribution plots (violin plots), one panel per numeric feature.
features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
    'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
    'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
    'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL'
]

# 11 x 5 grid, flattened so panels can be addressed in reading order.
fig, axs = plt.subplots(nrows=11, ncols=5, figsize=(20, 30))
axs = axs.flatten()

sns.set_style('darkgrid')

# Pair every feature with the next free panel.
for ax, feature in zip(axs, features):
    sns.violinplot(y=train[feature], ax=ax)
    ax.set_title("{}".format(feature))
    ax.set_ylabel("")
    ax.set_xlabel("")

In [None]:
# Class balance of the supplemental "Alpha" label: bar chart on the left,
# pie chart on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))

sns.set_style('darkgrid')

cmap = sns.color_palette(sns.light_palette("#6495ED", n_colors=8, reverse=True))
sns.set_palette(cmap)

# Keep the counts as a Series and read them through .index / .values.
# Wrapping them in a DataFrame and reading the column by name is fragile:
# the column is called "Alpha" on pandas 1.x but "count" on pandas >= 2.0.
counts = greeks["Alpha"].value_counts()
_ = sns.barplot(x=counts.index, y=counts.values, ax=axs[0])
for p in axs[0].patches:
    axs[0].text(x=p.get_x()+(p.get_width()/2), y=p.get_height(), s="{:,d}".format(round(p.get_height())), ha="center")
_ = axs[0].set_title("Class Balance (Supplemental)", fontsize=15)
_ = axs[0].set_ylabel("Number of Records", fontsize=15)
_ = axs[0].set_xlabel("Age-Related Condition", fontsize=15)

# Same information as a pie chart; each wedge shows "count = percentage".
targets = greeks["Alpha"].unique()
data = [greeks[(greeks["Alpha"] == target)]["Id"].count() for target in targets]
_ = axs[1].pie(
    data, labels=targets,
    # autopct receives the percentage; convert it back to an absolute count.
    autopct=lambda x: "{:,.0f} = {:.2f}%".format(x * sum(data)/100, x),
    explode=[0.20] * len(data),
    colors=cmap,
)
_ = axs[1].set_title("Class Balance (Supplemental)", fontsize=15)

In [None]:
# Spearman correlation heat map of the training set (features plus target).
features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
    'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
    'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
    'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'
]

# 'EJ' is only mapped from 'A'/'B' to 0/1 in a later cell, so it may still be
# a string column here. Select the numeric columns explicitly: pandas 1.x
# dropped non-numeric columns silently, pandas >= 2.0 raises instead.
correlation_matrix = (
    train[features].select_dtypes(include="number").corr(method="spearman")
)

from matplotlib.colors import SymLogNorm

f, ax = plt.subplots(figsize=(20, 20))
_ = sns.heatmap(
    correlation_matrix,
    # Hide the upper triangle (and the diagonal): the matrix is symmetric.
    mask=np.triu(np.ones_like(correlation_matrix, dtype=bool)),
    cmap=sns.diverging_palette(220, 20, l=60, sep=10, as_cmap=True),
    center=0,
    square=True,
    linewidths=.1,
    cbar=False,
    ax=ax,
    annot=False,
)
_ = ax.set_title("Spearman Correlation Matrix", fontsize=15)

In [None]:
# Missing values in the data set: for the rows that contain exactly 1, 2 or 4
# nulls, count how many of those nulls fall into each column.
null_count_numbers = [1, 2, 4]

# Nulls per row, computed locally. No cell above creates a "null_count"
# column on `train`, so indexing train["null_count"] raises KeyError; a local
# Series also keeps the helper out of the feature matrix built later.
nulls_per_row = train.isnull().sum(axis=1)
# Columns to report; a helper "null_count" column is left out if one exists.
reported_columns = train.drop(columns=["null_count"], errors="ignore")

null_count_labels = []
null_count_values = []
for wanted_nulls in null_count_numbers:
    nulls_per_column = reported_columns[nulls_per_row == wanted_nulls].isnull().sum()
    null_count_labels.append(nulls_per_column.index)
    null_count_values.append(nulls_per_column.values)

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 15))
fig.suptitle("Null Value Breakdown", fontsize=20)

axs = axs.flatten()
axis_counter = 0

for null_labels, null_values, null_numbers in zip(null_count_labels, null_count_values, null_count_numbers):
    ax = axs[axis_counter]
    _ = sns.barplot(x=null_labels, y=null_values, ax=ax)
    _ = ax.set_title("Rows With {} Null(s)".format(null_numbers), fontsize=15)
    _ = ax.set_ylabel("")
    _ = ax.set_xlabel("")
    _ = ax.set_xticks([z for z in range(len(null_labels))], null_labels, rotation=90)
    # Print the count on top of every bar.
    for p in ax.patches:
        height = p.get_height()
        ax.text(x=p.get_x()+(p.get_width()/2), y=height, s="{:d}".format(int(height)), ha="center")
    axis_counter += 1

In [None]:
# Duplicated rows in the data set: group the training rows by all feature
# values and look at the sizes of the resulting groups.
duplicate_key_columns = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
    'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
    'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
    'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL'
]
duplicates = train.pivot_table(index=duplicate_key_columns, aggfunc="size")

# Map each distinct group size to the number of groups of that size.
unique, counts = np.unique(duplicates, return_counts=True)
value_counts = dict(zip(unique, counts))

if len(unique) != 1:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))

    # The first (smallest) group size is skipped in the chart.
    repeat_sizes = list(value_counts.keys())[1:]
    repeat_totals = list(value_counts.values())[1:]
    _ = sns.barplot(x=repeat_sizes, y=repeat_totals, ax=ax)
    _ = ax.set_title("Duplicate Counts in Training Set", fontsize=15)
    _ = ax.set_ylabel("Count")
    _ = ax.set_xlabel("Number of Times Row is Duplicated")
    for bar in ax.patches:
        bar_top = bar.get_height()
        ax.text(
            x=bar.get_x() + (bar.get_width() / 2),
            y=bar_top,
            s="{:d}".format(int(bar_top)),
            ha="center",
        )
else:
    # Only one group size exists.
    print(": There are no duplicated rows in the training set")

In [None]:
# Descriptive statistics of the training set, rendered as a styled table.
features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
    'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
    'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
    'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL'
]

# One row per feature, one column per statistic.
summary_table = train[features].describe().T

# In-cell bars for the mean, colour gradients for the spread and the median.
styled_summary = summary_table.style.bar(subset=['mean'], color='#7BCC70')
styled_summary = styled_summary.background_gradient(subset=['std'], cmap='Reds')
styled_summary.background_gradient(subset=['50%'], cmap='coolwarm')

### 1.3数据处理

In [None]:
# Encode the qualitative variable 'EJ' as a number: 'A' -> 0, 'B' -> 1.
# The same mapping is applied to the training and the test frame.
for frame in (train, test):
    frame['EJ'] = frame['EJ'].replace({'A': 0, 'B': 1})

In [None]:
# Fill missing values with the median.
# The medians are computed once, on the training set, and reused for the test
# set so both frames are imputed with the same values.
# numeric_only=True is required: `train` still contains the string 'Id'
# column, and pandas >= 2.0 raises on non-numeric columns instead of
# skipping them.
train_medians = train.median(numeric_only=True)
train.fillna(train_medians, inplace=True)
test.fillna(train_medians, inplace=True)

# The identifier is not a predictor; drop it from the test features.
test = test.drop(columns=['Id'])

In [None]:
# Data split: the target is the 'Class' column, the predictors are every
# column except the identifier and the target.
y_train = train['Class']
x_train = train.drop(columns=['Id', 'Class'])

# 2.基于Optuna优化框架的单模型构建

In [None]:
#导入所需包
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from xgboost import XGBClassifier
import time
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import time

## 2.1定义评价指标balanced log loss

In [None]:
def balanced_logarithmic_loss_new(y_pred, y_true):
    """Return the balanced logarithmic loss of binary probability predictions.

    The loss is the mean of the per-class average log losses, so both classes
    contribute equally no matter how many samples each one has.

    Note the argument order: predictions first, labels second.

    Args:
        y_pred: Predicted probabilities of class 1 (any 1-D array-like).
        y_true: Ground-truth labels, 0 or 1 (array-like of the same length).

    Returns:
        The balanced log loss as a float. If only one of the two classes
        occurs in ``y_true``, the average loss of that class is returned;
        if neither occurs (e.g. empty input) the result is NaN.

    Raises:
        ValueError: If ``y_pred`` and ``y_true`` differ in length.
    """
    # Work on plain positional arrays. Lists are accepted this way, and two
    # pandas objects with different indices are not silently re-aligned.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_pred and y_true must have the same length, got "
            f"{y_pred.shape[0]} and {y_true.shape[0]}"
        )

    # To avoid the extremes of the log function, each predicted probability p
    # is replaced with max(min(p, 1 - 1e-15), 1e-15).
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Average log loss of each class that is actually present. Skipping an
    # absent class avoids the division by zero (and resulting NaN) that a
    # fold without positives or negatives would otherwise produce.
    class_losses = []
    is_negative = y_true == 0
    if is_negative.any():
        class_losses.append(-np.mean(np.log(1 - y_pred[is_negative])))
    is_positive = y_true == 1
    if is_positive.any():
        class_losses.append(-np.mean(np.log(y_pred[is_positive])))

    if not class_losses:
        return float("nan")
    return float(np.mean(class_losses))

In [None]:
# Set aside a hold-out test set (30% of the rows) that is not used for
# training and serves only for validation. random_state fixes the split.
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x_train, y_train, test_size=0.3, random_state=48)

## 2.2 参数优化和模型对比

### 2.2.1 Optuna+LightGBM

In [None]:
# Optuna + LightGBM.
# For each of n_iterations random splits: tune the hyper-parameters with
# 5-fold cross-validation on the tuning part, refit with the best parameters
# and score the refit model on the part that was left out.
log_losses_lightGBM = []
time_lightGBM = []
best_trial_value = []
n_iterations = 10

# Settings that are fixed rather than searched. They are not part of
# `trial.params`, so they have to be passed again when the final model is
# built; otherwise the final model differs from the one that was tuned.
lgbm_fixed_params = {"random_state": 48, "class_weight": "balanced"}
# Positional index of the categorical column (presumably 'EJ' - confirm
# against x_train.columns).
lgbm_categorical_feature = [39]

for i in range(n_iterations):

    # Split into a tuning part and an evaluation part; the seed changes with
    # the iteration so every repetition sees a different split.
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    def objective(trial):
        """Return the mean balanced log loss over 5 CV folds for one trial."""
        # NOTE(review): "feature_fraction" and "colsample_bytree" are aliases
        # of the same LightGBM parameter, so one of the two suggestions is
        # ignored. Both are kept so stored parameter sets stay comparable.
        param_grid = {
            "n_estimators": trial.suggest_int("n_estimators", 1000, 30000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 10, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 0.7),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 0.7),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 100, step=5),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9),
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []

        # 5-fold cross-validation on the tuning part.
        for train_index, test_index in skf.split(x_train2, y_train2):
            x_train3, x_test3 = x_train2.iloc[train_index], x_train2.iloc[test_index]
            y_train3, y_test3 = y_train2.iloc[train_index], y_train2.iloc[test_index]

            model = lgb.LGBMClassifier(**param_grid, **lgbm_fixed_params)
            # NOTE(review): recent LightGBM releases expect early stopping and
            # verbosity to be configured through callbacks - confirm against
            # the installed version.
            model.fit(
                x_train3,
                y_train3,
                eval_set=[(x_train3, y_train3), (x_test3, y_test3)],
                early_stopping_rounds=300,
                verbose=False,
                categorical_feature=lgbm_categorical_feature,
            )
            preds = model.predict_proba(x_test3)
            score = balanced_logarithmic_loss_new(preds[:, 1], y_test3)
            print(score)
            scores.append(score)

        return np.mean(scores)

    # The study name is model-specific because every tuning cell writes to the
    # same sqlite file; a bare f'study{i}' collides with the studies of the
    # other models. load_if_exists=True lets the cell be re-run: an existing
    # study is resumed instead of raising DuplicatedStudyError.
    study = optuna.create_study(
        study_name=f'lightgbm-study{i}',
        direction="minimize",
        storage='sqlite:///db.sqlite3',
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=200)
    best_trial_value.append(study.best_trial.value)
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Final model: best searched parameters plus the same fixed settings and
    # categorical column that were used while tuning.
    model = lgb.LGBMClassifier(**trial.params, **lgbm_fixed_params)

    # Record the start time.
    start_time = time.time()

    # NOTE(review): no early stopping here, so all n_estimators trees are
    # built even though tuning stopped early - consider capping the number.
    model.fit(x_train2, y_train2, categorical_feature=lgbm_categorical_feature)

    # Training time.
    training_time = time.time() - start_time

    prediction = model.predict_proba(x_test2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test2)

    log_losses_lightGBM.append(score_test)
    time_lightGBM.append(training_time)


result_lightGBM = pd.DataFrame({'log_losses' : log_losses_lightGBM,'time_comsumed' : time_lightGBM,'best_value_lightGBM' : best_trial_value})
result_lightGBM.to_csv('result_lightGBM.csv')

'''
Params: 
    bagging_fraction: 0.9
    bagging_freq: 1
    colsample_bytree: 0.8755453466152356
    feature_fraction: 0.6000000000000001
    learning_rate: 0.2941129864759795
    max_depth: 4
    min_child_samples: 85
    n_estimators: 22976
    num_leaves: 2870
    reg_alpha: 0.515407087957732
    reg_lambda: 0.26493493680587554
'''

### 2.2.3 Optuna+CatBoost

In [None]:
from catboost import CatBoostClassifier
import optuna
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

# Optuna + CatBoost.
# For each of n_iterations random splits: tune with 5-fold cross-validation,
# refit with the best parameters, score on the part that was left out.
time_catboost = []
log_losses_catboost = []
best_trial_value = []
n_iterations = 10

for i in range(n_iterations):
    # Split into a tuning part and an evaluation part (seed = iteration).
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    # Settings that are fixed rather than searched; shared by the tuning
    # models and the final model so both are configured identically.
    # The class weight is derived from the labels of the current tuning part
    # (the name used before, `y_train1`, is not defined anywhere).
    catboost_fixed_params = {
        "scale_pos_weight": (y_train2 == 0).sum() / (y_train2 == 1).sum(),
        "random_seed": 42,
        "logging_level": "Silent",
    }

    def objective(trial):
        """Return the mean balanced log loss over 5 CV folds for one trial."""
        param_grid = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "depth": trial.suggest_int("depth", 3, 12),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 1.0),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
            "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
            "border_count": trial.suggest_int("border_count", 1, 255),
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the tuning part.
        for train_index, test_index in skf.split(x_train2, y_train2):
            x_train3, x_test3 = x_train2.iloc[train_index], x_train2.iloc[test_index]
            y_train3, y_test3 = y_train2.iloc[train_index], y_train2.iloc[test_index]

            # use_best_model needs an eval_set, so it is only set here and
            # not for the final model, which is fitted without one.
            model = CatBoostClassifier(
                **param_grid, **catboost_fixed_params, use_best_model=True
            )
            model.fit(
                x_train3,
                y_train3,
                eval_set=(x_test3, y_test3),
                early_stopping_rounds=200,
            )
            preds = model.predict_proba(x_test3)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test3)
            print(score)
            scores.append(score)

        return np.mean(scores)

    # Model-specific study name: all tuning cells share one sqlite file, and
    # f'study{i+1}' clashes with the names used by the other models.
    # load_if_exists=True resumes an existing study when the cell is re-run.
    study = optuna.create_study(
        study_name=f'catboost-study{i}',
        direction="minimize",
        storage='sqlite:///db.sqlite3',
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=200)

    best_trial_value.append(study.best_trial.value)

    best_params = study.best_params

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    # Record the start time.
    start_time = time.time()

    # Final model: best searched parameters plus the fixed settings used
    # during tuning (same class weighting, same seed).
    model = CatBoostClassifier(**best_params, **catboost_fixed_params)
    model.fit(x_train2, y_train2)

    # Training time.
    training_time = time.time() - start_time

    prediction = model.predict_proba(x_test2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test2)

    log_losses_catboost.append(score_test)
    time_catboost.append(training_time)


result_catboost = pd.DataFrame({'log_losses_catboost' : log_losses_catboost,'time_comsumed_catboost' : time_catboost,'best_value_catboost' : best_trial_value})
result_catboost.to_csv('result_catboost1.csv')

'''
Params: 
    bagging_temperature: 7.442784574066854
    border_count: 212
    depth: 3
    iterations: 689
    l2_leaf_reg: 0.8262450118748192
    learning_rate: 0.09468235278046022
    random_strength: 3.098327157242888
'''

### 2.2.4 Optuna+随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Optuna + random forest.
# For each of n_iterations random splits: tune with 5-fold cross-validation,
# refit with the best parameters, score on the part that was left out.
log_losses_rf = []
best_trial_value = []
time_rf = []
n_iterations = 10

# Settings that are fixed rather than searched. They are not contained in
# `study.best_params`, so the final model must receive them explicitly;
# without them it would be unweighted and unseeded, unlike the tuned models.
rf_fixed_params = {"class_weight": "balanced", "random_state": 48}

for i in range(n_iterations):
    # Split into a tuning part and an evaluation part (seed = iteration).
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    def objective(trial):
        """Return the mean balanced log loss over 5 CV folds for one trial."""
        param_grid = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the tuning part.
        for train_index, test_index in skf.split(x_train2, y_train2):
            x_train3, x_test3 = x_train2.iloc[train_index], x_train2.iloc[test_index]
            y_train3, y_test3 = y_train2.iloc[train_index], y_train2.iloc[test_index]

            model = RandomForestClassifier(**param_grid, **rf_fixed_params)
            model.fit(x_train3, y_train3)
            preds = model.predict_proba(x_test3)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test3)
            print(score)
            scores.append(score)

        return np.mean(scores)

    # In-memory study: nothing is persisted, so re-running starts fresh.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)

    best_trial_value.append(study.best_trial.value)

    best_params = study.best_params

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    # Record the start time.
    start_time = time.time()

    # Final model: best searched parameters plus the fixed settings.
    model = RandomForestClassifier(**best_params, **rf_fixed_params)
    model.fit(x_train2, y_train2)

    # Training time.
    training_time = time.time() - start_time

    prediction = model.predict_proba(x_test2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test2)

    log_losses_rf.append(score_test)
    time_rf.append(training_time)


result_rf = pd.DataFrame({'log_losses_rf' : log_losses_rf,'time_comsumed_rf' : time_rf,'best_value_rf' : best_trial_value})
result_rf.to_csv('result_rf.csv')
'''
Params: 
    n_estimators: 190
    criterion: entropy
    max_depth: 5
    min_samples_split: 9
    min_samples_leaf: 5
    max_features: sqrt
'''


### 2.2.5 Optuna+SVC

In [None]:
# MinMaxScaler is used below but is not imported by any earlier cell.
from sklearn.preprocessing import MinMaxScaler

# Optuna + SVC.
# For each of n_iterations random splits: tune with 5-fold cross-validation,
# refit with the best parameters, score on the part that was left out.
# Features are min-max scaled; the scaler is always fitted on the training
# rows only and then applied to the rows being predicted.
log_losses_svm = []
time_svm = []
best_trial_value = []

n_iterations = 10

# Settings that are fixed rather than searched. They are not contained in
# `study.best_params`, so the final model must receive them explicitly.
# probability=True is needed because the metric consumes probabilities.
svc_fixed_params = {
    "coef0": 0,
    "class_weight": "balanced",
    "random_state": 48,
    "probability": True,
}

for i in range(n_iterations):
    # Split into a tuning part and an evaluation part (seed = iteration).
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    def objective(trial):
        """Return the mean balanced log loss over 5 CV folds for one trial."""
        param_grid = {
            # Log-uniform search; suggest_loguniform is deprecated in favour
            # of suggest_float(..., log=True).
            "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "sigmoid", 'poly']),
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
            "degree": trial.suggest_int("degree", 1, 5),
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the tuning part.
        for train_index, test_index in skf.split(x_train2, y_train2):
            x_train3, x_test3 = x_train2.iloc[train_index], x_train2.iloc[test_index]
            y_train3, y_test3 = y_train2.iloc[train_index], y_train2.iloc[test_index]

            # Scale both folds with statistics of the training fold; the
            # validation fold must go through the same transformation as the
            # data the model was fitted on.
            scaler = MinMaxScaler()
            x_train3_scaled = scaler.fit_transform(x_train3)
            x_test3_scaled = scaler.transform(x_test3)

            model = SVC(**param_grid, **svc_fixed_params)
            model.fit(x_train3_scaled, y_train3)
            # The loss expects P(class 1); decision_function returns signed
            # margins, not probabilities, so predict_proba is used (as in the
            # final evaluation below).
            preds = model.predict_proba(x_test3_scaled)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test3)

            scores.append(score)

        return np.mean(scores)

    # load_if_exists=True resumes an existing study when the cell is re-run
    # instead of raising DuplicatedStudyError on the persisted name.
    study = optuna.create_study(
        study_name=f'study-svc-{i}',
        direction="minimize",
        storage='sqlite:///db.sqlite3',
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=200)
    best_trial_value.append(study.best_trial.value)

    best_params = study.best_params

    # Record the start time.
    start_time = time.time()

    # Final model: best searched parameters plus the fixed settings.
    model = SVC(**best_params, **svc_fixed_params)

    # Fit the scaler on the tuning part and apply it to both parts, keeping
    # the column names and row index of the original frames.
    scaler = MinMaxScaler()
    scaler.fit(x_train2)
    x_train1 = pd.DataFrame(
        data=scaler.transform(x_train2), columns=x_train2.columns, index=x_train2.index
    )
    x_test1 = pd.DataFrame(
        data=scaler.transform(x_test2), columns=x_test2.columns, index=x_test2.index
    )

    model.fit(x_train1, y_train2)

    # Training time.
    training_time = time.time() - start_time

    prediction = model.predict_proba(x_test1)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test2)

    log_losses_svm.append(score_test)
    time_svm.append(training_time)

result_svm = pd.DataFrame({'log_losses' : log_losses_svm,
                                     'time_comsumed' : time_svm, 'best_value' : best_trial_value})
result_svm.to_csv('result_svm_10itrations0608.csv')




### 2.2.6 Optuna+XGBoost

In [None]:
# Optuna + XGBoost.
# For each of n_iterations random splits: tune with 5-fold cross-validation,
# refit with the best parameters, score on the part that was left out.
log_losses_xgboost = []
time_xgboost = []
best_trial_value = []
n_iterations = 10

# Settings that are fixed rather than searched. They are not contained in
# `study.best_params`, so the final model must receive them explicitly.
# random_state is the scikit-learn-API name of the seed.
xgb_fixed_params = {"objective": "binary:logistic", "random_state": 48}

for i in range(n_iterations):
    # Split into a tuning part and an evaluation part (seed = iteration).
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    def objective(trial):
        """Return the mean balanced log loss over 5 CV folds for one trial."""
        # "alpha" is not searched separately: it is an alias of "reg_alpha",
        # so suggesting both would set the same L1 penalty twice with
        # conflicting values.
        param_grid = {
            "n_estimators": trial.suggest_int("n_estimators", 1000, 30000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "gamma": trial.suggest_float("gamma", 0.01, 0.7),
            "subsample": trial.suggest_float("subsample", 0.2, 1, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 0.7),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 0.7),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10),
            "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the tuning part.
        for train_index, test_index in skf.split(x_train2, y_train2):
            x_train3, x_test3 = x_train2.iloc[train_index], x_train2.iloc[test_index]
            y_train3, y_test3 = y_train2.iloc[train_index], y_train2.iloc[test_index]

            # No `feval` here: it is not a constructor argument of the
            # scikit-learn wrapper, and the metric's (y_pred, y_true)
            # signature does not match what a custom XGBoost metric receives.
            # The balanced log loss is computed explicitly below instead.
            model = xgb.XGBClassifier(**param_grid, **xgb_fixed_params)
            # NOTE(review): recent XGBoost releases expect
            # early_stopping_rounds in the constructor - confirm against the
            # installed version.
            model.fit(
                x_train3,
                y_train3,
                eval_set=[(x_test3, y_test3)],
                early_stopping_rounds=200,
                verbose=False,
            )

            preds = model.predict_proba(x_test3)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test3)
            scores.append(score)

        return np.mean(scores)

    # Model-specific study name: all tuning cells share one sqlite file, and
    # a bare f'study{i}' clashes with the names used by the other models.
    # load_if_exists=True resumes an existing study when the cell is re-run.
    study = optuna.create_study(
        study_name=f'xgboost-study{i}',
        direction="minimize",
        storage='sqlite:///db.sqlite3',
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=200)
    best_trial_value.append(study.best_trial.value)

    best_params = study.best_params

    # Record the start time.
    start_time = time.time()

    # Final model: best searched parameters plus the fixed settings.
    # NOTE(review): no early stopping here, so all n_estimators trees are
    # built even though tuning stopped early - consider capping the number.
    model = xgb.XGBClassifier(**best_params, **xgb_fixed_params)
    model.fit(x_train2, y_train2)

    # Training time.
    training_time = time.time() - start_time

    prediction = model.predict_proba(x_test2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test2)

    log_losses_xgboost.append(score_test)
    time_xgboost.append(training_time)

result_xgboost = pd.DataFrame({'log_losses' : log_losses_xgboost,
                                     'time_comsumed' : time_xgboost, 'best_value' : best_trial_value})
result_xgboost.to_csv('result_xgboost_10itrations0607.csv')



## 2.3 LightGBM模型效果分析

In [None]:
# Refit LightGBM with the selected hyper-parameters on the 70% training part
# and report the balanced log loss on the untouched hold-out part.
bestparams_lgbm = dict(
    bagging_fraction=0.9,
    bagging_freq=1,
    colsample_bytree=0.8755453466152356,
    feature_fraction=0.6000000000000001,
    learning_rate=0.2941129864759795,
    max_depth=4,
    min_child_samples=85,
    n_estimators=22976,
    num_leaves=2870,
    reg_alpha=0.515407087957732,
    reg_lambda=0.26493493680587554,
)

clf = lgb.LGBMClassifier(class_weight='balanced', **bestparams_lgbm)

# Column 39 is declared categorical, as in the tuning runs.
clf.fit(x_train_1, y_train_1, categorical_feature=[39])

# Probability of class 1 for every hold-out row.
holdout_proba = clf.predict_proba(x_test_1)
prediction = holdout_proba[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)
print('Test Score:', score_test)

In [None]:
# This cell runs the same fit and hold-out evaluation as the preceding cell:
# LightGBM with the selected hyper-parameters, trained on the 70% training
# part and scored with the balanced log loss on the hold-out part.
bestparams_lgbm = dict(
    bagging_fraction=0.9,
    bagging_freq=1,
    colsample_bytree=0.8755453466152356,
    feature_fraction=0.6000000000000001,
    learning_rate=0.2941129864759795,
    max_depth=4,
    min_child_samples=85,
    n_estimators=22976,
    num_leaves=2870,
    reg_alpha=0.515407087957732,
    reg_lambda=0.26493493680587554,
)

clf = lgb.LGBMClassifier(class_weight='balanced', **bestparams_lgbm)

# Column 39 is declared categorical, as in the tuning runs.
clf.fit(x_train_1, y_train_1, categorical_feature=[39])

# Probability of class 1 for every hold-out row.
holdout_proba = clf.predict_proba(x_test_1)
prediction = holdout_proba[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)
print('Test Score:', score_test)

In [None]:
# SHAP
# Official documentation: https://shap.readthedocs.io/en/latest/overviews.html
# Two helpful write-ups: https://zhuanlan.zhihu.com/p/83412330, https://zhuanlan.zhihu.com/p/103370775

# shap has to be installed first: pip install shap
import shap
shap.initjs()  # in a notebook, load the JS code used for visualization

# Model explanation in SHAP starts by creating an explainer
# SHAP supports many explainer types (e.g. deep, gradient, kernel, linear, tree, sampling)
explainer = shap.TreeExplainer(clf,model_output='raw')
shap_values = explainer.shap_values(x_train_1) # pass the feature matrix, compute the SHAP values
y_base = explainer.expected_value
print(y_base)

In [None]:
# SHAP feature importance (bar chart), using element [1] of shap_values
# NOTE(review): indexing with [1] assumes shap_values holds one array per class - confirm for the installed shap version
shap.summary_plot(shap_values[1], x_train_1, plot_type="bar")

In [None]:
# Variable importance: PDP
# Partial dependence plots produced with sklearn
# NOTE(review): the plots are computed on x_train (all rows), while clf was fitted on x_train_1 - confirm this is intended

from matplotlib import pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

# One-way partial dependence of the model output on DA and on BQ
disp1 = PartialDependenceDisplay.from_estimator(clf, x_train, features=["DA","BQ"],method="brute") # features: which variables to plot
# Two-way partial dependence (interaction effect) of DA and BQ
disp2 = PartialDependenceDisplay.from_estimator(clf, x_train, features=[("DA","BQ")], method="brute") # a tuple requests a joint two-dimensional plot

In [None]:
# Force plot for the first sample - considering f(x)=P(Y=1)
# Row 0 of the class-1 SHAP values is shown; the names come from the training columns
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], feature_names=x_train_1.columns)

In [None]:
# Force plot over all samples, using the class-1 entries of expected_value and shap_values.
# NOTE(review): only the DU, BQ and AB columns are passed as features while shap_values[1] is
# passed unsliced — confirm that the two shapes agree.
shap.force_plot(explainer.expected_value[1], shap_values[1], x_train_1.loc[:,['DU','BQ','AB']])

# 3.基于Stacking融合模型构建

## 3.1单模型预测及预测值相关性探究

### 3.1.1 随机森林+输出预测

In [None]:
n_iterations = 1

# Hyper-parameters that are held fixed instead of searched. study.best_params
# only contains values produced by trial.suggest_*, so these must be passed
# again when the final model is built; otherwise the final forest would be
# trained without class balancing and without a seed.
fixed_rf_params = {"class_weight": "balanced", "random_state": 48}

for i in range(n_iterations):
    # Split into training and hold-out sets; the loop index is the split seed.
    x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_train_1, y_train_1, test_size=0.3, random_state=i)

    def objective(trial):
        """Return the mean balanced log loss over 5 stratified folds for one trial."""
        param_grid = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            **fixed_rf_params,
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the training split drives the search.
        for train_index, test_index in skf.split(x_train_2, y_train_2):
            x_train_3, x_test_3 = x_train_2.iloc[train_index], x_train_2.iloc[test_index]
            y_train_3, y_test_3 = y_train_2.iloc[train_index], y_train_2.iloc[test_index]

            model = RandomForestClassifier(**param_grid)
            model.fit(x_train_3, y_train_3)
            preds = model.predict_proba(x_test_3)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test_3)
            print(score)
            scores.append(score)

        return np.mean(scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)

    best_params = study.best_params

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    # Final model: searched values plus the same fixed settings used in every trial.
    model = RandomForestClassifier(**best_params, **fixed_rf_params)
    model.fit(x_train_2, y_train_2)

    prediction = model.predict_proba(x_test_2)[:, 1]

# Persist the hold-out predictions of the last iteration.
prediction = pd.DataFrame(prediction)
prediction.to_csv('predictiont_rf.csv')
'''
Params: 
    n_estimators: 749
    criterion: gini
    max_depth: 11
    min_samples_split: 10
    min_samples_leaf: 5
    max_features: sqrt
'''

### 3.1.2 catboost+输出预测

In [None]:
n_iterations = 1

for i in range(n_iterations):
    # Hold out 30% of the rows; the iteration number seeds the split.
    x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
        x_train_1, y_train_1, test_size=0.3, random_state=i
    )

    def objective(trial):
        """Score one CatBoost configuration by its mean balanced log loss over 5 stratified folds."""
        param_grid = dict(
            iterations=trial.suggest_int("iterations", 100, 1000),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            depth=trial.suggest_int("depth", 3, 12),
            l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 0.01, 1.0),
            bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
            random_strength=trial.suggest_float("random_strength", 0.0, 10.0),
            border_count=trial.suggest_int("border_count", 1, 255),
            # Ratio of label-0 rows to label-1 rows in the training split.
            scale_pos_weight=sum(y_train_2 == 0) / sum(y_train_2 == 1),
            use_best_model=True,
            random_seed=48,
            logging_level="Silent",
        )

        splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        fold_losses = []
        for fit_idx, val_idx in splitter.split(x_train_2, y_train_2):
            fit_x, fit_y = x_train_2.iloc[fit_idx], y_train_2.iloc[fit_idx]
            val_x, val_y = x_train_2.iloc[val_idx], y_train_2.iloc[val_idx]

            booster = CatBoostClassifier(**param_grid)
            # The validation fold doubles as the early-stopping eval set.
            booster.fit(fit_x, fit_y, eval_set=(val_x, val_y), early_stopping_rounds=200)
            val_proba = booster.predict_proba(val_x)[:, 1]
            fold_losses.append(balanced_logarithmic_loss_new(val_proba, val_y))

        return np.mean(fold_losses)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)

    best_params = study.best_params

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    # Refit on the whole training split with the best searched values.
    model = CatBoostClassifier(auto_class_weights='Balanced', **best_params)
    model.fit(x_train_2, y_train_2)

    prediction = model.predict_proba(x_test_2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test_2)

# Persist the hold-out predictions of the last iteration.
prediction = pd.DataFrame(prediction)
prediction.to_csv('predictiont_catboost.csv')
'''
Params: 
    iterations: 852
    learning_rate: 0.1266693440048803
    depth: 3
    l2_leaf_reg: 0.7364685775851822
    bagging_temperature: 5.05484109041439
    random_strength: 0.4104855929305867
    border_count: 177
'''

### 3.1.3 lightgbm+输出预测

In [None]:
n_iterations = 1
for i in range(n_iterations):

    # Split into training and hold-out sets; the loop index is the split seed.
    x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_train_1, y_train_1, test_size=0.3, random_state=i)

    def objective(trial):
        """Return the mean balanced log loss over 5 stratified folds for one trial."""
        # NOTE(review): LightGBM lists colsample_bytree as an alias of
        # feature_fraction; both are searched here, so one of the two is
        # presumably ignored — confirm.
        param_grid = {
            "random_state": 48,
            "n_estimators": trial.suggest_int("n_estimators", 1000, 30000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 10, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 0.7),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 0.7),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 100, step=5),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9)
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []

        # 5-fold cross-validation on the training split drives the search.
        for train_index, test_index in skf.split(x_train_2, y_train_2):
            x_train_3, x_test_3 = x_train_2.iloc[train_index], x_train_2.iloc[test_index]
            y_train_3, y_test_3 = y_train_2.iloc[train_index], y_train_2.iloc[test_index]

            model = lgb.LGBMClassifier(**param_grid, class_weight='balanced')
            model.fit(
                x_train_3,
                y_train_3,
                eval_set=[(x_train_3, y_train_3), (x_test_3, y_test_3)],
                early_stopping_rounds=200,
                verbose=False,
                categorical_feature=[39]
            )
            preds = model.predict_proba(x_test_3)
            score = balanced_logarithmic_loss_new(preds[:, 1], y_test_3)
            scores.append(score)

        return np.mean(scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)

    trial = study.best_trial

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # trial.params holds only the suggested values, so the seed fixed during
    # the search is passed again here.
    model = lgb.LGBMClassifier(**trial.params, class_weight='balanced', random_state=48)

    # Declare the same categorical column as in the tuning folds so the final
    # model treats that feature the same way as the models it was tuned with.
    model.fit(x_train_2, y_train_2, categorical_feature=[39])

    prediction = model.predict_proba(x_test_2)[:, 1]

# Persist the hold-out predictions of the last iteration.
prediction = pd.DataFrame(prediction)
prediction.to_csv('predictiont_lightgbm.csv')
'''
Params: 
    n_estimators: 26732
    learning_rate: 0.1769023557198449
    num_leaves: 2690
    max_depth: 12
    reg_alpha: 0.1833787704392688
    reg_lambda: 0.06989518570851735
    bagging_fraction: 0.9
    bagging_freq: 1
    feature_fraction: 0.9
    min_child_samples: 60
    colsample_bytree: 0.5391855748943887
'''

### 3.1.4 XGBoost+输出预测

In [None]:
n_iterations = 1

# Settings held fixed during the search. study.best_params only contains the
# suggested values, so these are passed again to the final model.
fixed_xgb_params = {"objective": "binary:logistic", "seed": 48}

for i in range(n_iterations):
    # Split into training and hold-out sets; the loop index is the split seed.
    x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_train_1, y_train_1, test_size=0.3, random_state=i)

    def objective(trial):
        """Return the mean balanced log loss over 5 stratified folds for one trial."""
        # NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`;
        # both are searched here — confirm which one takes effect.
        param_grid = {
            **fixed_xgb_params,
            "n_estimators": trial.suggest_int("n_estimators", 1000, 30000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "gamma": trial.suggest_float("gamma", 0.01, 0.7),
            "subsample": trial.suggest_float("subsample", 0.2, 1, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 0.7),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 0.7),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10),
            'max_delta_step': trial.suggest_int("max_delta_step", 0, 10),
            'alpha': trial.suggest_float("alpha", 0.01, 0.7)
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = []
        # 5-fold cross-validation on the training split drives the search.
        for train_index, test_index in skf.split(x_train_2, y_train_2):
            x_train_3, x_test_3 = x_train_2.iloc[train_index], x_train_2.iloc[test_index]
            y_train_3, y_test_3 = y_train_2.iloc[train_index], y_train_2.iloc[test_index]

            # NOTE(review): `feval` is not a documented XGBClassifier constructor
            # argument; it is presumably forwarded as an unused booster
            # parameter — confirm, and drop it if so.
            model = xgb.XGBClassifier(**param_grid, feval=balanced_logarithmic_loss_new)
            model.fit(x_train_3, y_train_3,
                      eval_set=[(x_test_3, y_test_3)],
                      early_stopping_rounds=200,
                      verbose=False)
            preds = model.predict_proba(x_test_3)[:, 1]
            score = balanced_logarithmic_loss_new(preds, y_test_3)

            print(score)
            scores.append(score)

        return np.mean(scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)

    best_params = study.best_params

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

    # Final model: searched values plus the same fixed settings used in every trial.
    model = xgb.XGBClassifier(**best_params, **fixed_xgb_params, feval=balanced_logarithmic_loss_new)
    model.fit(x_train_2, y_train_2)

    prediction = model.predict_proba(x_test_2)[:, 1]
    score_test = balanced_logarithmic_loss_new(prediction, y_test_2)


# Persist the hold-out predictions of the last iteration; the column label
# was previously misspelled as 'XBG'.
prediction = pd.DataFrame({'XGB': prediction})
prediction.to_csv('prediction_XGB.csv')

'''
Best Params:
n_estimators: 19808
learning_rate: 0.29003009973815874
max_depth: 11
gamma: 0.3584743148039589
subsample: 0.7
colsample_bytree: 0.6874234399659787
reg_alpha: 0.14303443256590267
reg_lambda: 0.30450443282388806
min_child_weight: 2.78622578591918
max_delta_step: 6
alpha: 0.43245457551855127
'''

### 3.1.5 热力图

In [None]:
# Heat map of the pairwise correlation between the single-model predictions.
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('prediction_model(1).csv')
# Relabel the columns with the model names.
df = df.set_axis(['XGB', 'CatBoost', 'LightGBM', 'RF'], axis=1)

sns.set(font_scale=1.5)
sns.set_context({"figure.figsize": (8, 8)})
sns.heatmap(df.corr(), square=True, annot=True, linewidths=0.8)
plt.show(block=True)

## 3.2 Stacking融合模型构建

### 3.2.1 基学习器：LGBM, CatBoost, XGBoost 元学习器：Logistic回归

In [None]:
_N_FOLDS = 5
_N_CLASS = 2
kf = KFold(n_splits=_N_FOLDS, shuffle=True, random_state=68)


def get_oof(clfname, X_train, y_train, X_test):
    """Build first-layer stacking features for one base learner.

    The learner is refitted on each split produced by the module-level ``kf``.
    Each training row receives the class probabilities predicted by the fit
    that held it out, and the test-set probabilities are averaged over folds.
    The held-out fold is also the early-stopping ``eval_set``, so it does
    influence the number of boosting rounds used to predict it.

    Args:
        clfname: Base learner key: 'xgb', 'cat' or 'lgbm'.
        X_train: Training features; rows are selected positionally via ``.iloc``.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features of the hold-out set.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays with shapes
        ``(X_train.shape[0], _N_CLASS)`` and ``(X_test.shape[0], _N_CLASS)``.

    Raises:
        ValueError: If ``clfname`` is not one of the supported keys.
    """
    if clfname == 'xgb':
        bestparams_xgb = {
            'n_estimators': 19808,
            'learning_rate': 0.29003009973815874,
            'max_depth': 11,
            'gamma': 0.3584743148039589,
            'subsample': 0.7,
            'colsample_bytree': 0.6874234399659787,
            'reg_alpha': 0.14303443256590267,
            'reg_lambda': 0.30450443282388806,
            'min_child_weight': 2.78622578591918,
            'max_delta_step': 6,
            'alpha': 0.43245457551855127,
        }
        clf = xgb.XGBClassifier(**bestparams_xgb, feval=balanced_logarithmic_loss_new)
    elif clfname == 'cat':
        bestparams_cat = {
            'bagging_temperature': 7.442784574066854,
            'border_count': 212,
            'depth': 3,
            'iterations': 689,
            'l2_leaf_reg': 0.8262450118748192,
            'learning_rate': 0.09468235278046022,
            'random_strength': 3.098327157242888
        }
        clf = CatBoostClassifier(**bestparams_cat, auto_class_weights='Balanced')
    elif clfname == 'lgbm':
        bestparams_lgbm = {
            'bagging_fraction': 0.9,
            'bagging_freq': 1,
            'colsample_bytree': 0.8755453466152356,
            'feature_fraction': 0.6000000000000001,
            'learning_rate': 0.2941129864759795,
            'max_depth': 4,
            'min_child_samples': 85,
            'n_estimators': 22976,
            'num_leaves': 2870,
            'reg_alpha': 0.515407087957732,
            'reg_lambda': 0.26493493680587554
        }
        clf = lgb.LGBMClassifier(**bestparams_lgbm, class_weight='balanced')
    else:
        # Previously an unknown key fell through and failed later on an unbound `clf`.
        raise ValueError(f"unknown base learner: {clfname!r}")

    # Only LightGBM is told which column is categorical.
    fit_kwargs = {'early_stopping_rounds': 200}
    if clfname == 'lgbm':
        fit_kwargs['categorical_feature'] = [39]

    oof_train = np.zeros((X_train.shape[0], _N_CLASS))  # out-of-fold predictions for the training rows
    oof_test = np.zeros((X_test.shape[0], _N_CLASS))    # running sum of test predictions

    for train_index, test_index in kf.split(X_train):
        kf_X_train = X_train.iloc[train_index]
        kf_y_train = y_train.iloc[train_index]
        kf_X_test = X_train.iloc[test_index]
        kf_y_test = y_train.iloc[test_index]

        clf.fit(kf_X_train, kf_y_train, eval_set=[(kf_X_test, kf_y_test)], **fit_kwargs)

        oof_train[test_index] = clf.predict_proba(kf_X_test)
        oof_test += np.around(clf.predict_proba(X_test), 6)

    # Average over the actual number of folds instead of a hard-coded 5.
    oof_test /= _N_FOLDS
    return oof_train, oof_test

# Bind the notebook-level split to the names used by the stacking code.
X_train, y_train, X_test = x_train_1, y_train_1, x_test_1

# First layer: run every base learner through get_oof and keep only the second
# probability column of each as a meta-feature.
oof_train_cols, oof_test_cols = {}, {}
for clfname in ['lgbm', 'cat', 'xgb']:
    oof_train, oof_test = get_oof(clfname, X_train, y_train, X_test)
    oof_train_cols[clfname + '1'] = oof_train[:, 1]
    oof_test_cols[clfname + '1'] = oof_test[:, 1]

new_train = pd.DataFrame(oof_train_cols)
new_test = pd.DataFrame(oof_test_cols)

# Second layer: tune a logistic-regression meta-learner on the new features.
x_train4, y_train4 = new_train, y_train


def objective(trial):
    """Return the mean balanced log loss of a logistic regression over 5 stratified folds."""
    param_grid = dict(
        C=trial.suggest_float("C", 1e-3, 1e2),
        solver=trial.suggest_categorical("solver", ['liblinear', 'saga']),
        penalty=trial.suggest_categorical('penalty', ["l1", "l2"]),
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=72)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x_train4, y_train4):
        meta = LogisticRegression(**param_grid)
        meta.fit(x_train4.iloc[fit_idx], y_train4.iloc[fit_idx])
        val_proba = meta.predict_proba(x_train4.iloc[val_idx])[:, 1]
        fold_score = balanced_logarithmic_loss_new(val_proba, y_train4.iloc[val_idx])
        print(fold_score)
        fold_scores.append(fold_score)
    return np.mean(fold_scores)


study = optuna.create_study(study_name='stacking_study_lg', direction="minimize")
study.optimize(objective, n_trials=200)
best_params = study.best_params

print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

# Time the construction and fit of the final meta-learner.
start_time = time.time()
stacking_model = LogisticRegression(**best_params)
stacking_model.fit(new_train, y_train)
training_time = time.time() - start_time

# Score the stacked model on the hold-out meta-features.
prediction = stacking_model.predict_proba(new_test)[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)



### 3.2.2 基学习器：LGBM, CatBoost, RF 元学习器：Logistic回归

In [None]:
_N_FOLDS = 5
_N_CLASS = 2
kf = KFold(n_splits=_N_FOLDS, shuffle=True, random_state=68)


def get_oof(clfname, X_train, y_train, X_test):
    """Build first-layer stacking features for one base learner.

    The learner is refitted on each split produced by the module-level ``kf``.
    Each training row receives the class probabilities predicted by the fit
    that held it out, and the test-set probabilities are averaged over folds.
    For the boosted learners the held-out fold is also the early-stopping
    ``eval_set``, so it does influence the number of boosting rounds.

    Args:
        clfname: Base learner key: 'rf', 'cat' or 'lgbm'.
        X_train: Training features; rows are selected positionally via ``.iloc``.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features of the hold-out set.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays with shapes
        ``(X_train.shape[0], _N_CLASS)`` and ``(X_test.shape[0], _N_CLASS)``.

    Raises:
        ValueError: If ``clfname`` is not one of the supported keys.
    """
    if clfname == 'rf':
        bestparams_rf = {
            'n_estimators': 190,
            'criterion': 'entropy',
            'max_depth': 5,
            'min_samples_split': 9,
            'min_samples_leaf': 5,
            'max_features': 'sqrt'
        }
        # The random-forest search in this notebook fixes class_weight and
        # random_state in every trial; re-apply them so the base learner is
        # class-balanced like the other two and reproducible.
        clf = RandomForestClassifier(**bestparams_rf, class_weight='balanced', random_state=48)
    elif clfname == 'cat':
        bestparams_cat = {
            'bagging_temperature': 7.442784574066854,
            'border_count': 212,
            'depth': 3,
            'iterations': 689,
            'l2_leaf_reg': 0.8262450118748192,
            'learning_rate': 0.09468235278046022,
            'random_strength': 3.098327157242888
        }
        clf = CatBoostClassifier(**bestparams_cat, auto_class_weights='Balanced')
    elif clfname == 'lgbm':
        bestparams_lgbm = {
            'bagging_fraction': 0.9,
            'bagging_freq': 1,
            'colsample_bytree': 0.8755453466152356,
            'feature_fraction': 0.6000000000000001,
            'learning_rate': 0.2941129864759795,
            'max_depth': 4,
            'min_child_samples': 85,
            'n_estimators': 22976,
            'num_leaves': 2870,
            'reg_alpha': 0.515407087957732,
            'reg_lambda': 0.26493493680587554
        }
        clf = lgb.LGBMClassifier(**bestparams_lgbm, class_weight='balanced')
    else:
        # Previously an unknown key fell through and failed later on an unbound `clf`.
        raise ValueError(f"unknown base learner: {clfname!r}")

    # Only LightGBM is told which column is categorical.
    fit_kwargs = {'early_stopping_rounds': 200}
    if clfname == 'lgbm':
        fit_kwargs['categorical_feature'] = [39]

    oof_train = np.zeros((X_train.shape[0], _N_CLASS))  # out-of-fold predictions for the training rows
    oof_test = np.zeros((X_test.shape[0], _N_CLASS))    # running sum of test predictions

    for train_index, test_index in kf.split(X_train):
        kf_X_train = X_train.iloc[train_index]
        kf_y_train = y_train.iloc[train_index]
        kf_X_test = X_train.iloc[test_index]
        kf_y_test = y_train.iloc[test_index]

        if clfname == 'rf':
            # The random forest is fitted without an eval set or early stopping.
            clf.fit(kf_X_train, kf_y_train)
        else:
            clf.fit(kf_X_train, kf_y_train, eval_set=[(kf_X_test, kf_y_test)], **fit_kwargs)

        oof_train[test_index] = clf.predict_proba(kf_X_test)
        oof_test += np.around(clf.predict_proba(X_test), 6)

    # Average over the actual number of folds instead of a hard-coded 5.
    oof_test /= _N_FOLDS
    return oof_train, oof_test

# Bind the notebook-level split to the names used by the stacking code.
X_train, y_train, X_test = x_train_1, y_train_1, x_test_1

# First layer: run every base learner through get_oof and keep only the second
# probability column of each as a meta-feature.
oof_train_cols, oof_test_cols = {}, {}
for clfname in ['lgbm', 'cat', 'rf']:
    oof_train, oof_test = get_oof(clfname, X_train, y_train, X_test)
    oof_train_cols[clfname + '1'] = oof_train[:, 1]
    oof_test_cols[clfname + '1'] = oof_test[:, 1]

new_train = pd.DataFrame(oof_train_cols)
new_test = pd.DataFrame(oof_test_cols)

# Second layer: tune a logistic-regression meta-learner on the new features.
x_train4, y_train4 = new_train, y_train


def objective(trial):
    """Return the mean balanced log loss of a logistic regression over 5 stratified folds."""
    param_grid = dict(
        C=trial.suggest_float("C", 1e-3, 1e2),
        solver=trial.suggest_categorical("solver", ['liblinear', 'saga']),
        penalty=trial.suggest_categorical('penalty', ["l1", "l2"]),
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=72)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x_train4, y_train4):
        meta = LogisticRegression(**param_grid)
        meta.fit(x_train4.iloc[fit_idx], y_train4.iloc[fit_idx])
        val_proba = meta.predict_proba(x_train4.iloc[val_idx])[:, 1]
        fold_score = balanced_logarithmic_loss_new(val_proba, y_train4.iloc[val_idx])
        print(fold_score)
        fold_scores.append(fold_score)
    return np.mean(fold_scores)


study = optuna.create_study(study_name='stacking_study_lg', direction="minimize")
study.optimize(objective, n_trials=200)
best_params = study.best_params

print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

# Time the construction and fit of the final meta-learner.
start_time = time.time()
stacking_model = LogisticRegression(**best_params)
stacking_model.fit(new_train, y_train)
training_time = time.time() - start_time

# Score the stacked model on the hold-out meta-features.
prediction = stacking_model.predict_proba(new_test)[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)

### 3.2.3 基学习器：LGBM, CatBoost, 随机森林 元学习器：随机森林

In [None]:
_N_FOLDS = 5
_N_CLASS = 2
kf = KFold(n_splits=_N_FOLDS, shuffle=True, random_state=68)


def get_oof(clfname, X_train, y_train, X_test):
    """Build first-layer stacking features for one base learner.

    The learner is refitted on each split produced by the module-level ``kf``.
    Each training row receives the class probabilities predicted by the fit
    that held it out, and the test-set probabilities are averaged over folds.
    For the boosted learners the held-out fold is also the early-stopping
    ``eval_set``, so it does influence the number of boosting rounds.

    Args:
        clfname: Base learner key: 'rf', 'cat' or 'lgbm'.
        X_train: Training features; rows are selected positionally via ``.iloc``.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features of the hold-out set.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays with shapes
        ``(X_train.shape[0], _N_CLASS)`` and ``(X_test.shape[0], _N_CLASS)``.

    Raises:
        ValueError: If ``clfname`` is not one of the supported keys.
    """
    if clfname == 'rf':
        bestparams_rf = {
            'n_estimators': 190,
            'criterion': 'entropy',
            'max_depth': 5,
            'min_samples_split': 9,
            'min_samples_leaf': 5,
            'max_features': 'sqrt'
        }
        # The random-forest search in this notebook fixes class_weight and
        # random_state in every trial; re-apply them so the base learner is
        # class-balanced like the other two and reproducible.
        clf = RandomForestClassifier(**bestparams_rf, class_weight='balanced', random_state=48)
    elif clfname == 'cat':
        bestparams_cat = {
            'bagging_temperature': 7.442784574066854,
            'border_count': 212,
            'depth': 3,
            'iterations': 689,
            'l2_leaf_reg': 0.8262450118748192,
            'learning_rate': 0.09468235278046022,
            'random_strength': 3.098327157242888
        }
        clf = CatBoostClassifier(**bestparams_cat, auto_class_weights='Balanced')
    elif clfname == 'lgbm':
        bestparams_lgbm = {
            'bagging_fraction': 0.9,
            'bagging_freq': 1,
            'colsample_bytree': 0.8755453466152356,
            'feature_fraction': 0.6000000000000001,
            'learning_rate': 0.2941129864759795,
            'max_depth': 4,
            'min_child_samples': 85,
            'n_estimators': 22976,
            'num_leaves': 2870,
            'reg_alpha': 0.515407087957732,
            'reg_lambda': 0.26493493680587554
        }
        clf = lgb.LGBMClassifier(**bestparams_lgbm, class_weight='balanced')
    else:
        # Previously an unknown key fell through and failed later on an unbound `clf`.
        raise ValueError(f"unknown base learner: {clfname!r}")

    # Only LightGBM is told which column is categorical.
    fit_kwargs = {'early_stopping_rounds': 200}
    if clfname == 'lgbm':
        fit_kwargs['categorical_feature'] = [39]

    oof_train = np.zeros((X_train.shape[0], _N_CLASS))  # out-of-fold predictions for the training rows
    oof_test = np.zeros((X_test.shape[0], _N_CLASS))    # running sum of test predictions

    for train_index, test_index in kf.split(X_train):
        kf_X_train = X_train.iloc[train_index]
        kf_y_train = y_train.iloc[train_index]
        kf_X_test = X_train.iloc[test_index]
        kf_y_test = y_train.iloc[test_index]

        if clfname == 'rf':
            # The random forest is fitted without an eval set or early stopping.
            clf.fit(kf_X_train, kf_y_train)
        else:
            clf.fit(kf_X_train, kf_y_train, eval_set=[(kf_X_test, kf_y_test)], **fit_kwargs)

        oof_train[test_index] = clf.predict_proba(kf_X_test)
        oof_test += np.around(clf.predict_proba(X_test), 6)

    # Average over the actual number of folds instead of a hard-coded 5.
    oof_test /= _N_FOLDS
    return oof_train, oof_test

# Bind the notebook-level split to the names used by the stacking code.
X_train = x_train_1
y_train = y_train_1
X_test = x_test_1

# First layer: run every base learner through get_oof and keep only the second
# probability column of each as a meta-feature.
oof_train_cols, oof_test_cols = {}, {}
for clfname in ['lgbm', 'cat', 'rf']:
    oof_train, oof_test = get_oof(clfname, X_train, y_train, X_test)
    oof_train_cols[clfname + '1'] = oof_train[:, 1]
    oof_test_cols[clfname + '1'] = oof_test[:, 1]

new_train = pd.DataFrame(oof_train_cols)
new_test = pd.DataFrame(oof_test_cols)

# Second layer: tune a random-forest meta-learner on the new features.
x_train4, y_train4 = new_train, y_train

# Settings held fixed during the search. study.best_params only contains the
# suggested values, so these are passed again to the final meta-learner.
fixed_meta_params = {"class_weight": "balanced", "random_state": 42}


def objective(trial):
    """Return the mean balanced log loss of a random forest over 5 stratified folds."""
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        **fixed_meta_params,
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=72)
    scores = []
    # 5-fold cross-validation on the meta-features drives the search.
    for train_index, test_index in skf.split(x_train4, y_train4):
        x_train5, x_test5 = x_train4.iloc[train_index], x_train4.iloc[test_index]
        y_train5, y_test5 = y_train4.iloc[train_index], y_train4.iloc[test_index]

        model = RandomForestClassifier(**param_grid)
        model.fit(x_train5, y_train5)
        preds = model.predict_proba(x_test5)[:, 1]
        score = balanced_logarithmic_loss_new(preds, y_test5)
        print(score)
        scores.append(score)

    return np.mean(scores)


study = optuna.create_study(study_name='stacking_study_rf', direction="minimize")
study.optimize(objective, n_trials=200)
best_params = study.best_params

print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

# Time the construction and fit of the final meta-learner.
start_time = time.time()

# Final model: searched values plus the same fixed settings used in every trial.
stacking_model = RandomForestClassifier(**best_params, **fixed_meta_params)
stacking_model.fit(new_train, y_train)
training_time = time.time() - start_time

print(f"training time is {training_time}")

# Score the stacked model on the hold-out meta-features.
prediction = stacking_model.predict_proba(new_test)[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)



### 3.2.4 基学习器：LGBM, CatBoost, XGBoost 元学习器：RF

In [None]:
_N_FOLDS = 5
_N_CLASS = 2
kf = KFold(n_splits=_N_FOLDS, shuffle=True, random_state=68)


def get_oof(clfname, X_train, y_train, X_test):
    """Build first-layer stacking features for one base learner.

    The learner is refitted on each split produced by the module-level ``kf``.
    Each training row receives the class probabilities predicted by the fit
    that held it out, and the test-set probabilities are averaged over folds.
    The held-out fold is also the early-stopping ``eval_set``, so it does
    influence the number of boosting rounds used to predict it.

    Args:
        clfname: Base learner key: 'xgb', 'cat' or 'lgbm'.
        X_train: Training features; rows are selected positionally via ``.iloc``.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features of the hold-out set.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays with shapes
        ``(X_train.shape[0], _N_CLASS)`` and ``(X_test.shape[0], _N_CLASS)``.

    Raises:
        ValueError: If ``clfname`` is not one of the supported keys.
    """
    if clfname == 'xgb':
        bestparams_xgb = {
            'n_estimators': 19808,
            'learning_rate': 0.29003009973815874,
            'max_depth': 11,
            'gamma': 0.3584743148039589,
            'subsample': 0.7,
            'colsample_bytree': 0.6874234399659787,
            'reg_alpha': 0.14303443256590267,
            'reg_lambda': 0.30450443282388806,
            'min_child_weight': 2.78622578591918,
            'max_delta_step': 6,
            'alpha': 0.43245457551855127,
        }
        clf = xgb.XGBClassifier(**bestparams_xgb, feval=balanced_logarithmic_loss_new)
    elif clfname == 'cat':
        bestparams_cat = {
            'bagging_temperature': 7.442784574066854,
            'border_count': 212,
            'depth': 3,
            'iterations': 689,
            'l2_leaf_reg': 0.8262450118748192,
            'learning_rate': 0.09468235278046022,
            'random_strength': 3.098327157242888
        }
        clf = CatBoostClassifier(**bestparams_cat, auto_class_weights='Balanced')
    elif clfname == 'lgbm':
        bestparams_lgbm = {
            'bagging_fraction': 0.9,
            'bagging_freq': 1,
            'colsample_bytree': 0.8755453466152356,
            'feature_fraction': 0.6000000000000001,
            'learning_rate': 0.2941129864759795,
            'max_depth': 4,
            'min_child_samples': 85,
            'n_estimators': 22976,
            'num_leaves': 2870,
            'reg_alpha': 0.515407087957732,
            'reg_lambda': 0.26493493680587554
        }
        clf = lgb.LGBMClassifier(**bestparams_lgbm, class_weight='balanced')
    else:
        # Previously an unknown key fell through and failed later on an unbound `clf`.
        raise ValueError(f"unknown base learner: {clfname!r}")

    # Only LightGBM is told which column is categorical.
    fit_kwargs = {'early_stopping_rounds': 200}
    if clfname == 'lgbm':
        fit_kwargs['categorical_feature'] = [39]

    oof_train = np.zeros((X_train.shape[0], _N_CLASS))  # out-of-fold predictions for the training rows
    oof_test = np.zeros((X_test.shape[0], _N_CLASS))    # running sum of test predictions

    for train_index, test_index in kf.split(X_train):
        kf_X_train = X_train.iloc[train_index]
        kf_y_train = y_train.iloc[train_index]
        kf_X_test = X_train.iloc[test_index]
        kf_y_test = y_train.iloc[test_index]

        clf.fit(kf_X_train, kf_y_train, eval_set=[(kf_X_test, kf_y_test)], **fit_kwargs)

        oof_train[test_index] = clf.predict_proba(kf_X_test)
        oof_test += np.around(clf.predict_proba(X_test), 6)

    # Average over the actual number of folds instead of a hard-coded 5.
    oof_test /= _N_FOLDS
    return oof_train, oof_test

# 将数据换成你的数据
X_train = x_train_1
y_train = y_train_1
X_test = x_test_1

# 将你的每个分类器都调用get_oof函数，并把它们的结果合并，就得到了新的训练和测试数据new_train,new_test
new_train, new_test = [], []
for clfname in ['lgbm','cat','xgb']:
    oof_train, oof_test = get_oof(clfname, X_train, y_train, X_test)
    new_train.append(oof_train)
    new_test.append(oof_test)

new_train = np.concatenate(new_train, axis=1)
new_test = np.concatenate(new_test, axis=1)
new_train = pd.DataFrame(new_train)
new_test = pd.DataFrame(new_test)
new_train.columns = ['lgbm0','lgbm1','cat0','cat1','xgb0','xgb1']
new_test.columns = ['lgbm0','lgbm1','cat0','cat1','xgb0','xgb1']
new_train = new_train[['lgbm1','cat1','xgb1']]
new_test = new_test[['lgbm1','cat1','xgb1']]
new_train.columns = ['LGBM','CatBoost','XGB']


# Second stacking layer: the meta-features in new_train are the input of a new model.

x_train4, y_train4 = new_train, y_train


def objective(trial):
    """Optuna objective: mean 5-fold balanced log loss of a random forest.

    The forest is trained on the meta-features ``x_train4`` / ``y_train4``
    with hyper-parameters sampled from ``trial``. Each fold's loss is printed,
    and the mean over the folds is returned.
    """
    rf_params = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        # Fixed settings, not part of the search space.
        class_weight="balanced",
        random_state=42,
    )

    # 5-fold stratified cross-validation on the training meta-features.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=72)
    fold_losses = []
    for fit_idx, val_idx in splitter.split(x_train4, y_train4):
        forest = RandomForestClassifier(**rf_params)
        forest.fit(x_train4.iloc[fit_idx], y_train4.iloc[fit_idx])

        # Probability of class 1 on the held-out fold.
        val_proba = forest.predict_proba(x_train4.iloc[val_idx])[:, 1]
        fold_loss = balanced_logarithmic_loss_new(val_proba, y_train4.iloc[val_idx])
        print(fold_loss)
        fold_losses.append(fold_loss)

    return np.mean(fold_losses)
# Tune the second-level random forest by minimising the objective.
study = optuna.create_study(study_name='stacking_study_rf', direction="minimize")
study.optimize(objective, n_trials=200)
best_params = study.best_params

print("  Params: ")
for key, value in best_params.items():
    print(f"    {key}: {value}")

# Record the start time.
start_time = time.time()

# study.best_params only contains the values sampled through trial.suggest_*.
# The fixed settings used inside the objective (class_weight / random_state)
# are passed again here so the final model matches the one that was tuned.
stacking_model = RandomForestClassifier(
    **best_params,
    class_weight="balanced",
    random_state=42,
)
stacking_model.fit(new_train, y_train)

# Elapsed training time.
training_time = time.time() - start_time

print(f"training time is {training_time}")
# Class-1 probabilities on the held-out meta-features and their balanced log loss.
prediction = stacking_model.predict_proba(new_test)[:, 1]
score_test = balanced_logarithmic_loss_new(prediction, y_test_1)

## 3.3 (LGBM, CatBoost, XGBoost + RF)Stacking模型效果分析

In [None]:
import shap

# SHAP tree explainer for the fitted second-level model.
explainer = shap.TreeExplainer(stacking_model)
# Base value(s) of the explainer (bare expression, kept as in the original cell).
explainer.expected_value
# Mean predicted class-1 probability over the training meta-features.
train_class1_proba = stacking_model.predict_proba(new_train)[:, 1]
np.mean(train_class1_proba)

In [None]:
import matplotlib.pyplot as plt

# Feature importance from SHAP values, drawn as a bar chart.
# NOTE(review): indexing with [1] assumes shap_values is a per-class list — confirm for the installed shap version.
shap_values = explainer.shap_values(new_train)
class1_shap_values = shap_values[1]
shap.summary_plot(class1_shap_values, new_train, plot_type="bar")

In [None]:
# Feature importance: permutation importance of each meta-feature.
from sklearn.inspection import permutation_importance

perimp = permutation_importance(
    stacking_model,
    new_train,
    y_train,
    n_repeats=10,
    random_state=1,
)
# Mean importance per feature, labelled with the meta-feature names.
perm_importance = pd.Series(
    perimp.importances_mean,
    index=new_train.columns,
    name='Var',
)
# Horizontal bar chart, sorted by ascending importance.
sorted_importance = perm_importance.sort_values()
sorted_importance.plot(kind='barh')
plt.show(block=True)

In [None]:
# Partial dependence plots for the two most important meta-features.
from sklearn.inspection import PartialDependenceDisplay

# new_train's columns were renamed to 'LGBM' / 'CatBoost' / 'XGB', so the
# features must be referenced by those names; the earlier 'cat1' / 'lgbm1'
# labels no longer exist in the frame.
disp1 = PartialDependenceDisplay.from_estimator(
    stacking_model,
    new_train,
    features=["CatBoost", "LGBM"],
    method="brute",
)
plt.show(block=True)


In [None]:
# Force plot for a single sample (the first row of the class-1 SHAP values).
class1_base_value = explainer.expected_value[1]
first_sample_shap = shap_values[1][0, :]
shap.force_plot(
    class1_base_value,
    first_sample_shap,
    feature_names=new_train.columns,
    matplotlib=True,
)
plt.show(block=True)

In [None]:
# Effect of every meta-feature on the prediction (SHAP summary plot).
# summary_plot has no `matplotlib` keyword (that option belongs to force_plot),
# so it is not passed here. show=False defers rendering to the explicit
# plt.show below instead of showing the figure inside summary_plot.
shap.summary_plot(shap_values[1], new_train, show=False)
plt.show(block=True)

# 4.基于Voting的模型融合构建

In [None]:
# Keyword arguments unpacked into CatBoostClassifier for the voting ensemble.
catboost_paras = dict(
    bagging_temperature=7.442784574066854,
    border_count=212,
    depth=3,
    iterations=689,
    l2_leaf_reg=0.8262450118748192,
    learning_rate=0.09468235278046022,
    random_strength=3.098327157242888,
)


# Keyword arguments unpacked into the XGBoost classifier for the voting ensemble.
# NOTE(review): both 'reg_alpha' and 'alpha' are set with different values;
# XGBoost documents them as aliases — confirm which one is intended.
xgb_paras = dict(
    n_estimators=19808,
    learning_rate=0.29003009973815874,
    max_depth=11,
    gamma=0.3584743148039589,
    subsample=0.7,
    colsample_bytree=0.6874234399659787,
    reg_alpha=0.14303443256590267,
    reg_lambda=0.30450443282388806,
    min_child_weight=2.78622578591918,
    max_delta_step=6,
    alpha=0.43245457551855127,
)

# Keyword arguments unpacked into the LightGBM classifier for the voting ensemble.
# NOTE(review): 'colsample_bytree' and 'feature_fraction' are both set with
# different values; LightGBM documents them as aliases — confirm which is intended.
lgb_paras = dict(
    bagging_fraction=0.9,
    bagging_freq=1,
    colsample_bytree=0.8755453466152356,
    feature_fraction=0.6000000000000001,
    learning_rate=0.2941129864759795,
    max_depth=4,
    min_child_samples=85,
    n_estimators=22976,
    num_leaves=2870,
    reg_alpha=0.515407087957732,
    reg_lambda=0.26493493680587554,
)

In [None]:
# (name, estimator) pairs used as the members of the voting ensemble.
models = []
models.append(
    ('catboost', CatBoostClassifier(auto_class_weights='Balanced', **catboost_paras))
)
# NOTE(review): `feval` is forwarded to XGBClassifier as an extra keyword —
# confirm the estimator actually uses it as an evaluation metric.
models.append(
    ('xgb', xgb.XGBClassifier(feval=balanced_logarithmic_loss_new, **xgb_paras))
)
models.append(
    ('lgb', lgb.LGBMClassifier(class_weight='balanced', **lgb_paras))
)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft voting over the (name, estimator) pairs in `models`.
voting_model = VotingClassifier(estimators=models, voting='soft')

In [None]:
# Fit the voting ensemble on the training split and score it on the held-out split.
voting_model.fit(x_train_1, y_train_1)
val_preds = voting_model.predict_proba(x_test_1)
# Balanced log loss of the class-1 probabilities against the held-out labels.
val_class1_proba = val_preds[:, 1]
val_score = balanced_logarithmic_loss_new(val_class1_proba, y_test_1)
val_score

In [None]:
# Re-score the already fitted voting ensemble on the held-out split.
val_preds = voting_model.predict_proba(x_test_1)
val_class1_proba = val_preds[:, 1]
val_score = balanced_logarithmic_loss_new(val_class1_proba, y_test_1)
val_score

# 5.输出比赛结果

In [None]:
# Class probabilities (one column per class) from the stacking model.
prediction = stacking_model.predict_proba(new_test)

# Build the submission with the same columns as the sample submission.
submission_stacking = pd.DataFrame(columns=submission_df.columns)
submission_stacking['Id'] = submission_df['Id']
# Write the stacking model's own predictions computed above; the cell
# previously assigned `pred_vote`, leaving `prediction` unused.
# NOTE(review): new_test must have one row per Id in submission_df — confirm
# that it was built from the competition test set.
submission_stacking[['class_0', 'class_1']] = prediction
# index=False keeps the file to the submission columns only (no index column).
submission_stacking.to_csv("submission_stacking.csv", index=False)