In [1]:
import re
import math
import warnings
import tqdm
import matplotlib
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
from sympy import symbols, Eq, solve
from matplotlib.pyplot import MultipleLocator
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings from the libraries imported above; consider narrowing the filter.
warnings.filterwarnings("ignore")


%matplotlib inline
# Use the SimHei font for sans-serif text so Chinese labels render in figures.
plt.rcParams['font.sans-serif'] = ['SimHei'] 
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus sign,
# which CJK fonts such as SimHei may not provide.
plt.rcParams['axes.unicode_minus'] = False 

In [2]:
# Load the source workbooks. Most of them are read from absolute paths under
# /root/GDUT, while '1b数据.xlsx' is resolved against the working directory.
# NOTE(review): the hard-coded absolute paths tie this cell to one machine.
data1 = pd.read_excel('/root/GDUT/表1-患者列表及临床信息.xlsx')
data2 = pd.read_excel('/root/GDUT/表2-患者影像信息血肿及水肿的体积及位置.xlsx')
data3 = pd.read_excel('/root/GDUT/表3-患者影像信息血肿及水肿的形状及灰度分布.xlsx')
data4 = pd.read_excel('/root/GDUT/表4-答案文件.xlsx')
data5 = pd.read_excel('1b数据.xlsx')

# Give the first column of tables 1 and 2 the shared name "ID".
data1 = data1.rename(columns={data1.columns[0]: "ID"})
data2 = data2.rename(columns={data2.columns[0]: "ID"})

# Lookup sheets read alongside the main tables.
data_f_1 = pd.read_excel('/root/GDUT/附表1-检索表格-流水号vs时间.xlsx')
data_kz = pd.read_excel('/root/GDUT/q11/扩张及时间.xlsx')

In [3]:
# Turn the categorical / free-text inputs into 0/1 indicator columns.
def _bp_status(reading):
    """Map a 'systolic/diastolic' string to 1 if both values are in range, else 0.

    The accepted ranges are 90-139 for the first number and 60-89 for the
    second. The second number is only parsed when the first one is in range.
    """
    parts = reading.split('/')
    systolic_text, diastolic_text = parts[0], parts[1]
    if not 90 <= int(systolic_text) <= 139:
        return 0
    return 1 if 60 <= int(diastolic_text) <= 89 else 0


# Blood pressure: replace the text reading with the in-range flag.
xueya_list = [_bp_status(reading) for reading in data5['血压']]
data5['血压'] = np.array(xueya_list)

# Sex: one indicator column per value.
data5['性别_男'] = (data5['性别'] == '男').astype('int64')
data5['性别_女'] = (data5['性别'] == '女').astype('int64')

# Age: min-max scale to [0, 1], then binarise at 0.5 (values above 0.5 -> 1).
x = data5['年龄'].values
scaler = MinMaxScaler()
binarizer = Binarizer(threshold=0.5)
x_normalized = scaler.fit_transform(x.reshape(-1, 1))
x_binarized = binarizer.fit_transform(x_normalized)
data5['年龄'] = x_binarized

# The raw sex column and the serial-number column are no longer needed.
data5 = data5.drop(columns=['性别', '流水号'])
data5.head(100)

Unnamed: 0,年龄,脑出血前mRS评分,高血压病史,卒中病史,糖尿病史,房颤史,冠心病史,吸烟史,饮酒史,发病到首次影像检查时间间隔,...,NCCT_original_firstorder_Median.Hemo,NCCT_original_firstorder_Minimum.Hemo,NCCT_original_firstorder_Range.Hemo,NCCT_original_firstorder_RobustMeanAbsoluteDeviation.Hemo,NCCT_original_firstorder_RootMeanSquared.Hemo,NCCT_original_firstorder_Skewness.Hemo,NCCT_original_firstorder_Uniformity.Hemo,NCCT_original_firstorder_Variance.Hemo,性别_男,性别_女
0,0.0,0,0,0,0,0,0,0,0,2.5,...,180.997857,86.200064,161.239687,22.759625,175.551025,-0.523096,0.089596,1225.449943,0,1
1,0.0,0,1,0,0,0,0,0,0,3.0,...,137.569328,69.446385,152.848311,26.758880,145.014063,0.113439,0.074774,1394.452289,1,0
2,1.0,0,1,0,0,0,0,0,0,2.0,...,133.217856,61.683739,164.933560,19.037080,135.747065,0.019111,0.098694,812.571273,1,0
3,1.0,0,1,0,0,0,0,0,0,2.0,...,133.217856,61.683739,164.933560,19.037080,135.747065,0.019111,0.098694,812.571273,1,0
4,1.0,2,1,1,0,0,0,0,0,1.0,...,124.293778,59.981668,230.164456,21.618937,130.690592,0.323449,0.123988,1061.812328,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0,1,0,0,0,0,0,0,1.0,...,133.706052,82.130651,133.527068,18.267232,138.197389,0.231509,0.083999,760.356839,0,1
96,1.0,0,0,0,0,0,0,0,0,1.0,...,142.643719,78.215613,146.557579,22.871665,145.937323,0.008035,0.080505,1056.837094,0,1
97,1.0,0,1,0,0,0,0,0,0,3.0,...,162.653881,82.822093,169.854351,29.985000,170.381262,0.062903,0.078045,1702.945541,0,1
98,1.0,0,1,1,1,1,1,1,0,2.5,...,157.553169,64.587076,157.060233,16.909283,154.139448,-0.589568,0.104148,814.067313,1,0


In [4]:
# Min-max normalisation: rescale each selected column to [0, 1].
# The selection keeps every column whose label is not '性别'.
# NOTE(review): '性别' was already dropped when the sex indicators were built,
# so this filter presumably selects all columns; the original note spoke of
# excluding '年龄' instead. Confirm which exclusion was intended.
scaler = MinMaxScaler()
columns_to_normalize = data5.columns[data5.columns != '性别'].tolist()
data5[columns_to_normalize] = scaler.fit_transform(data5[columns_to_normalize])

In [5]:
# Attach the '是否扩张' column from data_kz to the first 100 rows of data5,
# pairing rows by index (inner join).
data6 = data5.head(100).merge(
    data_kz['是否扩张'],
    how='inner',
    left_index=True,
    right_index=True,
)
data6

Unnamed: 0,年龄,脑出血前mRS评分,高血压病史,卒中病史,糖尿病史,房颤史,冠心病史,吸烟史,饮酒史,发病到首次影像检查时间间隔,...,NCCT_original_firstorder_Minimum.Hemo,NCCT_original_firstorder_Range.Hemo,NCCT_original_firstorder_RobustMeanAbsoluteDeviation.Hemo,NCCT_original_firstorder_RootMeanSquared.Hemo,NCCT_original_firstorder_Skewness.Hemo,NCCT_original_firstorder_Uniformity.Hemo,NCCT_original_firstorder_Variance.Hemo,性别_男,性别_女,是否扩张
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094737,...,0.580976,0.461355,0.596541,0.747897,0.073091,0.233654,0.575522,0.0,1.0,0
1,0.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115789,...,0.302489,0.415018,0.784219,0.401521,0.432276,0.062318,0.678884,1.0,0.0,0
2,1.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073684,...,0.173455,0.481753,0.421848,0.296407,0.379049,0.338820,0.323005,1.0,0.0,1
3,1.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073684,...,0.173455,0.481753,0.421848,0.296407,0.379049,0.338820,0.323005,1.0,0.0,0
4,1.0,0.666667,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.031579,...,0.145162,0.841963,0.543010,0.239052,0.550780,0.631206,0.475441,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031579,...,0.513332,0.308324,0.385720,0.324200,0.498901,0.168950,0.291070,0.0,1.0,0
96,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031579,...,0.448255,0.380280,0.601798,0.411993,0.372799,0.128560,0.472398,0.0,1.0,0
97,1.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115789,...,0.524826,0.508926,0.935615,0.689257,0.403759,0.100130,0.867558,0.0,1.0,0
98,1.0,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.094737,...,0.221715,0.438276,0.321994,0.505029,0.035582,0.401865,0.323920,1.0,0.0,1


In [6]:
# Separate the target column from the feature columns.
y = data6['是否扩张']
X = data6.drop('是否扩张', axis=1)

In [7]:
# Hold out 20% of the rows as a test set; the seed is fixed for a repeatable split.
# Returned in order: training features, test features, training labels, test labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Candidate hyper-parameter values explored by the grid search.
param_grid = dict(
    n_estimators=list(range(50, 201, 50)),
    max_depth=[None, 10, 20, 30],
    min_samples_split=list(range(2, 11, 2)),
    min_samples_leaf=[1, 2, 4],
)

# Base random-forest classifier with a fixed seed.
rf_classifier = RandomForestClassifier(random_state=0)

In [None]:
# Random forest tuned by grid search: 5-fold cross-validation, 3 parallel jobs.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=3)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"训练集最佳超参数组合: {best_params}")

# With the default refit=True, best_estimator_ is the winning configuration
# refit on the whole training split.
best_model_rf1 = grid_search.best_estimator_

# Training accuracy has to be scored on X_train. Predicting on the full X
# yields more predictions than y_train has labels, and accuracy_score then
# raises ValueError for inconsistent sample counts.
y_pred = best_model_rf1.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"训练集最佳模型的准确率: {accuracy:.4f}")

# Accuracy on the held-out test split.
y_pred = best_model_rf1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集最佳模型的准确率: {accuracy:.4f}")

In [None]:
# Random forest tuned by grid search with leave-one-out cross-validation
# (one fold per training row), 3 parallel jobs.
loo = LeaveOneOut()
grid_search = GridSearchCV(rf_classifier, param_grid, cv=loo, scoring='accuracy', n_jobs=3)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"训练集最佳超参数组合: {best_params}")

# With the default refit=True, best_estimator_ is the winning configuration
# refit on the whole training split.
best_model_rf2 = grid_search.best_estimator_

# Training accuracy has to be scored on X_train. Predicting on the full X
# yields more predictions than y_train has labels, and accuracy_score then
# raises ValueError for inconsistent sample counts.
y_pred = best_model_rf2.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"训练集最佳模型的准确率: {accuracy:.4f}")

# Accuracy on the held-out test split.
y_pred = best_model_rf2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"测试集最佳模型的准确率: {accuracy:.4f}")

In [None]:
# # XGBoost + 留一交叉验证 + 网格搜索 + 4进程
# warnings.filterwarnings("ignore", category=FutureWarning)
# xgb_classifier = xgb.XGBClassifier(random_state=0)

# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'min_child_weight': [1, 2, 3]
# }


# loo = LeaveOneOut()

# grid_search = GridSearchCV(xgb_classifier, param_grid, cv=loo, scoring='accuracy', n_jobs=4)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# print(f"最佳超参数组合: {best_params}")

# best_model_xgb = grid_search.best_estimator_
# y_pred = best_model_xgb.predict(X)
# accuracy = accuracy_score(y, y_pred)
# print(f"最佳模型的准确率: {accuracy:.4f}")

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif 

In [None]:
# 测试集预测
# probabilities = best_model.predict_proba(data5.tail(len(data5) - 100))