In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
from scipy.stats import spearmanr

# Load the raw dataset (GBK-encoded CSV). The last column is used as the target.
df = pd.read_csv(r'/kaggle/input/dataset/data.csv', encoding='gbk')

# Fill missing values with each column's median.
# SimpleImputer discards columns that contain no observed value at all, which would
# make the transformed array narrower than `df.columns`; drop such columns first so
# the column labels always line up with the imputed data.
# NOTE(review): the imputer is fitted on all rows and on the target column too,
# i.e. before any train/validation split — confirm this leakage is acceptable.
df_observed = df.dropna(axis=1, how='all')
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df_observed), columns=df_observed.columns)

# Rank every feature by |Spearman rho| against the target (last column).
target = df_imputed[df_imputed.columns[-1]]
correlations = {}
for feature in df_imputed.columns[:-1]:
    column = df_imputed[feature]

    # A constant column has no rank variation, so its correlation is undefined.
    if column.nunique(dropna=False) == 1:
        print(f"Skipping constant feature: {feature}")
        continue

    corr, _ = spearmanr(column, target, nan_policy='omit')

    # A NaN score (e.g. constant target) would make the sort order below undefined.
    if pd.isna(corr):
        print(f"Skipping feature with undefined correlation: {feature}")
        continue

    correlations[feature] = abs(corr)

# Sort by absolute correlation, strongest first, and keep the 15 best features.
sorted_correlations = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
top_15_features = sorted_correlations[:15]

# NOTE(review): the heading mentions collinearity, but this cell only ranks features
# by their correlation with the target; no collinearity filtering is performed.
print("Top 15 features without significant collinearity:")
for feature, corr in top_15_features:
    print(f"{feature}: {corr}")

# First import of numpy in the notebook; the following cells rely on `np`.
import numpy as np

# Names of the 15 selected features, in ranking order.
selected_features = [f[0] for f in top_15_features]

In [None]:
# Feature matrix: the selected columns of the imputed frame as a float32 ndarray.
X = df_imputed[selected_features].to_numpy().astype(np.float32)

# Target: last column of the raw (un-imputed) frame as a float32 column vector.
y = df.iloc[:, -1].to_numpy().astype(np.float32).reshape(-1, 1)

# Report the shapes of both ndarrays.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split

# Min-max scale every feature column to [0, 1].
# NOTE(review): the min/max are taken over all rows, including those that end up in
# the validation split — confirm this leakage is acceptable.
min_vals = np.min(X, axis=0)
max_vals = np.max(X, axis=0)
denominator = max_vals - min_vals
denominator[denominator == 0] = 1  # constant columns: avoid 0/0, they scale to 0
normalized_X = (X - min_vals) / denominator

# 70/30 train/validation split. The split is seeded like every other random step in
# this notebook so that metrics and saved models are reproducible, and stratified
# so both parts keep the class ratio of the target.
X_train, X_val, y_train, y_val = train_test_split(
    normalized_X, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the minority class of the training part only, up to a
# minority/majority ratio of 0.3; the validation part is left untouched.
smote = SMOTE(k_neighbors=5, random_state=42, sampling_strategy=0.3)
trainX, trainY = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM; class 1 is weighted six times as heavily as class 0.
svmModel = SVC(
    kernel='rbf',
    gamma='auto',
    class_weight={0: 1, 1: 6},
)
svmModel.fit(trainX, trainY)

# Score on the validation split: accuracy first, then recall of class 1.
svmLabels = svmModel.predict(X_val)
accuracy = accuracy_score(y_val, svmLabels)
print('SVM :', accuracy)
recall = recall_score(y_val, svmLabels, pos_label=1)
print(':', recall)

In [None]:
from joblib import dump
# Persist the fitted SVM to 'svmModel.joblib' in the working directory.
dump(svmModel, 'svmModel.joblib')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score

# Weak learner: a decision tree capped at 10 leaves.
t = DecisionTreeClassifier(max_leaf_nodes=10, random_state=42)

# AdaBoost (SAMME) over 100 copies of that tree.
boostedTreeModel = AdaBoostClassifier(estimator=t, n_estimators=100,
                                      algorithm='SAMME', random_state=42)

# Per-sample weights: 1 for label 0, 10 for every other label.
sample_weight = [10 if label != 0 else 1 for label in trainY]
boostedTreeModel.fit(trainX, trainY, sample_weight=sample_weight)

# Score on the validation split.
boostedTreeLabels = boostedTreeModel.predict(X_val)
accuracy = accuracy_score(y_val, boostedTreeLabels)
print('Boosted Tree Accuracy:', accuracy)
recall = recall_score(y_val, boostedTreeLabels, pos_label=1)
print('Recall:', recall)

In [None]:
from joblib import dump
# Persist the fitted AdaBoost-over-trees model to 'boostedTreeModel.joblib'.
dump(boostedTreeModel, 'boostedTreeModel.joblib')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class 1 weighted five times as heavily as class 0.
class_weights = {0: 1, 1: 5}
logistic_model = LogisticRegression(
    class_weight=class_weights,
    random_state=42,
)
logistic_model.fit(trainX, trainY)

# Score on the validation split.
logistic_labels = logistic_model.predict(X_val)
accuracy = accuracy_score(y_val, logistic_labels)
print('Logistic Regression Accuracy:', accuracy)
recall = recall_score(y_val, logistic_labels, pos_label=1)
print('Recall:', recall)

In [None]:
from joblib import dump
# Persist the fitted logistic-regression model to 'logistic_model.joblib'.
dump(logistic_model, 'logistic_model.joblib')

In [None]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, recall_score

# Class weights are emulated by row replication: every class-1 training row appears
# class_weights[1] times in total, every other row exactly once.
class_weights = {0: 1, 1: 5}

# np.repeat with a per-row count keeps the rows in their original order, with the
# copies of a row placed directly after it.
labels = np.asarray(trainY).ravel()
repeats = np.where(labels == 1, class_weights[1], 1)
trainX_weighted = np.repeat(np.asarray(trainX), repeats, axis=0)
trainY_weighted = np.repeat(labels, repeats)

# Fit LDA on the replicated training set.
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(trainX_weighted, trainY_weighted)

# Score on the validation split.
lda_labels = lda_model.predict(X_val)
accuracy = accuracy_score(y_val, lda_labels)
print('Fisher Linear Discriminant Analysis Accuracy:', accuracy)
recall = recall_score(y_val, lda_labels, pos_label=1)
print('Recall:', recall)

In [None]:
from joblib import dump
# Persist the fitted LDA model to 'lda_model.joblib'.
dump(lda_model, 'lda_model.joblib')

In [None]:
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score

# Weight applied to each training sample, keyed by class: label 0 -> 1, others -> 100.
class_weight = {0: 1, 1: 100}

# Random forest of 100 trees with a fixed seed.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Derive the per-sample weights from `class_weight` so the value lives in one place.
sample_weight = [class_weight[0] if label == 0 else class_weight[1] for label in trainY]

# Fit on the resampled training data.
clf.fit(trainX, trainY, sample_weight=sample_weight)

# Predict the validation split.
predictions = clf.predict(X_val)

# Accuracy on the validation split.
accuracy = accuracy_score(y_val, predictions)
print("Accuracy:", accuracy)

# Recall of class 1 on the validation split.
recall = recall_score(y_val, predictions, pos_label=1)
print(':', recall)

# Persist the fitted forest to 'random_forest_model.joblib'.
dump(clf, 'random_forest_model.joblib')

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score
from joblib import dump

# Per-class sample weights: class 1 counts five times as much as class 0.
class_weights = {0: 1., 1: 5.}

# AdaBoost (SAMME) with 60 rounds and the library's default base estimator.
boosted_model = AdaBoostClassifier(
    n_estimators=60,
    algorithm='SAMME',
    random_state=42
)

# Fit with one weight per training sample, looked up from its label.
boosted_model.fit(
    trainX,
    trainY,
    sample_weight=[class_weights[yi] for yi in trainY],
)

# Predict the validation split.
boosted_labels = boosted_model.predict(X_val)

# Accuracy and class-1 recall on the validation split.
accuracy = accuracy_score(y_val, boosted_labels)
recall = recall_score(y_val, boosted_labels, pos_label=1)
print('Boosted Tree Accuracy:', accuracy)
print('Recall:', recall)

# Persist the fitted model to 'boosted_model.joblib'.
dump(boosted_model, 'boosted_model.joblib')

In [None]:
from keras import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation

# Two hidden blocks (Dense -> BatchNorm -> ReLU -> Dropout 0.3) of 512 and 256 units,
# followed by a single sigmoid output unit.
# A third hidden block (Dense(64) -> BatchNorm -> ReLU) was tried and is left disabled.
model = Sequential([
    Dense(512),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.3),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.3),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train for 100 epochs; the loss on class 1 is weighted five times that of class 0.
class_weights = {0: 1, 1: 5}
history = model.fit(
    trainX,
    trainY,
    epochs=100,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
from sklearn.metrics import accuracy_score, recall_score

# Sigmoid outputs for the validation split, thresholded at 0.5 into 0/1 labels.
predictions = model.predict(X_val)
above_threshold = predictions > 0.5
y_pred_classes = above_threshold.astype(int)

# Accuracy on the validation split.
accuracy = accuracy_score(y_val, y_pred_classes)
print(f"Accuracy: {accuracy}")

# Recall of the positive class (label 1).
recall = recall_score(y_val, y_pred_classes)
print(f"Recall: {recall}")

In [None]:
# Save the trained Keras network to 'model.keras' in the working directory.
model.save('model.keras')

In [None]:
from joblib import load
from keras.models import load_model

# Reload the six persisted scikit-learn models and the Keras network.
model1 = load('boostedTreeModel.joblib')
model2 = load('lda_model.joblib')
model3 = load('boosted_model.joblib')
model4 = load('logistic_model.joblib')
model5 = load('svmModel.joblib')
model6 = load('random_forest_model.joblib')
model7 = load_model('model.keras')

# One 0/1 vote array per model: every prediction on the validation split is
# thresholded at 0.5.
ensemble_members = (model1, model2, model3, model4, model5, model6, model7)
(predictions1, predictions2, predictions3, predictions4,
 predictions5, predictions6, predictions7) = [
    np.where(member.predict(X_val) > 0.5, 1, 0) for member in ensemble_members
]

# Voting weight of each model, in the same order as model1..model7.
weights = [0.3, 0.1, 0.5, 0.1, 0.2, 0.1, 1]

# Weighted votes of the six scikit-learn models.
ans0, ans1, ans2, ans3, ans4, ans5 = (
    weight * votes
    for weight, votes in zip(
        weights,
        (predictions1, predictions2, predictions3,
         predictions4, predictions5, predictions6),
    )
)
# The Keras votes are flattened to 1-D so they can be added to the others.
ans6 = (weights[6] * predictions7).reshape(-1,)

# Print the shape of every weighted vote array.
for weighted_votes_i in (ans0, ans1, ans2, ans3, ans4, ans5, ans6):
    print(weighted_votes_i.shape)

In [None]:
# Weighted average of the seven vote arrays: their element-wise sum divided by the
# total of the weights.
weighted_votes = [ans0, ans1, ans2, ans3, ans4, ans5, ans6]
ans = sum(weighted_votes) / np.sum(weights)

In [None]:
# Threshold the weighted average at 0.5 to obtain the ensemble's 0/1 labels.
ensemble_positive = ans > 0.5
binary_predictions = ensemble_positive.astype(int)

# Show the ensemble labels.
print("Binary Predictions:", binary_predictions)

# Accuracy and recall of the ensemble on the validation split.
accuracy = accuracy_score(y_val, binary_predictions)
print("Weighted Classification Accuracy:", accuracy)
recall = recall_score(y_val, binary_predictions)
print("Weighted Classification Recall:", recall)