In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # NumPy numerical toolkit
import pandas as pd # pandas data-processing toolkit

# Location of the heart-disease CSV, relative to the notebook's working directory.
heart_csv_path = "../input/heartdataset/heart.csv"
df_heart = pd.read_csv(heart_csv_path)  # load the file into a DataFrame
df_heart.head()  # display the first 5 rows

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # seaborn plotting toolkit

# Count plot of the "target" column of df_heart, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df_heart, x="target", palette="bwr", ax=ax)
plt.show()

In [None]:
# One-hot encode the categorical features "cp", "thal" and "slope" into dummy
# columns (cp_*, thal_*, slope_*). Passing `columns=` makes pandas drop the
# source columns and append the dummies after the remaining columns, in the
# order listed, so no manual concat/drop step is needed.
# NOTE: df_heart is rebound here, so this cell must run exactly once after the
# CSV is loaded; re-running it raises a KeyError because the source columns
# are already gone.
df_heart = pd.get_dummies(df_heart, columns=['cp', 'thal', 'slope'])
df_heart.head()

In [None]:
# Split the frame into the label vector and the feature matrix.
y = df_heart["target"].values  # labels as a NumPy array
X = df_heart.drop(columns=["target"])  # every remaining column is a feature

In [None]:
from sklearn.model_selection import train_test_split # train/test splitting helper

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature scaling: the scaler is fitted on the training split only, and the
# fitted transform is then applied to both splits.
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier # kNN classifier
from sklearn.metrics import (accuracy_score, f1_score, average_precision_score, confusion_matrix) # evaluation metrics

k = 5 # initial number of neighbours
kNN = KNeighborsClassifier(n_neighbors = k)  # kNN model
kNN.fit(X_train, y_train) # fit on the training split
y_pred = kNN.predict(X_test) # predicted heart-disease labels for the test split
print("{}NN 预测准确率: {:.2f}%".format(k, kNN.score(X_test, y_test)*100))
print("{}NN 预测F1分数: {:.2f}%".format(k, f1_score(y_test, y_pred)*100))
# confusion_matrix takes (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round would transpose the matrix.
print('kNN 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
# Search for the best K by scoring kNN on the test split for K = 1..14.
# NOTE: K is chosen using the test split itself, so the reported maximum is an
# optimistic estimate of generalisation performance.
index = list(range(1, 15))  # candidate K values, also used as the x axis
f1_score_list = []
acc_score_list = []
for i in index:
    kNN = KNeighborsClassifier(n_neighbors = i)  # n_neighbors is K
    kNN.fit(X_train, y_train)
    y_pred = kNN.predict(X_test) # predicted heart-disease labels for this K
    acc_score_list.append(kNN.score(X_test, y_test))
    f1_score_list.append(f1_score(y_test, y_pred))
plt.plot(index,acc_score_list,c='blue',linestyle='solid')
plt.plot(index,f1_score_list,c='red',linestyle='dashed')
plt.legend(["Accuracy", "F1 Score"])
plt.xlabel("K value")
plt.ylabel("Score")
# Pass a real boolean: the string 'false' is truthy and would switch the grid on.
plt.grid(False)
plt.show()
# kNN_acc feeds the accuracy comparison chart, so it must come from the
# accuracy list rather than from the F1 list.
kNN_acc = max(acc_score_list)*100
print("Maximum kNN Score is {:.2f}%".format(kNN_acc))

In [None]:
from sklearn.svm import SVC # SVM classifier
svm = SVC(random_state = 1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test) # predicted heart-disease labels for the test split
svm_acc = svm.score(X_test,y_test)*100  # test accuracy in percent, reused by the comparison chart
print("SVM 预测准确率:: {:.2f}%".format(svm_acc))
print("SVM 预测F1分数: {:.2f}%".format(f1_score(y_test, y_pred)*100))
# confusion_matrix takes (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round would transpose the matrix.
print('SVM 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.naive_bayes import GaussianNB # Gaussian naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test) # predicted heart-disease labels for the test split
nb_acc = nb.score(X_test,y_test)*100  # test accuracy in percent, reused by the comparison chart
# Report the naive Bayes score itself (not the SVM's score).
print("NB 预测准确率:: {:.2f}%".format(nb_acc))
print("NB 预测F1分数: {:.2f}%".format(f1_score(y_test, y_pred)*100))
# confusion_matrix takes (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round would transpose the matrix.
print('NB 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier # decision-tree classifier
# Fixed seed: tree construction is randomised, so without it the scores can
# change from one run to the next.
dtc = DecisionTreeClassifier(random_state = 1)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test) # predicted heart-disease labels for the test split
dtc_acc = dtc.score(X_test, y_test)*100  # test accuracy in percent, reused by the comparison chart
print("决策树 预测准确率:: {:.2f}%".format(dtc_acc))
print("决策树 预测F1分数: {:.2f}%".format(f1_score(y_test, y_pred)*100))
# confusion_matrix takes (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round would transpose the matrix.
print('决策树 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier # random-forest classifier
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test) # predicted heart-disease labels for the test split
rf_acc = rf.score(X_test,y_test)*100  # test accuracy in percent, reused by the comparison chart
print("随机森林 预测准确率:: {:.2f}%".format(rf_acc))
print("随机森林 预测F1分数: {:.2f}%".format(f1_score(y_test, y_pred)*100))

# confusion_matrix takes (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round would transpose the matrix.
print('随机森林 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression # logistic regression model

lr = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = lr.predict(X_test)  # predicted heart-disease labels for the test split

# Test-split accuracy and F1, both expressed in percent.
lr_acc = 100 * lr.score(X_test, y_test)
lr_f1 = 100 * f1_score(y_test, y_pred)

for template, value in (
    ("逻辑回归测试集准确率： {:.2f}%", lr_acc),
    ("逻辑回归测试集F1分数: {:.2f}%", lr_f1),
):
    print(template.format(value))
print('逻辑回归测试集混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
# Bar chart comparing the test accuracy (in percent) of every model.
# `methods`, `accuracy` and `colors` are parallel lists: position i of each
# describes the same algorithm.
methods = ["Logistic Regression", "kNN", "SVM", 
           "Naive Bayes", "Decision Tree", "Random Forest"]
accuracy = [lr_acc, kNN_acc, svm_acc, nb_acc, dtc_acc, rf_acc]
colors = ["orange","red","purple", "magenta", "green","blue"]
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,100,10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=methods, y=accuracy, palette=colors)
# Pass the flag positionally: the `b` keyword was renamed to `visible` in
# Matplotlib 3.5 and later removed, so `grid(b=None)` fails on current
# releases while the positional form works on every version.
plt.grid(None)
plt.show()

In [None]:
# Plot the test-split confusion matrix of every fitted model in a 2x3 grid.
from sklearn.metrics import confusion_matrix

# The kNN panel uses a dedicated 3-neighbour classifier fitted here.
kNN3 = KNeighborsClassifier(n_neighbors = 3)
kNN3.fit(X_train, y_train)

# (panel title, fitted model), in left-to-right, top-to-bottom panel order.
fitted_models = [
    ("Logistic Regression Confusion Matrix", lr),
    ("K Nearest Neighbors Confusion Matrix", kNN3),
    ("Support Vector Machine Confusion Matrix", svm),
    ("Naive Bayes Confusion Matrix", nb),
    ("Decision Tree Classifier Confusion Matrix", dtc),
    ("Random Forest Confusion Matrix", rf),
]

# Predict and build every matrix before any drawing starts.
panels = [
    (panel_title, confusion_matrix(y_test, model.predict(X_test)))
    for panel_title, model in fitted_models
]

plt.figure(figsize=(24,12))
plt.suptitle("Confusion Matrixes",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)
for position, (panel_title, matrix) in enumerate(panels, start=1):
    plt.subplot(2,3,position)
    plt.title(panel_title)
    sns.heatmap(matrix,annot=True,cmap="Blues",fmt="d",cbar=False)
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold # K-fold cross-validation splitter
from sklearn.model_selection import GridSearchCV # exhaustive grid search
kfold = StratifiedKFold(n_splits=10) # 10-fold stratified cross-validation
# Use a separate name for the untuned estimator so the fitted `rf` from the
# random-forest cell is not replaced by an unfitted model (re-running an
# earlier cell that calls rf.predict would otherwise fail). The fixed seed
# makes the search repeatable.
rf_base = RandomForestClassifier(random_state=1)
# Hyper-parameter grid explored for the random forest
rf_param_grid = {"max_depth": [None],
              "max_features": [3, 5, 12],
              "min_samples_split": [2, 5, 10],
              "min_samples_leaf": [3, 5, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}
# n_jobs=-1 uses every available core instead of a hard-coded worker count.
rf_gs = GridSearchCV(rf_base,param_grid = rf_param_grid, cv=kfold, 
                    scoring="accuracy", n_jobs=-1, verbose = 1)
rf_gs.fit(X_train, y_train) # run the grid search on the training split

In [None]:
from sklearn.metrics import (accuracy_score, confusion_matrix)

# Predict on the test split with the fitted grid search and score the result.
# y_test is a 1-D array, so it is passed as-is (transposing it changes nothing).
y_hat_rfgs = rf_gs.predict(X_test)
rfgs_test_accuracy = accuracy_score(y_test, y_hat_rfgs)
print("参数优化后随机森林测试准确率:", rfgs_test_accuracy)

In [None]:
# Confusion matrix of the tuned random forest on the test split
# (arguments are y_true, y_pred: rows are true labels, columns are predictions).
cm_rfgs = confusion_matrix(y_test,y_hat_rfgs)
plt.figure(figsize=(4,4))
plt.title("Random Forest (Best Score) Confusion Matrix")
sns.heatmap(cm_rfgs,annot=True,cmap="Blues",fmt="d",cbar=False)
# End with plt.show(), like the other plotting cells, so the cell renders the
# figure without echoing the Axes repr.
plt.show()

In [None]:
# Report the hyper-parameter combination selected by the grid search.
best_rf_params = rf_gs.best_params_
print("最佳参数组合:", best_rf_params)