In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np  # NumPy numerical toolkit
import pandas as pd  # pandas data-processing toolkit

# Read the bank-customer CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../input/bank-customer/BankCustomer.csv")
df.head(5)

In [None]:
# Count how many rows fall into each value of the Exited label column.
df["Exited"].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # seaborn plotting toolkit

# Bar chart of the number of rows per Exited value, coloured with the "bwr" palette.
sns.countplot(data=df, x="Exited", palette="bwr")
plt.show()

In [None]:
# One-hot encode the categorical features ProductsNo, City and Gender.
# Passing `columns=` makes get_dummies replace each listed column with its
# dummy columns (appended after the remaining columns, in the listed order),
# so no separate concat/drop of the original columns is needed.
# NOTE: this cell rebinds `df`; presumably re-running it without first
# re-running the load cell fails because the source columns are gone.
df = pd.get_dummies(
    df,
    columns=["ProductsNo", "City", "Gender"],
    prefix={"ProductsNo": "PN", "City": "City", "Gender": "Gender"},
)
# Name is dropped from the frame so it is not used as a model feature.
df = df.drop(columns=["Name"])
# Only the last expression of a cell is displayed, so preview once, here.
df.head()

In [None]:
# Build the label vector and the feature table.
# y: the Exited column as a NumPy array; X: every remaining column of df.
y = df["Exited"].values
X = df.drop(columns=["Exited"])

In [None]:
from sklearn.model_selection import train_test_split  # train/test splitting helper

# Hold out 20% of the rows as the test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature scaling with MinMaxScaler.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split enter the fit.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.metrics import (f1_score, confusion_matrix) # 导入评估指标

In [None]:
from sklearn.linear_model import LogisticRegression  # logistic regression model
from sklearn.metrics import f1_score, confusion_matrix  # evaluation metrics

# Fit a default logistic regression on the training split and evaluate it on
# the test split. lr_acc / lr_f1 are stored as percentages for the summary charts.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr_acc = 100 * lr.score(X_test, y_test)
lr_f1 = 100 * f1_score(y_test, y_pred)
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("逻辑回归测试集准确率： {:.2f}%".format(lr_acc))
print("逻辑回归测试集F1分数: {:.2f}%".format(lr_f1))
print('逻辑回归测试集混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier  # k-nearest-neighbours classifier

# Fit a KNN classifier with an initial K of 5 and evaluate it on the test split.
# knn_acc / knn_f1 are stored as percentages for the summary charts.
k = 5
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_acc = 100 * knn.score(X_test, y_test)
knn_f1 = 100 * f1_score(y_test, y_pred)
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("{}NN 预测准确率: {:.2f}%".format(k, knn_acc))
print("{}NN 预测F1分数: {:.2f}%".format(k, knn_f1))
print('KNN 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
# Search for the best K by fitting one KNN classifier per candidate value and
# recording its accuracy and F1 score on the test split.
# NOTE(review): K is selected on the same test split used to report the final
# scores, so the "best" figures are optimistic; a validation split or
# cross-validation would give an unbiased choice.
k_values = range(1, 15)  # candidate K values 1..14 (the upper bound is exclusive)
f1_score_list = []
acc_score_list = []
for i in k_values:
    kNN = KNeighborsClassifier(n_neighbors=i)  # n_neighbors is K
    kNN.fit(X_train, y_train)
    # Predict once and derive both metrics from the same predictions
    # (calling kNN.score would run a second prediction pass).
    y_pred = kNN.predict(X_test)
    acc_score_list.append(np.mean(y_pred == y_test))  # fraction of correct predictions
    f1_score_list.append(f1_score(y_test, y_pred))
index = np.array(k_values)
# Plot accuracy and F1 score as a function of K.
plt.plot(index, acc_score_list, c='blue', linestyle='solid')
plt.plot(index, f1_score_list, c='red', linestyle='dashed')
plt.legend(["Accuracy", "F1 Score"])
plt.xlabel("K value")
plt.ylabel("Score")
# Pass the boolean False: the string 'false' is truthy and would switch the grid ON.
plt.grid(False)
plt.show()
# Name kept for compatibility; despite the "_acc" suffix this holds the best
# F1 score (in percent), not an accuracy.
kNN_acc = max(f1_score_list)*100
best_k = int(index[int(np.argmax(f1_score_list))])  # first K reaching the best F1
print("Maximum kNN Score is {:.2f}%".format(kNN_acc))
print("Best K by F1 score: {}".format(best_k))

In [None]:
from sklearn.svm import SVC  # support-vector classifier

# Fit an SVC (seeded with random_state=1) and evaluate it on the test split.
# svm_acc / svm_f1 are stored as percentages for the summary charts.
svm = SVC(random_state=1).fit(X_train, y_train)
y_pred = svm.predict(X_test)  # predicted Exited labels for the test split
svm_acc = 100 * svm.score(X_test, y_test)
svm_f1 = 100 * f1_score(y_test, y_pred)
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("SVM 预测准确率:: {:.2f}%".format(svm_acc))
print("SVM 预测F1分数: {:.2f}%".format(svm_f1))
print('SVM 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes model

# Fit a Gaussian naive Bayes classifier and evaluate it on the test split.
# nb_acc / nb_f1 are stored as percentages for the summary charts.
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)  # predicted Exited labels for the test split
nb_acc = 100 * nb.score(X_test, y_test)
nb_f1 = 100 * f1_score(y_test, y_pred)
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("朴素贝叶斯测试集准确率:: {:.2f}%".format(nb_acc))
print("朴素贝叶斯测试集F1分数: {:.2f}%".format(nb_f1))
print('朴素贝叶斯混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier  # decision-tree classifier

# Fit a classification tree and evaluate it on the test split.
# random_state is fixed (to 1, matching the SVC and random-forest cells) because
# an unseeded tree can produce different scores on every run, which makes the
# comparison charts at the end of the notebook non-reproducible.
dt = DecisionTreeClassifier(random_state = 1)
dt.fit(X_train, y_train)  # fit the model
y_pred = dt.predict(X_test)  # predicted Exited labels for the test split
dt_acc = dt.score(X_test,y_test)*100  # accuracy, in percent
dt_f1 = f1_score(y_test, y_pred)*100  # F1 score, in percent
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("决策树测试集准确率:: {:.2f}%".format(dt_acc))
print("决策树测试集F1分数: {:.2f}%".format(dt_f1))
print('决策树混淆矩阵:\n', confusion_matrix(y_test,y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier  # random-forest classifier

# Fit a 1000-tree random forest (seeded with random_state=1) and evaluate it
# on the test split. rf_acc / rf_f1 are stored as percentages for the summary charts.
rf = RandomForestClassifier(n_estimators=1000, random_state=1).fit(X_train, y_train)
y_pred = rf.predict(X_test)  # predicted Exited labels for the test split
rf_acc = 100 * rf.score(X_test, y_test)
rf_f1 = 100 * f1_score(y_test, y_pred)
# Report accuracy, F1 score and the confusion matrix (labels are printed in Chinese).
print("随机森林 预测准确率:: {:.2f}%".format(rf_acc))
print("随机森林 预测F1分数: {:.2f}%".format(rf_f1))
print('随机森林 混淆矩阵:\n', confusion_matrix(y_test, y_pred))

In [None]:
# Bar chart comparing the F1 score (in percent) of every algorithm.
import seaborn as sns

# Pair each algorithm label with its F1 score; insertion order fixes the bar order.
f1_by_method = {
    "Logistic Regression": lr_f1,
    "KNN": knn_f1,
    "SVM": svm_f1,
    "Naive Bayes": nb_f1,
    "Decision Tree": dt_f1,
    "Random Forest": rf_f1,
}
methods = list(f1_by_method.keys())
f1 = list(f1_by_method.values())
colors = ["orange", "red", "purple", "magenta", "green", "blue"]  # one colour per bar

sns.set_style("whitegrid")
plt.figure(figsize=(16, 5))
plt.yticks(np.arange(0, 100, 10))
plt.ylim(0, 80)
plt.ylabel("F1 Score")
plt.xlabel("Algorithms")
sns.barplot(x=methods, y=f1, palette=colors)
plt.show()

In [None]:
# Bar chart comparing the test-set accuracy (in percent) of every algorithm.
import seaborn as sns
methods = ["Logistic Regression", "KNN", "SVM", 
           "Naive Bayes", "Decision Tree", "Random Forest"]
# Stored under its own name so the F1 list built for the F1 chart is not overwritten.
accuracies = [lr_acc, knn_acc, svm_acc, nb_acc, dt_acc, rf_acc]
colors = ["orange","red","purple", "magenta", "green","blue"]  # one colour per bar
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
# Ticks cover exactly the visible 60-100 range, including the top value 100.
plt.yticks(np.arange(60,101,10))
plt.ylim((60,100))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=methods, y=accuracies, palette=colors)
plt.show()