In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

data = pd.read_csv("data/cancer.csv", sep='\t')

x = data.loc[:, "radius_mean":]
y = data.loc[:, "Diagnosis"]

# 归一化处理
scaler = StandardScaler()
x.loc[:,:] = pd.DataFrame(data=scaler.fit_transform(x), columns=x.columns) # fit_transform() 返回的是没有标签的ndarry 数组，重新加上标签

# 使用归一化后的数据进行划分
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [31]:
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=dict(
        n_neighbors=[x for x in range(2, 30)],
        weights=['uniform', 'distance'],
        p=[1,2]
    ),
    cv=5,
    scoring="accuracy"
)

grid.fit(x_train, y_train)
estimator = grid.best_estimator_

In [32]:
# 执行预测
y_pred = estimator.predict(x_test)
print(f'预测结果：{y_pred[:30]}')
print(f'实际结果：{y_test.values[:30]}')

预测结果：['M' 'B' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'M' 'M' 'B'
 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'B']
实际结果：['M' 'B' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'M' 'M' 'B' 'M' 'M' 'B'
 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'B']


In [33]:
# 模型评估
score = accuracy_score(y_test, y_pred)
print(f'模型评估分数: {score}')

模型评估分数: 0.9649122807017544
