In [78]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Values in the capital columns that this analysis treats as extreme and drops.
CAPITAL_LOSS_OUTLIER = 4356
CAPITAL_GAIN_OUTLIER = 99999

# Load the raw table.
data = pd.read_csv("data/adults.txt")
print("原始数据形状:", data.shape)

# Missing-value handling.
# The original author's note records that `data.isnull().sum()` showed no NaN
# values, but that "workclass" contains the placeholder "?". Turn every "?"
# into NaN so that dropna() removes those rows, then renumber the index.
data = data.replace("?", np.nan).dropna().reset_index(drop=True)
print("剔除数据[?] 后形状:", data.shape)


# Drop rows carrying the extreme capital_loss / capital_gain values.
# v1 / v2 list the distinct values in descending order; they exist only for
# manual inspection and are not used again in this cell.
v1 = data.loc[:, "capital_loss"].sort_values(ascending=False).unique()
v2 = data.loc[:, "capital_gain"].sort_values(ascending=False).unique()
cond = (
    (data.loc[:, "capital_loss"] != CAPITAL_LOSS_OUTLIER)
    & (data.loc[:, "capital_gain"] != CAPITAL_GAIN_OUTLIER)
)
# .copy() makes the filtered frame an independent object, so the column
# assignments below do not raise SettingWithCopyWarning.
data = data.loc[cond, :].copy()
print("剔除数据[股票]后形状:", data.shape)


# Encode string-valued columns as integer labels.
# Each column gets its own LabelEncoder, stored in `encoders`, so the
# string <-> integer mapping of every column can be recovered later via
# encoders[col].classes_ / inverse_transform. After the loop `encoder` still
# refers to the encoder fitted on the last column, as it did before.
columns = ["workclass", "education", "marital_status", "occupation",
           "relationship", "race", "sex", "native_country"]
encoders = {}
for c in columns:
    encoder = LabelEncoder()
    # Plain column assignment replaces the column (and its dtype) instead of
    # writing integers into the existing object-dtype column in place.
    data[c] = encoder.fit_transform(data[c])
    encoders[c] = encoder

# Standardise the listed columns to zero mean and unit variance.
# "sex" and the target "salary" are deliberately not in this list.
# NOTE(review): the scaler is fitted on the full table, before the train/test
# split performed in a later cell, so held-out rows contribute to the scaling
# statistics. Consider fitting the scaler on the training split only.
scaler = StandardScaler()

col_norm = ['age', 'final_weight', 'workclass', 'education_num', 'education',
            'marital_status', 'occupation', 'relationship', 'race',
            'capital_gain', 'capital_loss', 'hours_per_week', 'native_country']
data[col_norm] = scaler.fit_transform(data[col_norm])
data.head()

原始数据形状: (32561, 15)
剔除数据[?] 后形状: (30162, 15)
剔除数据[股票]后形状: (30013, 15)


Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,0.045819,2.938218,-1.062426,-0.347916,1.137876,0.945119,-1.477141,-0.263809,0.385361,1,0.608849,-0.219201,-0.074089,0.265125,<=50K
1,0.883461,1.889835,-1.007584,-0.347916,1.137876,-0.388396,-0.732955,-0.888283,0.385361,1,-0.234401,-0.219201,-2.331824,0.265125,<=50K
2,-0.03033,-0.206931,0.244805,0.176136,-0.435526,-1.72191,-0.23683,-0.263809,0.385361,1,-0.234401,-0.219201,-0.074089,0.265125,<=50K
3,1.111909,-0.206931,0.425327,-2.444126,-1.222226,-0.388396,-0.23683,-0.888283,-2.009468,1,-0.234401,-0.219201,-0.074089,0.265125,<=50K
4,-0.791822,-0.206931,1.406607,-0.347916,1.137876,-0.388396,0.755419,2.234087,-2.009468,0,-0.234401,-0.219201,-0.074089,-5.299925,<=50K


In [81]:
# Hold out 20% of the rows for testing. Features are every column up to and
# including "native_country" (label-based slice); the target is "salary".
# random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, :"native_country"],
    data.loc[:, "salary"],
    test_size=0.2,
    random_state=100,
)

# Exhaustive 5-fold cross-validated search over the KNN hyper-parameters:
#   n_neighbors : 3 .. 19
#   weights     : equal votes vs. distance-weighted votes
#   p           : Minkowski power (1 = Manhattan, 2 = Euclidean)
# n_jobs=-1 evaluates the candidate/fold fits on all available cores; the
# selected parameters and scores are expected to match a serial run.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=dict(
        n_neighbors=list(range(3, 20)),
        weights=['uniform', 'distance'],
        p=[1, 2],
    ),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid.fit(x_train, y_train)


In [132]:
# Report the hyper-parameters chosen by the grid search and the mean
# cross-validated accuracy they achieved on the training data.
print(f"最优参数: {grid.best_params_}")
print(f"最佳分数：{grid.best_score_}")


# Predict on the held-out test split with the best estimator and show the
# first ten predictions next to the true labels for a quick sanity check.
y_pred = grid.best_estimator_.predict(x_test)
print(f"预测结果: {y_pred[:10]}")
print(f"实际结果: {y_test.values[:10]}")


# Evaluate: test-set accuracy, computed once.
score = accuracy_score(y_test, y_pred)
v = score  # alias kept in case other cells still refer to `v` (not visible here)
print(f"准确率：{score}")

最优参数: {'n_neighbors': 19, 'p': 1, 'weights': 'uniform'}
最佳分数：0.84618908788005
预测结果: ['<=50K' '>50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '>50K'
 '>50K']
实际结果: ['<=50K' '>50K' '>50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '>50K'
 '>50K']
0.8402465433949692
准确率：0.8402465433949692


In [134]:


# Matrix-product demo: a 2x3 matrix times a 3x2 matrix yields a 2x2 matrix.
r1 = np.array([[1, 2, 3],
               [4, 5, 6]])

r2 = np.array([[7, 8],
               [9, 10],
               [11, 12]])

# For 2-D operands the @ operator performs the same matrix multiplication
# as np.dot.
r1 @ r2

array([[ 58,  64],
       [139, 154]])