In [2]:
import numpy as np
import operator

# 使用kNN判断看中一个人的可能性

In [30]:
# 数据集导入函数
def read_data_set(filename):
    """Load a tab-separated data set into a feature matrix and a label list.

    Every non-blank line is stripped of surrounding whitespace and split on
    tab characters. The first three columns are parsed as float features and
    the last column is the class label. A label made only of digits is used
    as-is; any other label is looked up in a fixed name-to-number table and
    falls back to 0 when the name is unknown. Blank lines are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(feature_mat, class_label_vec)``: ``feature_mat`` is a float
        array of shape ``(n_samples, 3)`` and ``class_label_vec`` is a list of
        ``n_samples`` integer labels, in file order.

    Raises:
        ValueError: If a row has fewer than three columns or a feature column
            is not a valid number.
    """
    label_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    feature_rows = []
    class_label_vec = []
    with open(filename) as f:
        for line_number, raw_line in enumerate(f, 1):
            fields = raw_line.strip().split('\t')
            # An empty or whitespace-only line splits to [''] and carries no
            # sample (typically a trailing newline at the end of the file).
            if fields == ['']:
                continue
            if len(fields) < 3:
                raise ValueError(
                    "line {}: expected at least 3 tab-separated feature "
                    "columns, got {}".format(line_number, len(fields)))
            feature_rows.append([float(value) for value in fields[0:3]])
            label = fields[-1]
            if label.isdigit():
                # Numeric labels are already the class index.
                class_label_vec.append(int(label))
            else:
                # Textual labels are mapped to numbers; unknown names map to 0.
                class_label_vec.append(label_dictionary.get(label, 0))
    # reshape keeps the (0, 3) shape when the file holds no samples at all.
    feature_mat = np.array(feature_rows, dtype=float).reshape(-1, 3)
    return feature_mat, class_label_vec

In [69]:
# 定义归一化函数
# 将所有特征值的范围规范化，从0到1
def normalization(data_set):
    """Rescale each feature column linearly onto the interval [0, 1].

    Each column is transformed as ``(value - column_min) / column_range``.
    A column whose values are all identical has a range of zero; it is
    mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        data_set: 2-D numpy array with one sample per row and one feature
            per column.

    Returns:
        A tuple ``(norm_data_set, range_, min_val)``: the normalised array,
        the per-column range (max - min) and the per-column minimum of the
        input. ``range_`` is the true range, so it contains 0 for constant
        columns; callers that rescale new samples with it must guard against
        those zeros themselves.
    """
    min_val = data_set.min(0)
    max_val = data_set.max(0)
    range_ = max_val - min_val
    # Divide by 1 where the range is zero: the numerator is 0 there as well,
    # so constant columns normalise to 0 rather than NaN.
    safe_range = np.where(range_ == 0, 1, range_)
    norm_data_set = (data_set - min_val) / safe_range
    return norm_data_set, range_, min_val

In [70]:
# 定义kNN算法
def kNN(test_x, data_set, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and computed between ``test_x`` and every row of
    ``data_set``.

    Args:
        test_x: Feature vector of the sample to classify.
        data_set: Training samples, one per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``data_set``.
        k: Number of nearest neighbours that vote. Must be at least 1; a
            value larger than the number of training samples is clamped to
            that number.

    Returns:
        The label with the most votes among the k nearest neighbours. When
        several labels share the highest count, the one whose first vote came
        from the closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``data_set`` has no rows.
    """
    data_set = np.asarray(data_set, dtype=float)
    num_data_set = data_set.shape[0]
    if num_data_set == 0:
        raise ValueError("data_set must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    # Asking for more neighbours than exist would index past the end of the
    # sorted distances, so vote with every available sample instead.
    k = min(k, num_data_set)

    # Broadcasting subtracts test_x from every training row.
    diff_mat = data_set - np.asarray(test_x, dtype=float)
    distances = np.sqrt((diff_mat ** 2).sum(axis=1))
    nearest_indices = distances.argsort()[:k]

    # Count how often each label occurs among the k nearest neighbours.
    label_counter = {}
    for neighbour_index in nearest_indices:
        voted_label = labels[neighbour_index]
        label_counter[voted_label] = label_counter.get(voted_label, 0) + 1

    # max() returns the first label reaching the highest count, i.e. the one
    # inserted earliest, which is the one seen at the smallest distance.
    return max(label_counter, key=label_counter.get)
    

In [71]:
# 测试算法的正确性
def dating_label_test(k=8, filename='datingTestSet.txt', rate_test_split=0.1):
    """Measure the kNN error rate on a hold-out slice of a data file.

    The file is loaded and normalised, then the first
    ``int(n_samples * rate_test_split)`` rows are used as test samples and
    the remaining rows as training samples. Every prediction is printed next
    to its true label.

    Note: normalisation statistics are computed over the whole data set
    (test rows included) before the split, and the rows are not shuffled.

    Args:
        k: Number of nearest neighbours used for each prediction.
        filename: Path of the data file passed to ``read_data_set``.
        rate_test_split: Fraction of the rows, taken from the start of the
            file, used as test samples. Must satisfy ``0 <= rate < 1``.

    Returns:
        A tuple ``(error_count, error_rate)``; both are floats. When the
        split yields no test samples the result is ``(0.0, 0.0)``.

    Raises:
        ValueError: If ``rate_test_split`` is outside ``[0, 1)``.
    """
    if not 0 <= rate_test_split < 1:
        raise ValueError(
            "rate_test_split must be in [0, 1), got {}".format(rate_test_split))
    dating_feature_mat, dating_labels = read_data_set(filename)
    norm_feature_mat, _, _ = normalization(dating_feature_mat)
    num_data_set = norm_feature_mat.shape[0]
    num_test = int(num_data_set * rate_test_split)
    # Rows after the test slice form the training set.
    train = norm_feature_mat[num_test:num_data_set]
    train_labels = dating_labels[num_test:num_data_set]
    error_counter = 0.0
    for i in range(num_test):
        # The first num_test rows are the test samples.
        predict_result = kNN(norm_feature_mat[i, :], train, train_labels, k)
        print("the classifier came back with: %d, the real answer is: %d" %
              (predict_result, dating_labels[i]))
        if predict_result != dating_labels[i]:
            error_counter += 1
    # With no test samples the rate is undefined; report 0.0 instead of
    # dividing by zero.
    if num_test == 0:
        return error_counter, 0.0
    return error_counter, error_counter / float(num_test)


    

In [74]:
# Evaluate the classifier on the hold-out split using the 5 nearest neighbours
# and report the error rate followed by the number of wrong predictions.
num_error, error_rate = dating_label_test(k=5)
print("the total error rate is: {}".format(error_rate),
      "the number of wrong predict is {}".format(num_error),
      sep="\n")

the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answe

In [85]:
# 搭建可用的系统，输入个人信息判断是否会喜欢这个人
def classify_person(k=5):
    """Interactively predict how much the user would like a person.

    Prompts for three numeric attributes, normalises them with the minimum
    and range of the training data loaded from ``datingTestSet.txt``,
    classifies the sample with ``kNN`` and prints the verdict.

    Args:
        k: Number of nearest neighbours used for the prediction.

    Raises:
        ValueError: If an answer is not a valid number, or if the predicted
            label is not one of the classes 1..3.
    """
    results = ['not at all', 'in small doses', 'in large doses']
    flier = float(input('frequent flier miles earned per year'))
    game = float(input("Please input your percentage of time spent playing video games.(a number)"))
    ice_cream = float(input('liters of ice cream consumed per year'))
    dating_feature_mat, dating_labels = read_data_set("datingTestSet.txt")
    norm_feature_mat, range_, min_val = normalization(dating_feature_mat)
    test_feature = np.array([flier, game, ice_cream])
    # Diagnostic output: the per-feature range of the training data.
    print(range_)
    # Scale the new sample exactly like the training data; a feature with
    # zero range is divided by 1 so it cannot produce inf/NaN.
    safe_range = np.where(range_ == 0, 1, range_)
    norm_test_feature = (test_feature - min_val) / safe_range
    # The training matrix is passed unmodified so distances are measured in
    # the same normalised space as the query sample.
    predict = kNN(norm_test_feature, norm_feature_mat, dating_labels, k)
    # Labels are 1-based; reject anything else so that e.g. label 0 does not
    # wrap around to the last entry of ``results``.
    if not 1 <= predict <= len(results):
        raise ValueError("unexpected class label: {!r}".format(predict))
    print("You will probably like this person:{}".format(results[predict - 1]))


In [87]:
# Run the interactive system: enter a person's attributes to get the predicted level of interest in that person.
classify_person()

frequent flier miles earned per year40920
Please input your percentage of time spent playing video games.(a number)8.326976
liters of ice cream consumed per year0.953952
[9.1273000e+04 2.0919349e+01 1.6943610e+00]
You will probably like this person:in large doses
