# Sklean学习之K-近邻算法

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# 读取数据
data=pd.read_csv(r'C:\Users\17575\VSCode\机器学习\Data\day_2\train.csv')

In [3]:
# 打印前10行
data.head(10)

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949
5,5,3.8099,1.9586,75,178065,6289802927
6,6,6.3336,4.372,13,666829,9931249544
7,7,5.7409,6.7697,85,369002,5662813655
8,8,4.3114,6.941,3,166384,8471780938
9,9,6.3414,0.0758,65,400060,1253803156


In [4]:
# 处理数据
# 1、缩小数据,查询数据筛选
data=data.query('x>1.0 & x<1.25 & y>2.5 & y<2.75')

In [5]:
# 处理时间序列
time_value=pd.to_datetime(data['time'],unit='s')
time_value

600        1970-01-01 18:09:40
957        1970-01-10 02:11:10
4345       1970-01-05 15:08:02
4735       1970-01-06 23:03:03
5580       1970-01-09 11:26:50
                   ...        
29100203   1970-01-01 10:33:56
29108443   1970-01-07 23:22:04
29109993   1970-01-08 15:03:14
29111539   1970-01-04 00:53:41
29112154   1970-01-08 23:01:07
Name: time, Length: 17710, dtype: datetime64[ns]

In [6]:
# 日期格式转化为字典格式
time_value=pd.DatetimeIndex(time_value)
time_value

DatetimeIndex(['1970-01-01 18:09:40', '1970-01-10 02:11:10',
               '1970-01-05 15:08:02', '1970-01-06 23:03:03',
               '1970-01-09 11:26:50', '1970-01-02 16:25:07',
               '1970-01-04 15:52:57', '1970-01-01 10:13:36',
               '1970-01-09 15:26:06', '1970-01-08 23:52:02',
               ...
               '1970-01-07 10:03:36', '1970-01-09 11:44:34',
               '1970-01-04 08:07:44', '1970-01-04 15:47:47',
               '1970-01-08 01:24:11', '1970-01-01 10:33:56',
               '1970-01-07 23:22:04', '1970-01-08 15:03:14',
               '1970-01-04 00:53:41', '1970-01-08 23:01:07'],
              dtype='datetime64[ns]', name='time', length=17710, freq=None)

In [7]:
# 构造一些特征
data['day']=time_value.day
data['hour']=time_value.hour
data['weekday']=time_value.weekday

In [8]:
# 删除特征时间戳
data=data.drop(['time'],axis=1)
data

Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.6550,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4
...,...,...,...,...,...,...,...,...
29100203,29100203,1.0129,2.6775,12,3312463746,1,10,3
29108443,29108443,1.1474,2.6840,36,3533177779,7,23,2
29109993,29109993,1.0240,2.7238,62,6424972551,8,15,3
29111539,29111539,1.2032,2.6796,87,3533177779,4,0,6


In [9]:
# 签到数量少于n个的删除
place_count=data.groupby('place_id').count()
place_count

Unnamed: 0_level_0,row_id,x,y,accuracy,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1012023972,1,1,1,1,1,1,1
1057182134,1,1,1,1,1,1,1
1059958036,3,3,3,3,3,3,3
1085266789,1,1,1,1,1,1,1
1097200869,1044,1044,1044,1044,1044,1044,1044
...,...,...,...,...,...,...,...
9904182060,1,1,1,1,1,1,1
9915093501,1,1,1,1,1,1,1
9946198589,1,1,1,1,1,1,1
9950190890,1,1,1,1,1,1,1


In [10]:
tf=place_count[place_count.row_id>3].reset_index()
tf

Unnamed: 0,place_id,row_id,x,y,accuracy,day,hour,weekday
0,1097200869,1044,1044,1044,1044,1044,1044,1044
1,1228935308,120,120,120,120,120,120,120
2,1267801529,58,58,58,58,58,58,58
3,1278040507,15,15,15,15,15,15,15
4,1285051622,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...
234,9741307878,5,5,5,5,5,5,5
235,9753855529,21,21,21,21,21,21,21
236,9806043737,6,6,6,6,6,6,6
237,9809476069,23,23,23,23,23,23,23


In [11]:
data=data[data['place_id'].isin(tf.place_id)]
data

Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.6550,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4
...,...,...,...,...,...,...,...,...
29100203,29100203,1.0129,2.6775,12,3312463746,1,10,3
29108443,29108443,1.1474,2.6840,36,3533177779,7,23,2
29109993,29109993,1.0240,2.7238,62,6424972551,8,15,3
29111539,29111539,1.2032,2.6796,87,3533177779,4,0,6


In [12]:
# 取出数据中的特征值和目标值
y=data['place_id']
x=data.drop(['place_id'],axis=1)

In [13]:
# 进行分割训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [14]:
# 特征工程（标准化）
sta=StandardScaler()
# 对测试集和训练集的特征值进行标准化
x_train=sta.fit_transform(x_train)
x_test=sta.transform(x_test)

In [15]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)

KNeighborsClassifier()

In [16]:
# 得出预测结果
y_predict=knn.predict(x_test)
y_predict

array([2814307842, 1097200869, 5270522918, ..., 6683426742, 2584530303,
       6683426742], dtype=int64)

In [17]:
# 预测的准确率
knn.score(x_test,y_test)

0.41749408983451536

In [6]:
# 进行网格搜索例子

# 读取数据
data=pd.read_csv(r'C:\Users\17575\VSCode\机器学习\Data\day_2\train.csv')

# 处理数据
# 1、缩小数据,查询数据筛选
data=data.query('x>1.0 & x<1.25 & y>2.5 & y<2.75')

# 处理时间序列
time_value=pd.to_datetime(data['time'],unit='s')

# 日期格式转化为字典格式
time_value=pd.DatetimeIndex(time_value)

# 构造一些特征
data['day']=time_value.day
data['hour']=time_value.hour
data['weekday']=time_value.weekday

# 删除特征时间戳
data=data.drop(['time'],axis=1)

# 签到数量少于n个的删除
place_count=data.groupby('place_id').count()

tf=place_count[place_count.row_id>3].reset_index()

data=data[data['place_id'].isin(tf.place_id)]

# 取出数据中的特征值和目标值
y=data['place_id']
x=data.drop(['place_id'],axis=1)

# 进行分割训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

# 特征工程（标准化）
sta=StandardScaler()
# 对测试集和训练集的特征值进行标准化
x_train=sta.fit_transform(x_train)
x_test=sta.transform(x_test)

knn=KNeighborsClassifier()

# 构造一些参数的值进行搜索
param={'n_neighbors':[3,4,5,6,7,8,9,10]}

gc=GridSearchCV(knn,param_grid=param,cv=10)
gc.fit(x_train,y_train)

# 预测准确率
print('在测试集的准确率：',gc.score(x_test,y_test))

在测试集的准确率： 0.41843971631205673


In [7]:
print('在交叉验证中最好的结果：',gc.best_score_)

在交叉验证中最好的结果： 0.4240226164818419


In [8]:
print('选择最好的模型是：',gc.best_estimator_)

选择最好的模型是： KNeighborsClassifier(n_neighbors=9)


In [9]:
print('每个超参数的结果：',gc.cv_results_)

每个超参数的结果： {'mean_fit_time': array([0.01991799, 0.02001777, 0.02001774, 0.01991777, 0.02001884,
       0.02021935, 0.02032015, 0.02001772]), 'std_fit_time': array([3.00098754e-04, 4.01790131e-07, 3.24284327e-07, 3.00264396e-04,
       4.47776262e-04, 3.99541659e-04, 4.57222296e-04, 4.15696997e-07]), 'mean_score_time': array([0.04414039, 0.04614213, 0.04864464, 0.05044611, 0.0521477 ,
       0.05415127, 0.05555086, 0.0569524 ]), 'std_score_time': array([0.00070074, 0.00070067, 0.0004905 , 0.00049026, 0.00053897,
       0.0007027 , 0.00067155, 0.00053901]), 'param_n_neighbors': masked_array(data=[3, 4, 5, 6, 7, 8, 9, 10],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 4}, {'n_neighbors': 5}, {'n_neighbors': 6}, {'n_neighbors': 7}, {'n_neighbors': 8}, {'n_neighbors': 9}, {'n_neighbors': 10}], 'split0_test_score': array([0.40977147, 0.40977147, 0.41449961, 0.43262411