## Постановка задачи
Загрузим данные и разделим выборку на обучающую/проверочную в соотношении 80/20.

Применим метод ближайших соседей (kNN) для классификации скоринга. Будем использовать только биометрические данные.

Проверим качество предсказания через каппа-метрику и матрицу неточностей.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB


### Разделение данных

In [2]:
data_train, data_test = train_test_split(data, test_size=.2)
data_train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
46850,62344,1,D3,26,0.487179,2,3,1,0.522388,0.690909,...,0,0,0,0,0,0,0,0,0,7
37650,49987,1,A8,26,0.025641,2,3,1,0.671642,0.690909,...,1,0,0,0,1,0,0,0,0,5
8742,11652,2,D4,26,1.0,2,1,1,0.283582,0.672727,...,0,0,0,0,0,0,0,0,0,7
47842,63717,1,D3,26,0.487179,2,1,1,0.492537,0.818182,...,0,0,0,0,0,0,0,0,0,5
45796,60925,1,E1,26,0.128205,2,3,1,0.119403,0.781818,...,0,0,0,0,0,0,0,0,0,8


### Расчет модели kNN (k ближайших соседей)
Вычисляем не центры (кластеры) исходных групп, а расстояние до всех значений. Выбираем то значение, которое превалирует у k ближайших соседей.

Для оценки качества модели возьмем k равным 10, 100, 1000, 10000.

In [3]:
columns = ['Wt', 'Ht', 'Ins_Age', 'BMI']
max_nn = data_train.groupby('Response')['Id'].count().min()
knn10 = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
knn100 = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
knn1000 = KNeighborsClassifier(n_neighbors=1000, n_jobs=-1)
knn10000 = KNeighborsClassifier(n_neighbors=10000, n_jobs=-1)
knnmax = KNeighborsClassifier(n_neighbors=max_nn, n_jobs=-1)

In [4]:
y = data_train['Response']
x = data_train[columns]
knn10.fit(x, y)
knn100.fit(x, y)
knn1000.fit(x, y)
knn10000.fit(x, y)
knnmax.fit(x, y)

KNeighborsClassifier(n_jobs=-1, n_neighbors=807)

### Предсказание данных
Внимание: 10000 соседей потребует порядка 4 Гб оперативной памяти

In [5]:
%%time
x_test = data_test[columns]
data_test['target_10'] = knn10.predict(x_test)
data_test['target_100'] = knn100.predict(x_test)
data_test['target_1000'] = knn1000.predict(x_test)
data_test['target_10000'] = knn10000.predict(x_test)
data_test['target_max'] = knnmax.predict(x_test)
data_test.head(20)

CPU times: total: 1min 41s
Wall time: 15.6 s


Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response,target_10,target_100,target_1000,target_10000,target_max
19180,25577,1,D1,26,1.0,2,1,1,0.492537,0.836364,...,0,0,0,0,5,8,8,8,8,8
51231,68225,1,C3,26,0.230769,2,3,1,0.656716,0.763636,...,0,1,0,0,2,1,1,6,6,6
17419,23227,1,A6,29,1.0,2,3,1,0.19403,0.745455,...,0,0,0,0,8,8,8,8,8,8
52725,70185,1,D2,26,0.487179,2,3,1,0.641791,0.836364,...,1,1,0,0,8,8,8,8,8,8
46058,61279,1,D4,26,0.487179,2,3,1,0.238806,0.709091,...,0,0,0,0,5,5,6,6,6,6
55210,73522,1,D1,29,0.076923,2,3,1,0.223881,0.763636,...,0,0,0,0,1,6,6,6,6,6
15837,21088,1,D4,26,0.076923,2,3,1,0.313433,0.563636,...,0,0,0,0,4,8,8,8,8,8
57693,76869,1,D3,26,0.076923,2,3,1,0.447761,0.763636,...,0,0,0,1,1,6,6,6,6,6
34748,46151,1,D4,26,0.435897,2,3,1,0.104478,0.8,...,0,0,0,0,5,5,5,5,6,5
48554,64654,1,D2,26,0.282051,2,3,1,0.761194,0.763636,...,0,0,0,0,7,1,1,1,8,1


### Оценка модели

In [6]:
print ('kNN, 10:', cohen_kappa_score(data_test['target_10'], data_test['Response'], weights='quadratic'))
print ('kNN, 100:', cohen_kappa_score(data_test['target_100'], data_test['Response'], weights='quadratic'))
print ('kNN, 1000:', cohen_kappa_score(data_test['target_1000'], data_test['Response'], weights='quadratic'))
print ('kNN, 10000:', cohen_kappa_score(data_test['target_10000'], data_test['Response'], weights='quadratic'))
print ('kNN, max:', cohen_kappa_score(data_test['target_max'], data_test['Response'], weights='quadratic'))

kNN, 10: 0.2792984086091306
kNN, 100: 0.28535482664481715
kNN, 1000: 0.2643968655909744
kNN, 10000: 0.14217054723881606
kNN, max: 0.26696953668886336


### Матрица неточностей

In [7]:
print(confusion_matrix(data_test['target_10'], data_test['Response']))
print(confusion_matrix(data_test['target_10000'], data_test['Response']))

[[ 173  148   11   13  103  196  128  164]
 [ 169  244   17   10  135  135   86  112]
 [   1    0    0    0    2    1    0    4]
 [   4    1    0    1    1    2    2    8]
 [  91  137   41    3  327  155   43   30]
 [ 258  261   55   46  239  695  426  397]
 [ 120  131   25   22   75  287  284  264]
 [ 400  361   57  204  214  802  640 2916]]
[[   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [ 512  664   96   42  665  869  459  202]
 [   0    0    0    0    0    0    0    0]
 [ 704  619  110  257  431 1404 1150 3693]]
