## Постановка задачи
Загрузим данные, приведем их к числовым, заполним пропуски, нормализуем данные и оптимизируем потребление памяти.

Разделим выборку на обучающую/проверочную в соотношении 80/20.

Построим модель опорных векторов (SVM) для оптимального разделения параметров на классы, используем несколько реализаций: линейную (LinearSVC) и через стохастический градиентный спуск (SGDClassifier).

Проведем предсказание и проверим качество через каппа-метрику.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

In [1]:
# Seed passed as random_state to train_test_split and to both classifiers below,
# so the split and the fitted models are reproducible between runs.
GRAIN = 11
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing
import re
from etl_utils import reduce_mem_usage, show_inf_and_na, inf_and_na_columns
# Let pandas display up to 200 columns so wide frames are not truncated in cell output.
pd.set_option('display.max_columns', 200)


# Load the training set (gzip-compressed CSV) directly from the course URL.
data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")

# Split Product_Info_2 by character position: the first character stays a string,
# the second character is parsed as a number. The original column is then dropped.
# NOTE(review): presumably values look like "<letter><digit>" - confirm against the data.
data['Product_Info_2_1'] = data['Product_Info_2'].str[:1]
data['Product_Info_2_2'] = pd.to_numeric(data['Product_Info_2'].str[1:2])
data = data.drop(columns='Product_Info_2')

# One-hot encode the first character; each indicator column is named
# 'Product_Info_2_1' + <character>.
letter_dummies = pd.get_dummies(data['Product_Info_2_1']).add_prefix('Product_Info_2_1')

# Attach the indicator columns by row index, drop the string column they replace,
# and substitute -1 for every remaining missing value in the frame.
data = pd.merge(left=data, right=letter_dummies, left_index=True, right_index=True)
data = data.drop(columns='Product_Info_2_1')
data = data.fillna(-1)
del letter_dummies

# Column-name patterns for the model features. re.match anchors only at the start
# of the name, so alternatives without '.*' (e.g. 'Medical_Keyword', 'Wt') still
# select every column whose name begins with that text. 'Product_Info.*' also
# covers the derived 'Product_Info_2_2' and 'Product_Info_2_1<char>' columns.
feature_regsearcher = r'Insurance_History.*|InsuredInfo.*|Medical_Keyword|Family_Hist.*|Medical_History.*|Product_Info.*|Wt|Ht|Ins_Age|BMI'
columns = [column for column in data.columns if re.match(feature_regsearcher, column) is not None]

# Standardize the selected features. The scaler returns a plain array, so the new
# frame gets default integer column labels; they are remembered in
# columns_transformed to address the feature columns later.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# held-out rows contribute to the scaling statistics - consider fitting on the
# training part only.
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(data[columns]))
columns_transformed = data_transformed.columns

# Re-attach the target (aligned by row index) and shrink the frame's memory use.
# reduce_mem_usage is a project helper from etl_utils; presumably it downcasts
# dtypes - confirm there.
data_transformed['Response'] = data['Response']
data_transformed = reduce_mem_usage(data_transformed)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
data_train, data_test = train_test_split(
    data_transformed,
    test_size=0.2,
    random_state=GRAIN,
)
# Last expression of the cell: shows the first training rows as cell output.
data_train.head()

Потребление памяти меньше на 42.87 Мб (-75.1%)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,Response
40387,-0.164551,0.312256,2.375,11.945312,0.44165,-0.149292,-0.618652,1.249023,-0.090637,-0.786133,-0.500977,-0.086487,0.809082,0.362793,-0.117371,-0.832031,-0.140137,-1.634766,-0.169434,0.862305,-1.013672,0.880859,-0.928711,-1.388672,-0.822754,0.649414,0.881836,-0.850586,0.604492,-0.64502,0.191406,0.039276,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,-1.829102,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-0.666992,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,8
17090,-0.164551,0.312256,2.375,-0.083679,0.44165,-0.149292,0.062622,-0.220581,-0.513672,-0.481445,1.891602,-0.086487,0.061371,0.362793,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,0.862305,-1.013672,0.867676,-0.928711,1.259766,-0.822754,0.649414,-1.023438,1.305664,0.829102,-0.64502,0.834961,-1.332031,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,1.456055,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,7
21687,-0.164551,0.312256,2.375,-0.083679,0.44165,-0.149292,1.046875,1.003906,0.42627,-0.08197,-0.500977,-0.086487,0.809082,0.362793,-0.117371,-0.832031,-0.140137,-1.634766,-0.169434,0.862305,-1.013672,0.869629,-0.928711,-1.388672,-0.822754,0.649414,-1.023438,0.753906,1.012695,-0.64502,0.191406,1.741211,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-1.133789,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,8
40073,-0.164551,0.312256,2.375,-0.083679,0.44165,-0.149292,1.878906,1.003906,0.567383,0.075378,-0.500977,-0.086487,0.809082,-2.755859,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,0.862305,0.043671,0.861328,-0.928711,1.259766,-0.822754,-1.419922,-1.023438,1.161133,-1.419922,1.464844,-0.29126,-0.794922,-0.337158,-1.37793,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,0.362061,0.159058,2.876953,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,-1.799805,-0.140869,1.982422,-2.056641,0.100098,-0.268311,0.544434,4.84375,0.086182,-0.043427,0.329102,0.428955,-0.032196,1.988281,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,2.0625,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,12.054688,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-1.133789,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,6
3408,-0.164551,0.312256,0.560059,-0.083679,-2.263672,-0.149292,0.516602,-0.220581,-0.395996,-0.329834,-0.500977,-0.086487,0.061371,-2.755859,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,0.862305,0.043671,0.867676,-0.928711,1.259766,-0.822754,-1.419922,-1.023438,1.055664,0.849609,-0.64502,-0.532715,-0.794922,-0.337158,-1.37793,-0.085693,0.241333,-0.071228,-0.151367,-1.829102,-0.077454,0.064636,-0.244873,-2.761719,0.159058,2.876953,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,-1.799805,1.777344,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,-2.330078,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,1.456055,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,2.0625,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,4.578125,-0.104065,-0.087097,-0.117798,-0.092529,7.015625,-0.240112,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,7


### SVM
Выбираем направления преобразований исходных данных, чтобы различные классы можно было разделить гиперплоскостью по значениям параметров.
![](https://static.learme.ru/storage/uploads/editor/npnJ9VVXcfzhDGrl5e1IoPhSUTT5KmZL8seNRykU.png)

In [2]:
# Training feature matrix: all rows, only the scaled feature columns (no 'Response').
x = data_train.loc[:, columns_transformed]

In [3]:
%%time
# Linear SVM trained on the scaled features against the 'Response' labels.
# max_iter is raised to 10000 iterations; the recorded run took about 17 minutes.
# NOTE(review): scikit-learn advises dual=False when n_samples > n_features, which
# is likely much faster here - not applied because it would change the recorded results.
svm_linear = LinearSVC(max_iter=10000, random_state=GRAIN)
model_lin = svm_linear.fit(x, data_train['Response'])

CPU times: total: 17min 42s
Wall time: 17min 43s




In [4]:
%%time
# Linear classifier trained by stochastic gradient descent on the same features.
# All hyperparameters are library defaults (per the scikit-learn docs the default
# loss is hinge, i.e. a linear SVM); only the seed is fixed.
sgd_classifier = SGDClassifier(random_state=GRAIN)
model_sgd = sgd_classifier.fit(x, data_train['Response'])

CPU times: total: 7.59 s
Wall time: 7.61 s


### Предсказание данных и оценка модели

In [5]:
# Feature matrix of the held-out rows, taken before any prediction columns are
# added so both models see exactly the same inputs.
x_test = data_test[columns_transformed]

# Store each model's predicted class next to the true 'Response' for scoring.
for target_column, fitted_model in (('target_lin', model_lin), ('target_sgd', model_sgd)):
    data_test[target_column] = fitted_model.predict(x_test)

Кластеризация дает 0.192, kNN(100) - 0.3, лог. регрессия - 0.512/0.496

In [6]:
# Quadratic-weighted Cohen's kappa between each model's predictions and the true
# 'Response', rounded to three decimals.
for label, prediction_column in (
    ('SVM (линейный):', 'target_lin'),
    ('SVM (градиент):', 'target_sgd'),
):
    kappa = cohen_kappa_score(data_test[prediction_column], data_test['Response'], weights='quadratic')
    print(label, round(kappa, 3))

SVM (линейный): 0.46
SVM (градиент): 0.447


### Матрица неточностей

In [7]:
# Print one confusion matrix per model.
# NOTE(review): predictions are passed first, i.e. in scikit-learn's y_true
# position, so rows correspond to predicted classes and columns to the actual
# 'Response' - the transpose of the library's usual orientation.
for title, prediction_column in (
    ('SVM (линейный)\n', 'target_lin'),
    ('SVM (градиент)\n', 'target_sgd'),
):
    print(title, confusion_matrix(data_test[prediction_column], data_test['Response']))

SVM (линейный)
 [[ 290  154   20    7   61  156   53   37]
 [ 181  278    8    1  110  153   41   15]
 [   0    1    0    0    0    0    0    0]
 [   3    3    5   35    2    8    2    1]
 [  84  149   12    0  263  103   28   14]
 [ 249  296  141  201  250  877  213  159]
 [ 114  107    1    2  100  220  429   43]
 [ 317  319   15   70  278  719  796 3683]]
SVM (градиент)
 [[ 359  306   29   18  134  279   70   68]
 [  92  124    9    8   46  125   63   63]
 [  17   21   17   16    4   14    2    2]
 [  23   29   18   64   17  125   29   26]
 [ 146  197   26    6  315  227   85   52]
 [ 212  218   88  135  224  631  207  166]
 [ 117  130    2   10  101  241  403   67]
 [ 272  282   13   59  223  594  703 3508]]
