## Постановка задачи
Загрузим данные, приведем их к числовым, заполним пропуски, нормализуем данные и оптимизируем память.

Разделим выборку на обучающую/проверочную в соотношении 80/20.

Применим логистическую регрессию по всему набору данных.

Проведем предсказание и проверим качество через каппа-метрику.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import re
from etl_utils import reduce_mem_usage, show_inf_and_na, inf_and_na_columns
# Allow up to 200 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 200)

data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")

# Product_Info_2 is split by position: character 0 is kept as text,
# character 1 is converted to a number.
data['Product_Info_2_1'] = data['Product_Info_2'].str[0:1]
data['Product_Info_2_2'] = pd.to_numeric(data['Product_Info_2'].str[1:2])

# The raw column is dropped once it has been split. reduce_mem_usage comes
# from etl_utils (presumably it downcasts dtypes - confirm in that module).
data = reduce_mem_usage(data.drop(columns='Product_Info_2'))

# One indicator column per distinct value of the letter part, named
# 'Product_Info_2_1' + value.
onehot_df = pd.get_dummies(data['Product_Info_2_1'])
onehot_df.columns = ['Product_Info_2_1' + level for level in onehot_df.columns]

# Attach the indicators by row index, remove the text column they replace and
# put -1 in place of every remaining missing value.
data = (
    data.merge(onehot_df, left_index=True, right_index=True)
    .drop(columns='Product_Info_2_1')
    .fillna(-1)
)

Потребление памяти меньше на 49.89 Мб (-85.4%)


### Набор столбцов для расчета

In [2]:
# Feature columns are picked by name. re.match only anchors at the start of
# the string, so every alternative below acts as a column-name prefix
# (e.g. 'Medical_Keyword' also selects Medical_Keyword_1, and 'Product_Info.*'
# also selects the derived Product_Info_2_* columns).
feature_regsearcher = r'Insurance_History.*|InsuredInfo.*|Medical_Keyword|Family_Hist.*|Medical_History.*|Product_Info.*|Wt|Ht|Ins_Age|BMI'
# Identity comparison with None (PEP 8) instead of "!= None".
columns = [column for column in data.columns if re.match(feature_regsearcher, column) is not None]
columns

['Product_Info_1',
 'Product_Info_3',
 'Product_Info_4',
 'Product_Info_5',
 'Product_Info_6',
 'Product_Info_7',
 'Ins_Age',
 'Ht',
 'Wt',
 'BMI',
 'InsuredInfo_1',
 'InsuredInfo_2',
 'InsuredInfo_3',
 'InsuredInfo_4',
 'InsuredInfo_5',
 'InsuredInfo_6',
 'InsuredInfo_7',
 'Insurance_History_1',
 'Insurance_History_2',
 'Insurance_History_3',
 'Insurance_History_4',
 'Insurance_History_5',
 'Insurance_History_7',
 'Insurance_History_8',
 'Insurance_History_9',
 'Family_Hist_1',
 'Family_Hist_2',
 'Family_Hist_3',
 'Family_Hist_4',
 'Family_Hist_5',
 'Medical_History_1',
 'Medical_History_2',
 'Medical_History_3',
 'Medical_History_4',
 'Medical_History_5',
 'Medical_History_6',
 'Medical_History_7',
 'Medical_History_8',
 'Medical_History_9',
 'Medical_History_10',
 'Medical_History_11',
 'Medical_History_12',
 'Medical_History_13',
 'Medical_History_14',
 'Medical_History_15',
 'Medical_History_16',
 'Medical_History_17',
 'Medical_History_18',
 'Medical_History_19',
 'Medical_Histor

### Предобработка данных
Дополнительно проведем z-нормализацию данных через предварительную обработку (preprocessing).

In [3]:
# Standardisation parameters are estimated on the selected feature columns of
# the whole frame.
# NOTE(review): the fit happens before the train/test split, so the held-out
# rows also contribute to the estimated statistics - consider fitting on the
# training part only.
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(data[columns])

### Разделение данных
Преобразуем выборки в отдельные наборы данных

In [4]:
# Hold out 20% of the rows for validation. The seed is fixed so that the split,
# and therefore every metric computed from it, is the same on each run.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)
data_train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,Insurance_History_7,Insurance_History_8,Insurance_History_9,Family_Hist_1,Family_Hist_2,Family_Hist_3,Family_Hist_4,Family_Hist_5,Medical_History_1,Medical_History_2,Medical_History_3,Medical_History_4,Medical_History_5,Medical_History_6,Medical_History_7,Medical_History_8,Medical_History_9,Medical_History_10,Medical_History_11,Medical_History_12,Medical_History_13,Medical_History_14,Medical_History_15,Medical_History_16,Medical_History_17,Medical_History_18,Medical_History_19,Medical_History_20,Medical_History_21,Medical_History_22,Medical_History_23,Medical_History_24,Medical_History_25,Medical_History_26,Medical_History_27,Medical_History_28,Medical_History_29,Medical_History_30,Medical_History_31,Medical_History_32,Medical_History_33,Medical_History_34,Medical_History_35,Medical_History_36,Medical_History_37,Medical_History_38,Medical_History_39,Medical_History_40,Medical_History_41,Medical_Keyword_1,Medical_Keyword_2,Medical_Keyword_3,Medical_Keyword_4,Medical_Keyword_5,Medical_Keyword_6,Medical_Keyword_7,Medical_Keyword_8,Medical_Keyword_9,Medical_Keyword_10,Medical_Keyword_11,Medical_Keyword_12,Medical_Keyword_13,Medical_Keyword_14,Medical_Keyword_15,Medical_Keyword_16,Medical_Keyword_17,Medical_Keyword_18,Medical_Keyword_19,Medical_Keyword_20,Medical_Keyword_21,Medical_Keyword_22,Medical_Keyword_23,Medical_Keyword_24,Medical_Keyword_25,Medical_Keyword_26,Medical_Keyword_27,Medical_Keyword_28,Medical_Keyword_29,Medical_Keyword_30,Medical_Keyword_31,Medical_Keyword_32,Medical_Keyword_33,Medical_Keyword_34,Medical_Keyword_35,Medical_Keyword_
36,Medical_Keyword_37,Medical_Keyword_38,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response,Product_Info_2_2,Product_Info_2_1A,Product_Info_2_1B,Product_Info_2_1C,Product_Info_2_1D,Product_Info_2_1E
53109,70712,1,10,0.230713,2,1,1,0.671875,0.563477,0.184082,0.418457,0.0,1,3,0.0,3,0.75,1,2,3,3,1,2,1,2,1,3,1,0.000167,1,3,2,3,-1.0,0.480469,-1.0,0.580566,1.0,613,3,1,1,3,2,2,2,-1.0,3,3,3,3,240.0,3,3,1,2,2,1,2,1,-1.0,1,3,3,1,1,2,3,-1.0,3,3,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,8,1,0,0,0,0
34097,45265,1,26,0.0,2,3,1,0.507324,0.708984,0.286621,0.46167,0.018005,1,3,0.0,2,0.005001,1,2,8,3,1,1,1,2,1,1,3,-1.0,3,2,3,3,-1.0,0.539062,-1.0,0.5,6.0,112,2,1,1,1,2,2,2,-1.0,3,3,1,3,40.0,3,2,1,1,2,1,2,3,-1.0,1,3,3,1,3,2,3,-1.0,1,1,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,7,1,0,0,0,0
1407,1896,1,26,0.743652,2,3,1,0.373047,0.727051,0.382812,0.603516,0.059998,9,1,0.0,2,0.024994,1,2,11,3,1,1,1,2,1,1,3,-1.0,3,2,3,3,0.507324,-1.0,0.47876,-1.0,10.0,161,2,2,1,1,2,2,2,-1.0,3,2,3,3,-1.0,3,3,1,1,2,1,2,1,-1.0,2,2,3,1,3,2,3,-1.0,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,4,0,0,0,1,0
50740,67554,1,26,0.076904,2,3,1,0.417969,0.654785,0.290771,0.530273,0.024994,9,1,-1.0,2,-1.0,1,2,11,3,1,2,1,2,1,1,3,-1.0,3,2,3,3,-1.0,0.470703,0.49292,-1.0,-1.0,162,2,1,1,3,2,2,1,-1.0,3,2,3,3,14.0,1,3,1,1,2,1,2,3,-1.0,2,2,3,1,3,2,3,-1.0,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,4,0,0,0,1,0
993,1332,1,26,0.230713,2,3,1,0.432861,0.727051,0.284424,0.439697,0.099976,12,1,0.0,2,0.5,1,2,3,3,1,1,1,2,1,1,3,-1.0,3,2,3,3,-1.0,0.539062,0.746582,-1.0,2.0,125,2,2,1,3,2,2,2,-1.0,3,2,3,1,-1.0,1,3,1,1,2,1,2,1,-1.0,1,3,3,1,3,2,3,-1.0,3,3,1,2,2,1,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6,4,0,0,1,0,0


### Логистическая регрессия
\begin{equation}
P = \frac{e^{T}}{1 + e^{T}}
\end{equation}
\begin{equation}
T = a_0 + b_1x_1 + \cdots + b_nx_n
\end{equation}
T — терминатор (линейная комбинация признаков), P — логистическая кривая

![](https://scikit-learn.org/0.22/_images/sphx_glr_plot_sgd_iris_001.png "")

In [5]:
def regression_model(df, columns):
    """Fit a logistic regression that predicts 'Response' from scaled features.

    The features are passed through the module-level ``scaler`` before
    fitting, so ``scaler`` must already be fitted on the same columns.

    Args:
        df: frame holding a 'Response' column and every column in ``columns``.
        columns: names of the feature columns to train on.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    target = df['Response']
    features = scaler.transform(df[columns])
    classifier = LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        multi_class='multinomial',
    )
    # fit() returns the estimator itself, so returning the local is equivalent.
    classifier.fit(features, target)
    return classifier


def logistic_regression(columns):
    """Train on ``data_train``, predict for ``data_test`` and score the result.

    Side effect: the predictions are written into the module-level
    ``data_test`` frame as the 'target' column.

    Args:
        columns: names of the feature columns used for training and prediction.

    Returns:
        Quadratic-weighted Cohen's kappa between 'Response' and 'target'.
    """
    test_features = scaler.transform(data_test[columns])
    fitted_model = regression_model(data_train, columns)
    data_test['target'] = fitted_model.predict(test_features)
    return cohen_kappa_score(
        data_test['Response'],
        data_test['target'],
        weights='quadratic',
    )

### Предсказание данных и оценка модели
Кластеризация дает 0.192, kNN(100) - 0.3

In [6]:
# Train, predict and report the kappa score rounded to three decimals.
print(
    'Логистическая регрессия:',
    round(logistic_regression(columns), 3),
)

Логистическая регрессия: 0.515


В соревновании на Kaggle 0.512 - **2248 место**

### Матрица неточностей

In [7]:
# NOTE(review): confusion_matrix is documented as (y_true, y_pred) and puts its
# first argument on the rows. Predictions are passed first here, so rows are
# predicted classes and columns are the true 'Response' values, i.e. the
# matrix is transposed relative to the usual layout.
print(
    confusion_matrix(
        data_test['target'],
        data_test['Response'],
    )
)

[[ 367  232    5    2   87  214   97   92]
 [ 169  315    6    5  128  200   66   52]
 [ 125  121  117   53  117  277   36   20]
 [  67   66   50  139   20  362   65  129]
 [ 148  203    9    5  458  290  165  129]
 [  54   71    8    8   61  277  126  125]
 [ 153  121    4    3  122  341  666  365]
 [ 165  160    3   28  116  307  419 2966]]
