## Постановка задачи
Загрузим данные, приведем их к числовым, заполним пропуски, нормализуем данные и оптимизируем память.

Разделим выборку на обучающую/проверочную в соотношении 80/20.

Построим 4 модели логистической регрессии: для 8, 6 и остальных классов, для 2, 5 и остальных, для 1, 7 и остальных, и для 4 и 3 - по убыванию частоты значения. Будем использовать перекрестную проверку при принятии решения об оптимальном наборе столбцов.

Проведем предсказание и проверим качество через каппа-метрику.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

In [1]:
GRAIN = 11
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import re
from etl_utils import reduce_mem_usage, show_inf_and_na, inf_and_na_columns
pd.set_option('display.max_columns', 200)

data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")

data['Product_Info_2_1'] = data['Product_Info_2'].str.slice(0, 1)
data['Product_Info_2_2'] = pd.to_numeric(data['Product_Info_2'].str.slice(1, 2))
data = reduce_mem_usage(data.drop('Product_Info_2', axis='columns'))

onehot_df = pd.get_dummies(data['Product_Info_2_1'])
onehot_df.columns = ['Product_Info_2_1' + column for column in onehot_df.columns]
data = pd.merge(left=data, right=onehot_df, left_index=True, right_index=True).drop('Product_Info_2_1', axis=1).fillna(-1)
del onehot_df

feature_regsearcher = r'Insurance_History.*|InsuredInfo.*|Medical_Keyword|Family_Hist.*|Medical_History.*|Product_Info.*|Wt|Ht|Ins_Age|BMI'
columns = [column for column in data.columns if re.match(feature_regsearcher, column) != None]

scaler = preprocessing.StandardScaler()
data_transformed  = pd.DataFrame(scaler.fit_transform(data[columns]))
columns_transformed = data_transformed.columns
data_transformed['Response'] = data['Response']

data_train, data_test = train_test_split(data_transformed, test_size=0.2, random_state=GRAIN)
data_train.head()

Потребление памяти меньше на 49.89 Мб (-85.4%)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,Response
40387,-0.164525,0.312319,2.375019,11.949044,0.441621,-0.149284,-0.618005,1.24654,-0.091734,-0.784844,-0.500858,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,0.881078,-0.928723,-1.388458,-0.822964,0.64942,0.882208,-0.850424,0.604184,-0.644789,0.191406,0.039262,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,-1.828718,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-0.66686,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,8
17090,-0.164525,0.312319,2.375019,-0.083689,0.441621,-0.149284,0.062943,-0.220558,-0.514003,-0.4812,1.891857,-0.0865,0.061379,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,-1.013721,0.867624,-0.928723,1.260049,-0.822964,0.64942,-1.023824,1.306068,0.829232,-0.644789,0.834947,-1.332368,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,1.455972,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-0.200031,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,7
21687,-0.164525,0.312319,2.375019,-0.083689,0.441621,-0.149284,1.045984,1.00312,0.426506,-0.081668,-0.500858,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,0.86964,-0.928723,-1.388458,-0.822964,0.64942,-1.023824,0.753684,1.013524,-0.644789,0.191406,1.741202,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-1.13369,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,8
40073,-0.164525,0.312319,2.375019,-0.083689,0.441621,-0.149284,1.877979,1.00312,0.566348,0.076147,-0.500858,-0.0865,0.809179,-2.75608,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,0.043662,0.861233,-0.928723,1.260049,-0.822964,-1.420309,-1.023824,1.161341,-1.41983,1.465212,-0.291249,-0.794913,-0.337091,-1.377491,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,0.362135,0.159109,2.876399,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,-1.799555,-0.140862,1.982468,-2.057248,0.100114,-0.26821,0.544473,4.842179,0.086178,-0.043414,0.329041,0.429066,-0.0322,1.988542,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,2.061628,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,12.052329,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-1.13369,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,6
3408,-0.164525,0.312319,0.559708,-0.083689,-2.264385,-0.149284,0.516083,-0.220558,-0.396096,-0.329378,-0.500858,-0.0865,0.061379,-2.75608,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,0.043662,0.867624,-0.928723,1.260049,-0.822964,-1.420309,-1.023824,1.056234,0.849788,-0.644789,-0.532577,-0.794913,-0.337091,-1.377491,-0.085709,0.241387,-0.071228,-0.151323,-1.828718,-0.077431,0.064649,-0.244864,-2.761625,0.159109,2.876399,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,-1.799555,1.776953,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,-2.33087,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,1.455972,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,2.061628,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,4.578256,-0.10405,-0.087091,-0.117819,-0.092521,7.016961,-0.240076,-0.200031,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,7


### Логистическая регрессия
В обучающих данных пометим все классы, кроме 6 и 8, как 0 - и проведем обучение по такому набору данных.

Затем в оставшихся данных (в которых класс не равен 6 или 8) заменим все классы, кроме 7 и 1, на 0 - и снова проведем обучение. И т.д. Получим иерархию классификаторов:
8/6/нет -> 7/1/нет -> 2/5/нет -> 4/3

In [2]:
def regression_model(columns, df):
    return LogisticRegression(max_iter=1000, random_state=GRAIN).fit(df[columns], df['Response'])


def logistic_regression(columns, df):
    return cross_val_score(
        LogisticRegression(max_iter=1000, random_state=GRAIN),
        df[columns],
        df['Response'],
        scoring=make_scorer(cohen_kappa_score),
        cv=5,
        n_jobs=-1
    ).mean()

### Оптимальный набор столбцов
Для каждого уровня иерархии это будет свой набор столбцов в исходных данных.

### Перекрестная проверка
Разбиваем обучающую выборку еще на k (часто 5) частей, на каждой части данных обучаем модель. Затем проверяем 1-ю, 2-ю, 3-ю, 4-ю части на 5; 1-ю, 2-ю, 3-ю, 5-ю части на 4 и т.д.

В итоге обучение пройдет весь набор данных, и каждая часть набора будет проверена на всех оставшихся (перекрестным образом).

In [3]:
def find_opt_columns(data_train):
    kappa_score_opt = 0
    columns_opt = []
    for col in columns_transformed:
        kappa_score = logistic_regression([col], data_train)
        if kappa_score > kappa_score_opt:
            columns_opt = [col]
            kappa_score_opt = kappa_score
            
    for col in columns_transformed:
        if col not in columns_opt:
            columns_opt.append(col)
            kappa_score = logistic_regression(columns_opt, data_train)
            if kappa_score < kappa_score_opt:
                columns_opt.pop()
            else:
                kappa_score_opt = kappa_score
                
    return columns_opt, kappa_score_opt

Будем последовательно "урезать" набор данных при расчете более глубоких моделей: после получения разделения на 8 и остальные отсечем все данные со значением 8, и т.д.

После каждого расчета модели будем вычислять значения в проверочной выборке. Проверочную выборку нулями заполнять не будем, иначе оценка будет считаться некорректно.

Набор разделений 6/8, 2/5, 1/7, 3/4 дает наибольшую точность

In [4]:
%%time
responses = [[6, 8], [2, 5], [1, 7], [3, 4]]
logr_models = [{}]*len(responses)
data_train_current = data_train.copy()

i = 0
for response in responses:
    m_train = data_train_current.copy()
    if response != [3, 4]:
        m_train['Response'] = m_train['Response'].map(lambda x: 0 if x not in response else x)
        
    columns_opt, kappa_score_opt = find_opt_columns(m_train)
    print(i, kappa_score_opt, columns_opt, end='\n\n')
    
    logr_models[i] = {
        'model': regression_model(columns_opt, m_train),
        'columns': columns_opt,
        'weight': kappa_score_opt
    }
    if response != [3, 4]:
        data_train_current = data_train_current[~data_train_current['Response'].isin(response)]
        
    i += 1

0 0.4264607274624718 [9, 0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 40, 42, 44, 45, 46, 47, 49, 50, 52, 53, 54, 55, 56, 60, 61, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 82, 83, 85, 86, 88, 89, 91, 92, 94, 97, 100, 103, 108, 113, 114, 120, 122, 123]

1 0.18203793255467143 [73, 4, 5, 9, 14, 15, 16, 18, 19, 22, 24, 27, 29, 30, 31, 32, 33, 34, 35, 37, 42, 44, 45, 48, 49, 52, 54, 55, 57, 58, 59, 60, 61, 69, 74, 91, 94, 103, 113, 118, 119, 121]

2 0.5458809160915234 [9, 6, 7, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 27, 28, 30, 32, 33, 34, 35, 36, 37, 39, 40, 42, 43, 44, 46, 47, 48, 49, 50, 51, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 69, 70, 73, 75, 79, 84, 87, 92, 94, 100, 104, 108, 109, 110, 111, 112, 118, 119, 120]

3 0.45048334329544 [9, 0, 1, 2, 4, 6, 11, 12, 14, 20, 25, 26, 30, 32, 35, 42, 44, 48, 64, 66, 67, 68, 71, 100, 114, 119, 120, 121]

CPU times: total: 26.8 s
Wall time: 6min 19s


### Предсказание данных и оценка модели
Последовательно считаем предсказания для каждой классификации. После этого объединяем предсказание по иерархии.

In [5]:
def logr_hierarchy(x):
    for response in range(0, len(responses)):
        if x['target' + str(response)] > 0:
            x['target'] = x['target' + str(response)]
            break;
    return x

In [6]:
for response in range(0, len(responses)):
    model = logr_models[response]['model']
    columns_opt = logr_models[response]['columns']
    data_test['target' + str(response)] = model.predict(data_test[columns_opt])

In [7]:
data_test = data_test.apply(logr_hierarchy, axis='columns')
data_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,Response,target0,target1,target2,target3,target
12001,-0.164525,0.312319,-0.528961,-0.083689,0.441621,-0.149284,0.894938,1.496539,3.365938,2.743019,-0.500858,-0.0865,0.061379,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-1.156735,1.130555,-0.064204,1.139466,0.64942,-1.023824,-0.850424,-1.41983,-0.644789,0.432734,1.0078,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,-1.799555,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,-3.288759,0.128359,-0.686839,-0.209383,-0.094989,4.392523,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-0.200031,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,1.0,0.0,2.0,1.0,3.0,2.0
44003,-0.164525,0.312319,2.375019,-0.083689,0.441621,-0.149284,-0.618005,0.022862,-0.160284,-0.181551,1.891857,-0.0865,1.930878,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-1.156735,1.130555,-0.064204,1.139466,-1.420309,1.115466,-0.850424,0.747365,-0.644789,0.271849,-1.332368,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,-1.828718,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,-8.116064,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,0.266799,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,1.0,0.0,0.0,7.0,4.0,7.0
5630,-0.164525,-2.841731,-0.347516,-0.083689,0.441621,-0.149284,-0.921336,-1.937656,-1.383219,-0.680966,-0.500858,-0.0865,1.556978,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-1.156735,1.130555,-0.064204,1.139466,0.64942,0.784908,-0.850424,0.542872,-0.644789,-0.452134,0.45355,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,-1.828718,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,-1.836771,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,11.243979,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-0.200031,1.60435,-0.142142,-0.128866,-1.331832,-0.216001,8.0,8.0,0.0,7.0,4.0,8.0
55165,-0.164525,0.312319,-0.347516,-0.083689,0.441621,-0.149284,-0.466958,-0.707398,1.037974,1.943956,1.891857,-0.0865,0.061379,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-1.156735,1.130555,-0.064204,1.139466,0.64942,-1.023824,-0.850424,-1.41983,1.189994,0.271849,-0.794913,-0.337091,0.725957,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,0.362135,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,1.982468,-2.057248,0.100114,-0.26821,0.544473,-0.20581,0.086178,-0.043414,0.329041,0.429066,-0.0322,1.988542,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,-0.245752,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,-0.200031,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,5.0,0.0,5.0,7.0,3.0,5.0
13184,-0.164525,0.312319,0.015373,-0.083689,0.441621,-0.149284,-0.845813,0.51628,0.777483,0.637488,-0.500858,-0.0865,0.061379,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-1.156735,1.130555,-0.064204,1.139466,0.64942,0.920866,-0.850424,0.542872,-0.644789,-0.452134,-0.794913,-0.337091,-1.377491,-0.085709,0.241387,-0.071228,-0.151323,0.546416,-0.077431,0.064649,-0.244864,-2.761625,0.159109,-0.425612,-0.44254,0.149843,-0.237045,-0.188428,0.122931,-0.349505,0.136745,0.555706,-0.140862,-0.480107,0.485769,0.100114,-0.26821,0.544473,-0.20581,0.086178,0.315809,0.329041,0.429066,-0.0322,-0.434938,0.256066,-0.069812,0.304099,0.128359,-0.686839,-0.209383,-0.094989,-0.22766,-0.121511,-0.093259,-0.112948,-0.11877,-0.102552,-0.081832,-0.194523,-0.24817,-0.10052,-0.077442,-0.088936,-0.485054,-0.113482,-0.096156,-0.086894,-0.096867,-0.090557,-0.121725,-0.196473,-0.329198,-0.138776,-0.31344,-0.116712,-0.109535,-0.123142,-0.109062,-0.160265,-0.104956,-0.147058,-0.15287,-0.145195,-0.083587,-0.102552,-0.26709,-0.082972,-0.117672,4.069147,-0.100776,-0.218424,-0.10405,-0.087091,-0.117819,-0.092521,-0.142512,-0.240076,0.266799,-0.623305,-0.142142,-0.128866,0.750845,-0.216001,6.0,0.0,0.0,0.0,3.0,3.0


Кластеризация дает 0.192, kNN(100) - 0.3, простая лог. регрессия - 0.512

In [8]:
print(
    'Логистическая регрессия, 4 уровня:',
    round(cohen_kappa_score(data_test['target'], data_test['Response'], weights='quadratic'), 3)
)

Логистическая регрессия, 4 уровня: 0.501


### Матрица неточностей

In [9]:
print(confusion_matrix(data_test['target'], data_test['Response']))

[[ 457  373   41   24  193  425  137  122]
 [ 141  203    5    0   65   79   15    4]
 [  40   58   61   38   83  151   15    9]
 [  58   60   56  164   24  242   35   45]
 [  79  135   11    0  200   75   19   10]
 [  15   18    8   33   10  177   26   52]
 [ 249  277   14   11  352  695  885  539]
 [ 199  183    6   46  137  392  430 3171]]


In [16]:
def calculate_weighted_target(cortage):
    weights = []
    targets = []
    for response in range(0, len(responses)):
        weights.append(logr_models[response]['weight'])
        targets.append(cortage['target' + str(response)])
    
    weights_sum = np.sum(weights)
    weighted_target = 0.0
    for i, weight in enumerate(weights):
        weighted_target += targets[i] * weight / weights_sum
    
    cortage['weighted_target'] = weighted_target
    return np.floor(cortage)


print(
    'Логистическая регрессия, 4 уровня с усреднением:',
    round(cohen_kappa_score(data_test['target'], data_test['Response'], weights='quadratic'), 3)
)

Логистическая регрессия, 4 уровня с усреднением: 0.501
