# Задание
Загрузите данные, приведите их к числовым, заполните пропуски, нормализуйте данные и оптимизируйте память.

Разделите выборку на обучающую/проверочную в соотношении 80/20.

Постройте 2 модели - kNN по 100 соседей и множественную логистическую регрессию - каждую по наиболее оптимальным наборам параметров (для каждой модели), используйте для этого перекрестную проверку GridSearchCV.

Проведите предсказание и проверьте качество через каппа-метрику.

Данные: video.ittensive.com/machine-learning/prudential/train.csv.gz

In [1]:
GRAIN = 11
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import re
from etl_utils import reduce_mem_usage, show_inf_and_na, inf_and_na_columns
pd.set_option('display.max_columns', 200)

data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")

data['Product_Info_2_1'] = data['Product_Info_2'].str.slice(0, 1)
data['Product_Info_2_2'] = pd.to_numeric(data['Product_Info_2'].str.slice(1, 2))
data = data.drop('Product_Info_2', axis='columns')

onehot_df = pd.get_dummies(data['Product_Info_2_1'])
onehot_df.columns = ['Product_Info_2_1' + column for column in onehot_df.columns]
data = pd.merge(left=data, right=onehot_df, left_index=True, right_index=True).drop('Product_Info_2_1', axis=1).fillna(-1)
del onehot_df

feature_regsearcher = r'Insurance_History.*|InsuredInfo.*|Medical_Keyword|Family_Hist.*|Medical_History.*|Product_Info.*|Wt|Ht|Ins_Age|BMI'
columns = [column for column in data.columns if re.match(feature_regsearcher, column) != None]

scaler = preprocessing.StandardScaler()
data_transformed  = pd.DataFrame(scaler.fit_transform(data[columns]))
columns_transformed = data_transformed.columns
data_transformed['Response'] = data['Response']
data_transformed = reduce_mem_usage(data_transformed)

data_train, data_test = train_test_split(data_transformed, test_size=0.2, random_state=GRAIN)
#data_train = data_train.iloc[:1000, :]
data_train.shape

Потребление памяти меньше на 42.87 Мб (-75.1%)


(47504, 126)

In [2]:
LOGREG = 0
KNN = 1

def logistic_regression_score(df: pd.DataFrame, features: list, target_feature: str):
    return cross_val_score(
        LogisticRegression(max_iter=1000, random_state=GRAIN),
        df[features], df[target_feature], scoring=make_scorer(cohen_kappa_score), cv=5, n_jobs=-1
    ).mean()


def knn_score(df: pd.DataFrame, features: list, target_feature: str):
    return cross_val_score(
        KNeighborsClassifier(n_neighbors=100),
        df[features], df[target_feature], scoring=make_scorer(cohen_kappa_score), cv=5, n_jobs=-1
    ).mean()


def find_optimal_features(df: pd.DataFrame, features: list, target_feature: str, model_type: int):
    kappa_score_the_best = 0
    optimal_features = []
    
    # выбираем первую фичу (с самым оптимальным скором)
    for feature in features:
        if model_type == LOGREG:
            kappa_score = logistic_regression_score(df, [feature], target_feature)
        elif model_type == KNN:
            kappa_score = knn_score(df, [feature], target_feature)
        else:
            kappa_score = 0
            
        if kappa_score > kappa_score_the_best:
            optimal_features = [feature]
            kappa_score_the_best = kappa_score

    # добавляем фичи, увеличивающие скором
    for feature in features:
        if feature not in optimal_features:
            optimal_features.append(feature)
            if model_type == LOGREG:
                kappa_score = logistic_regression_score(df, optimal_features, target_feature)
            elif model_type == KNN:
                kappa_score = knn_score(df, optimal_features, target_feature)
            else:
                kappa_score = 0
            
            if kappa_score < kappa_score_the_best:
                optimal_features.pop()
            else:
                kappa_score_the_best = kappa_score
                
    return optimal_features, kappa_score_the_best


def generate_model(df: pd.DataFrame, features: list, target_feature: str, target_classes: list, model_type: int):
    train_df = df[features]
    train_df[target_feature] = df[target_feature].map(lambda value: 0 if value not in target_classes else value)
    optimal_features, kappa_score = find_optimal_features(df, features, target_feature, model_type)
    
    if model_type == LOGREG:
        model = LogisticRegression(max_iter=1000, random_state=GRAIN).fit(train_df[optimal_features], train_df[target_feature])
    elif model_type == KNN:
        model = KNeighborsClassifier(n_neighbors=100).fit(train_df[optimal_features], train_df[target_feature])
    else:
        return None
    
    return {
        'model': model,
        'features': optimal_features
    }

In [3]:
target_sets = [[6, 8], [2, 5], [1, 7], [3, 4]]
models = dict()
models[LOGREG] = list()
models[KNN] = list()

## Построение моделей

In [4]:
%%time
remained_df = data_train.copy()
for i, target_set in enumerate(target_sets):
    models[LOGREG].append(generate_model(remained_df, columns_transformed, 'Response', target_set, LOGREG))
    print(target_set, 'Логистическая регрессия', i, 'построена.')
    
    models[KNN].append(generate_model(remained_df, columns_transformed, 'Response', target_set, KNN))
    print(target_set, 'KNN', i, 'построен.')
    
    remained_df = remained_df[~remained_df['Response'].isin(target_set)]

[6, 8] Логистическая регрессия 0 построена.
[6, 8] KNN 0 построен.
[2, 5] Логистическая регрессия 1 построена.
[2, 5] KNN 1 построен.
[1, 7] Логистическая регрессия 2 построена.
[1, 7] KNN 2 построен.
[3, 4] Логистическая регрессия 3 построена.
[3, 4] KNN 3 построен.
CPU times: total: 36.5 s
Wall time: 49min 13s


## Предсказание данных и оценка моделей

In [17]:
for i, target_set in enumerate(target_sets):
    data_test['LOGREG_' + str(i)] = models[LOGREG][i]['model'].predict(data_test[models[LOGREG][i]['features']])
    data_test['KNN_' + str(i)] = models[KNN][i]['model'].predict(data_test[models[KNN][i]['features']])

In [22]:
def generate_target(cortage, model_type: str):
    for i, target_set in enumerate(target_sets):
        if cortage[model_type + '_' + str(i)] > 0:
            cortage['target' + '_' + model_type] = cortage[model_type + '_' + str(i)]
            return cortage
    return cortage


data_test = data_test.apply(generate_target, axis='columns', model_type='LOGREG')
data_test = data_test.apply(generate_target, axis='columns', model_type='KNN')

data_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,Response,LOGREG_0,KNN_0,LOGREG_1,KNN_1,LOGREG_2,KNN_2,LOGREG_3,KNN_3,target_LOGREG,target_KNN
12001,-0.164551,0.312256,-0.528809,-0.083679,0.44165,-0.149292,0.89502,1.494141,3.363281,2.742188,-0.500977,-0.086487,0.061371,0.362793,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,-0.064209,1.139648,0.649414,-1.023438,-0.850586,-1.419922,-0.64502,0.432617,1.007812,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,-1.799805,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,-3.289062,0.128418,-0.687012,-0.209351,-0.094971,4.390625,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,1.0,0.0,0.0,2.0,2.0,1.0,1.0,3.0,3.0,2.0,2.0
44003,-0.164551,0.312256,2.375,-0.083679,0.44165,-0.149292,-0.618652,0.024353,-0.161133,-0.181763,1.891602,-0.086487,1.930664,0.362793,-0.117371,1.201172,-0.140137,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,-0.064209,1.139648,-1.419922,1.115234,-0.850586,0.747559,-0.64502,0.271729,-1.332031,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,-1.829102,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,-8.117188,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,0.266846,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,1.0,0.0,8.0,0.0,0.0,7.0,7.0,4.0,4.0,7.0,8.0
5630,-0.164551,-2.841797,-0.347412,-0.083679,0.44165,-0.149292,-0.921387,-1.93457,-1.382812,-0.682129,-0.500977,-0.086487,1.556641,0.362793,-0.117371,1.201172,-0.140137,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,-0.064209,1.139648,0.649414,0.784668,-0.850586,0.542969,-0.64502,-0.452148,0.453613,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,-1.829102,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,-1.836914,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,11.242188,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-0.200073,1.604492,-0.14209,-0.128906,-1.332031,-0.215942,8.0,8.0,8.0,0.0,0.0,7.0,7.0,4.0,4.0,8.0,8.0
55165,-0.164551,0.312256,-0.347412,-0.083679,0.44165,-0.149292,-0.467285,-0.710449,1.037109,1.945312,1.891602,-0.086487,0.061371,0.362793,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,-0.064209,1.139648,0.649414,-1.023438,-0.850586,-1.419922,1.19043,0.271729,-0.794922,-0.337158,0.726074,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,0.362061,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,1.982422,-2.056641,0.100098,-0.268311,0.544434,-0.205811,0.086182,-0.043427,0.329102,0.428955,-0.032196,1.988281,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,-0.245728,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,5.0,0.0,0.0,5.0,5.0,7.0,7.0,3.0,3.0,5.0,5.0
13184,-0.164551,0.312256,0.015503,-0.083679,0.44165,-0.149292,-0.845703,0.51416,0.778809,0.638184,-0.500977,-0.086487,0.061371,0.362793,-0.117371,-0.832031,-0.140137,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,-0.064209,1.139648,0.649414,0.920898,-0.850586,0.542969,-0.64502,-0.452148,-0.794922,-0.337158,-1.37793,-0.085693,0.241333,-0.071228,-0.151367,0.546387,-0.077454,0.064636,-0.244873,-2.761719,0.159058,-0.425537,-0.442627,0.149902,-0.237061,-0.188477,0.122925,-0.349609,0.136719,0.555664,-0.140869,-0.480225,0.48584,0.100098,-0.268311,0.544434,-0.205811,0.086182,0.315918,0.329102,0.428955,-0.032196,-0.435059,0.256104,-0.069824,0.304199,0.128418,-0.687012,-0.209351,-0.094971,-0.227661,-0.121521,-0.093262,-0.112976,-0.118774,-0.102539,-0.081848,-0.19458,-0.248169,-0.100525,-0.077454,-0.088928,-0.485107,-0.113464,-0.09613,-0.086914,-0.096863,-0.090576,-0.121704,-0.196533,-0.329102,-0.138794,-0.313477,-0.116699,-0.109558,-0.123169,-0.10907,-0.160278,-0.10498,-0.147095,-0.152832,-0.145142,-0.083557,-0.102539,-0.26709,-0.082947,-0.117676,4.070312,-0.100769,-0.218384,-0.104065,-0.087097,-0.117798,-0.092529,-0.142456,-0.240112,0.266846,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,6.0,0.0,6.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,6.0


Какая модель дала наибольшую точность: логистическая регрессия или kNN?

In [25]:
print('Логистическая регрессия:', round(cohen_kappa_score(data_test['target_LOGREG'], data_test['Response']), 3))
print('kNN:', round(cohen_kappa_score(data_test['target_KNN'], data_test['Response']), 3))

Логистическая регрессия: 0.314
kNN: 0.351
