## Постановка задачи
Загрузим данные, приведем их к числовым, заполним пропуски, нормализуем данные и оптимизируем память.

Построим LightGBM модель с оптимальными параметрами. Выгрузим результаты расчетов в требуемом формате.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz
* https://video.ittensive.com/machine-learning/prudential/test.csv.gz
* https://video.ittensive.com/machine-learning/prudential/sample_submission.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

In [1]:
GRAIN = 11
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import lightgbm as lgb
from sklearn import preprocessing
from etl_utils import reduce_mem_usage


data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")

### Предобработка данных

In [2]:
def data_preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df['Product_Info_2_1'] = df['Product_Info_2'].str.slice(0, 1)
    df['Product_Info_2_2'] = pd.to_numeric(df['Product_Info_2'].str.slice(1, 2))
    df = df.drop('Product_Info_2', axis='columns')

    onehot_df = pd.get_dummies(df['Product_Info_2_1'])
    onehot_df.columns = ['Product_Info_2_1' + column for column in onehot_df.columns]
    df = pd.merge(left=df, right=onehot_df, left_index=True, right_index=True).drop('Product_Info_2_1', axis=1).fillna(-1)
    del onehot_df

    if 'Response' in df.columns:
        df['Response'] = df['Response'] - 1
    return df

In [3]:
data = data_preprocess(data)

### Набор столбцов для расчета

In [4]:
columns_groups = ['Insurance_History', 'InsurеdInfo', 'Medical_Keyword', 'Family_Hist', 'Medical_History', 'Product_Info']
columns = ['Wt', 'Ht', 'Ins_Age', 'BMI']
for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keyword_31', 'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_34', 'Medical_Keyword_35', 'Medical_Keyword_36', 'M

### Нормализация данных

In [5]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(data[columns]))
columns_transformed = data_transformed.columns
data_transformed['Response'] = data['Response']
data_transformed = reduce_mem_usage(data_transformed)

Потребление памяти меньше на 40.49 Мб (-75.1%)


### LightGBM
Рассчитаем модель по оптимальным показателям. Возможно уточнение/дообучение уже на всей выборке без разбиения на обучающую/тестовую.

In [6]:
model = lgb.LGBMRegressor(
    random_state=GRAIN, max_depth=17, min_child_samples=18, num_leaves=35, n_estimators=1000
).fit(data_transformed[columns_transformed], data['Response'])

### Загрузка данных для расчетов
Применим построенную модель для расчета актуальных данных.

Будем использовать ранее рассчитанные значения нормализация данных.

In [10]:
data_test = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/test.csv.gz")
data_test = reduce_mem_usage(data_preprocess(data_test))
data_test_transformed = reduce_mem_usage(pd.DataFrame(scaler.transform(data_test[columns])))
data_test_transformed.info()

Потребление памяти меньше на 16.34 Мб (-84.9%)
Потребление памяти меньше на 13.35 Мб (-75.0%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 118 entries, 0 to 117
dtypes: float16(118)
memory usage: 4.4 MB


### Предсказание данных и оценка модели
LightGBM возвращает дробное значение класса, его нужно округлить.

Дополнительно приведем значение класса к диапазону 1...8

In [11]:
data_test_transformed['Response'] = np.round(model.predict(data_test_transformed)) + 1
data_test_transformed['Response'] = (
    data_test_transformed['Response'].apply(lambda x: 1 if x < 1 else 8 if x > 8 else x)
)
data_test_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,109,110,111,112,113,114,115,116,117,Response
0,0.52002,1.00293,1.045898,0.022141,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,...,-0.083679,0.44165,-0.149292,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,4.0
1,0.215454,0.266357,1.123047,0.125977,-1.634766,-0.169434,0.862305,-1.013672,0.864258,-0.928711,...,-0.083679,0.44165,-0.149292,-0.666992,1.604492,-0.14209,-0.128906,-1.332031,-0.215942,6.0
2,0.308594,0.022919,0.89502,0.405762,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,...,-0.083679,0.44165,-0.149292,-0.200073,-0.623535,-0.14209,-0.128906,0.750977,-0.215942,6.0
3,-0.278076,-0.707031,0.592773,0.144043,-1.634766,-0.169434,0.862305,-1.013672,0.862305,0.100891,...,-0.083679,-2.263672,-0.149292,-1.133789,1.604492,-0.14209,-0.128906,-1.332031,-0.215942,7.0
4,-0.51416,-0.463867,-0.54248,-0.333496,0.611816,-0.169434,-1.15918,1.100586,-1.15625,1.130859,...,-0.083679,0.44165,-0.149292,-1.133789,1.604492,-0.14209,-0.128906,-1.332031,-0.215942,6.0


### Формирование результатов
Загрузим пример данных для отправки и заменим в нем столбец Response на рассчитанный ранее.

In [13]:
submission = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/sample_submission.csv.gz")
submission.head()

Unnamed: 0,Id,Response
0,1,8
1,3,8
2,4,8
3,9,8
4,12,8


In [14]:
submission['Response'] = data_test_transformed['Response'].astype('int8')
submission.head()

Unnamed: 0,Id,Response
0,1,4
1,3,6
2,4,6
3,9,7
4,12,6


### Выгрузка результатов

In [15]:
submission.to_csv('submission', index=False)