### Загружаем необходимые модули

In [1]:
import pandas as pd
import numpy as np

### Устанавливаем RNG seed для гарантии воспроизводимости (на всякий случай)

In [2]:
np.random.seed(3927)

### Загружаем данные

In [3]:
df = pd.read_csv("input/train.csv", index_col=False)

### Обрабатываем нечисловые (категориальные) признаки

In [4]:
cat_index = list(df.dtypes[df.dtypes == 'O'].index)
print(cat_index)

['f1', 'f11', 'f12', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118', 'f152']


In [5]:
df[cat_index].head()

Unnamed: 0,f1,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014-01-29,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014-04-07,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012-05-18,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013-02-08,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014-01-10,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


Признак f1 – дата -> разобьём на три столбца

In [6]:
ymd = df['f1'].str.split('-', expand=True).astype('int64').rename(columns={0: 'year', 1: 'month', 2: 'day'})
df = pd.concat([df, ymd], axis=1).drop(['f1'], axis=1)
cat_index = cat_index[1:] + ['year', 'month', 'day']

In [7]:
df[cat_index].head()

Unnamed: 0,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152,year,month,day
0,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good,2014,1,29
1,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data,2014,4,7
2,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory,2012,5,18
3,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor,2013,2,8
4,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory,2014,1,10


Признак f11 – неупорядоченный -> one-hot encoding

In [8]:
f11_unique = df['f11'].unique()
f11_unique

array(['Investment', 'OwnerOccupier'], dtype=object)

In [9]:
f11_encoding = dict(zip(f11_unique, np.eye(len(f11_unique))))
f11_encoded = pd.DataFrame([np.nan if k == np.nan else f11_encoding[k] for k in df['f11']], columns=f11_unique)
df = pd.concat([df, f11_encoded], axis=1).drop(['f11'], axis=1)
cat_index = cat_index[1:] + list(f11_unique)

In [10]:
df[cat_index].head()

Unnamed: 0,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152,year,month,day,Investment,OwnerOccupier
0,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good,2014,1,29,1.0,0.0
1,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data,2014,4,7,0.0,1.0
2,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory,2012,5,18,1.0,0.0
3,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor,2013,2,8,1.0,0.0
4,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory,2014,1,10,1.0,0.0


Признак f12 – неупорядоченный -> one-hot encoding

In [16]:
f12_unique = df['f12'].unique()
f12_unique[:5], f12_unique.shape

(array(['Juzhnoe Butovo', 'Poselenie Filimonkovskoe', 'Lomonosovskoe',
        'Juzhnoe Tushino', 'Ochakovo-Matveevskoe'], dtype=object),
 (146,))

In [17]:
f12_encoding = dict(zip(f12_unique, np.eye(len(f12_unique))))
f12_encoded = pd.DataFrame([np.nan if k == np.nan else f12_encoding[k] for k in df['f12']], columns=f12_unique)
df = pd.concat([df, f12_encoded], axis=1).drop(['f12'], axis=1)
cat_index = cat_index[1:] + list(f12_unique)

In [18]:
df[cat_index].head()

Unnamed: 0,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,...,Ostankinskoe,Poselenie Kokoshkino,Poselenie Rjazanovskoe,Poselenie Klenovskoe,Poselenie Voronovskoe,Severnoe,Vostochnoe,Poselenie Kievskij,Molzhaninovskoe,Poselenie Mihajlovo-Jarcevskoe
0,no,no,no,no,no,no,no,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,no,no,no,no,no,no,no,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,no,no,no,no,no,no,no,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,no,no,no,no,no,no,no,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,no,yes,no,no,no,no,no,no,no,no,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Признаки f29-f118 – бинарные -> кодируем как 0 и 1

In [23]:
pd.unique(df[cat_index[:12]].values.flatten())

array(['no', 'yes'], dtype=object)

In [24]:
bool_encoding = dict.fromkeys(cat_index[:12], {'no': 0, 'yes': 1})
df = df.replace(bool_encoding)
cat_index = cat_index[12:] + cat_index[:12]

In [25]:
df[cat_index].head()

Unnamed: 0,f152,year,month,day,Investment,OwnerOccupier,Juzhnoe Butovo,Poselenie Filimonkovskoe,Lomonosovskoe,Juzhnoe Tushino,...,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118
0,good,2014,1,29,1.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,no data,2014,4,7,0.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,satisfactory,2012,5,18,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,poor,2013,2,8,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,satisfactory,2014,1,10,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


Признак f152 – упорядоченный -> кодируем с помощью range

In [26]:
df['f152'].unique()

array(['good', 'no data', 'satisfactory', 'poor', 'excellent'],
      dtype=object)

In [27]:
f152_encoding = {'f152': dict(zip(['no data', 'poor', 'satisfactory', 'good', 'excellent'], range(5)))}
df = df.replace(f152_encoding)
cat_index = cat_index[1:] + cat_index[:1]

In [28]:
df[cat_index].head()

Unnamed: 0,year,month,day,Investment,OwnerOccupier,Juzhnoe Butovo,Poselenie Filimonkovskoe,Lomonosovskoe,Juzhnoe Tushino,Ochakovo-Matveevskoe,...,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,3
1,2014,4,7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2012,5,18,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,2
3,2013,2,8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,2014,1,10,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,2


### Обрабатываем пропуски

Пропуски могут быть только в столбцах типа 'float' (из-за np.nan), но некоторые из них на самом деле могут содержать только целочисленные значения.

Определим такие столбцы.

In [30]:
nan_columns = df.columns[df.isna().any()]
nan_columns

Index(['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f18', 'f21', 'f24',
       'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77',
       'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f87', 'f88', 'f97', 'f98',
       'f99', 'f160', 'f161', 'f162', 'f183', 'f184', 'f185', 'f206', 'f207',
       'f208', 'f229', 'f230', 'f231', 'f252', 'f253', 'f254', 'f269', 'f275',
       'f276', 'f277'],
      dtype='object')

In [51]:
df[nan_columns].head()

Unnamed: 0,f3,f4,f5,f6,f7,f8,f9,f10,f18,f21,...,f229,f230,f231,f252,f253,f254,f269,f275,f276,f277
0,38.0,7.0,10.0,1.0,2001.0,2.0,11.0,2.0,11926.0,24750.0,...,776.92,1230.77,1003.85,776.92,1230.77,1003.85,4.94,725.0,1175.0,950.0
1,,2.0,1.0,4.0,,2.0,10.0,,,,...,,,,,,,3.54,600.0,1000.0,800.0
2,16.0,2.0,,,,,,,3091.0,8359.0,...,748.31,1245.76,997.03,764.97,1271.57,1018.27,5.81,776.46,1293.67,1035.06
3,43.0,1.0,,,,,,,2707.0,6340.0,...,725.93,1240.74,983.33,688.24,1169.12,928.68,8.68,722.99,1211.23,967.11
4,28.0,3.0,5.0,2.0,1960.0,2.0,5.0,,1830.0,6484.0,...,746.15,1230.77,988.46,811.9,1369.05,1090.48,11.27,752.08,1260.42,1006.25


In [32]:
nan_is_int = df[nan_columns].replace({np.nan: 0}).applymap(lambda x: x.is_integer()).all()
nan_is_int[:5], nan_is_int[-5:]

(f3    True
 f4    True
 f5    True
 f6    True
 f7    True
 dtype: bool,
 f254    False
 f269    False
 f275    False
 f276    False
 f277    False
 dtype: bool)

In [46]:
nan_groups = nan_is_int.groupby(nan_is_int).groups  # Groups of "true" integers and floats
for k, v in nan_groups.items():
    print(f"{k}: {v}")

False: Index(['f87', 'f88', 'f97', 'f98', 'f160', 'f161', 'f162', 'f183', 'f184',
       'f185', 'f206', 'f207', 'f208', 'f229', 'f230', 'f231', 'f252', 'f253',
       'f254', 'f269', 'f275', 'f276', 'f277'],
      dtype='object')
True: Index(['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f18', 'f21', 'f24',
       'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77',
       'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f99'],
      dtype='object')


In [47]:
df_floats, df_ints = df[nan_groups[False]], df[nan_groups[True]]

In [48]:
df_floats.head()

Unnamed: 0,f87,f88,f97,f98,f160,f161,f162,f183,f184,f185,...,f229,f230,f231,f252,f253,f254,f269,f275,f276,f277
0,10.311668,0.859306,3.547837,42.574042,1000.0,1500.0,1250.0,1000.0,1500.0,1250.0,...,776.92,1230.77,1003.85,776.92,1230.77,1003.85,4.94,725.0,1175.0,950.0
1,151.852635,12.654386,14.667819,176.013824,,,,,,,...,,,,,,,3.54,600.0,1000.0,800.0
2,18.847768,1.570647,5.248798,62.985572,883.33,1416.67,1150.0,916.67,1466.67,1191.67,...,748.31,1245.76,997.03,764.97,1271.57,1018.27,5.81,776.46,1293.67,1035.06
3,10.015733,0.834644,3.701668,44.420019,625.0,1125.0,875.0,645.45,1136.36,890.91,...,725.93,1240.74,983.33,688.24,1169.12,928.68,8.68,722.99,1211.23,967.11
4,6.794893,0.566241,1.842696,22.11235,,,,,,,...,746.15,1230.77,988.46,811.9,1369.05,1090.48,11.27,752.08,1260.42,1006.25


In [49]:
df_ints.head()

Unnamed: 0,f3,f4,f5,f6,f7,f8,f9,f10,f18,f21,...,f75,f76,f77,f78,f79,f80,f81,f82,f83,f99
0,38.0,7.0,10.0,1.0,2001.0,2.0,11.0,2.0,11926.0,24750.0,...,1.0,84.0,5.0,1680.0,34.0,299.0,439.0,109.0,799.0,47.0
1,,2.0,1.0,4.0,,2.0,10.0,,,,...,,,,,,,,,,24.0
2,16.0,2.0,,,,,,,3091.0,8359.0,...,0.0,0.0,0.0,210.0,0.0,0.0,153.0,33.0,24.0,42.0
3,43.0,1.0,,,,,,,2707.0,6340.0,...,0.0,0.0,0.0,331.0,3.0,4.0,211.0,93.0,20.0,4.0
4,28.0,3.0,5.0,2.0,1960.0,2.0,5.0,,1830.0,6484.0,...,0.0,0.0,0.0,299.0,0.0,2.0,134.0,106.0,57.0,33.0


Для экономии времени будем восстанавливать пропуски следующим образом:  
- 'int' -> median (при KNN получается многократно дольше + нужно приводить обратно к целым значениям)
- 'float' -> KNN

In [50]:
from sklearn.impute import SimpleImputer, KNNImputer

imputer_float = KNNImputer(n_neighbors=10, weights='distance')
imputer_int = SimpleImputer(strategy='median')

In [52]:
imp_float = pd.DataFrame(imputer_float.fit_transform(df_floats), columns=df_floats.columns)
imp_int = pd.DataFrame(imputer_int.fit_transform(df_ints), columns=df_ints.columns)
df = pd.concat([df.drop(nan_columns, axis=1), imp_float, imp_int], axis=1)

In [53]:
df[nan_columns].head()

Unnamed: 0,f3,f4,f5,f6,f7,f8,f9,f10,f18,f21,...,f229,f230,f231,f252,f253,f254,f269,f275,f276,f277
0,38.0,7.0,10.0,1.0,2001.0,2.0,11.0,2.0,11926.0,24750.0,...,776.92,1230.77,1003.85,776.92,1230.77,1003.85,4.94,725.0,1175.0,950.0
1,30.0,2.0,1.0,4.0,1979.0,2.0,10.0,2.0,2854.0,7377.0,...,869.547432,1342.790877,1106.168734,621.393832,988.717881,805.056709,3.54,600.0,1000.0,800.0
2,16.0,2.0,12.0,1.0,1979.0,2.0,6.0,2.0,3091.0,8359.0,...,748.31,1245.76,997.03,764.97,1271.57,1018.27,5.81,776.46,1293.67,1035.06
3,43.0,1.0,12.0,1.0,1979.0,2.0,6.0,2.0,2707.0,6340.0,...,725.93,1240.74,983.33,688.24,1169.12,928.68,8.68,722.99,1211.23,967.11
4,28.0,3.0,5.0,2.0,1960.0,2.0,5.0,2.0,1830.0,6484.0,...,746.15,1230.77,988.46,811.9,1369.05,1090.48,11.27,752.08,1260.42,1006.25


### Трансформируем данные

Очевидно, что данные достаточно сильно разнятся по величине – от единиц до десятков тысяч.  
Значит, нужно их отнормировать. Для этого используем Normalizer, т.е. отнормируем "по строкам"  
(при нормировке "по столбцам" алгоритмы работали хуже)

In [62]:
from sklearn.preprocessing import Normalizer

df = pd.DataFrame(Normalizer().fit_transform(df), columns=df.columns)

In [63]:
df.head()

Unnamed: 0,id,f2,f13,f14,f15,f16,f17,f19,f20,f22,...,f75,f76,f77,f78,f79,f80,f81,f82,f83,f99
0,3.593114e-08,2e-06,0.939784,0.006405,4.952972e-09,1.477358e-09,0.000506,3.952426e-07,0.000535,4.671049e-07,...,3.593114e-08,3e-06,1.796557e-07,6e-05,1.221659e-06,1.074341e-05,1.6e-05,4e-06,2.870898e-05,1.688764e-06
1,5.5627e-08,2e-06,0.994275,7.1e-05,1.526938e-08,9.639069e-10,5e-06,0.0,5e-06,0.0,...,0.0,0.0,0.0,8e-06,0.0,5.5627e-08,4e-06,2e-06,6.67524e-07,6.67524e-07
2,3.959775e-07,4e-06,0.439062,0.011067,6.847858e-09,0.0,0.000619,6.599626e-07,0.000664,9.239476e-07,...,0.0,0.0,0.0,2.8e-05,0.0,0.0,2e-05,4e-06,3.16782e-06,5.543686e-06
3,4.733741e-07,5e-06,0.933456,0.012359,2.634875e-08,2.234134e-08,0.000709,3.550306e-07,0.000726,4.733741e-07,...,0.0,0.0,0.0,3.9e-05,3.550306e-07,4.733741e-07,2.5e-05,1.1e-05,2.36687e-06,4.733741e-07
4,2.659342e-07,2e-06,0.93218,0.006321,7.093678e-09,1.646022e-08,0.000396,1.063737e-07,0.00041,1.595605e-07,...,0.0,0.0,0.0,1.6e-05,0.0,1.063737e-07,7e-06,6e-06,3.03165e-06,1.755166e-06


### Перепишем все рассмотренные выше операции в виде методов

In [54]:
def process_f1(df_train, df_test, index):
    ymd_train = df_train['f1'].str.split('-', expand=True).astype('int64').rename(columns={0: 'year', 1: 'month', 2: 'day'})
    ymd_test = df_test['f1'].str.split('-', expand=True).astype('int64').rename(columns={0: 'year', 1: 'month', 2: 'day'})
    return pd.concat([df_train, ymd_train], axis=1).drop(['f1'], axis=1), \
           pd.concat([df_test, ymd_test], axis=1).drop(['f1'], axis=1), \
           index[1:]

In [55]:
def process_f11(df_train, df_test, index):
    f11_unique = df_train['f11'].unique()
    f11_encode = dict(zip(f11_unique, np.eye(len(f11_unique))))
    encode_train = pd.DataFrame([np.nan if k == np.nan else f11_encode[k] for k in df_train['f11']], columns=f11_unique)
    encode_test = pd.DataFrame([np.nan if k == np.nan else f11_encode[k] for k in df_test['f11']], columns=f11_unique)
    return pd.concat([df_train, encode_train], axis=1).drop(['f11'], axis=1), \
           pd.concat([df_test, encode_test], axis=1).drop(['f11'], axis=1), \
           index[1:]

In [56]:
def process_f12(df_train, df_test, index):
    f12_unique = df_train['f12'].unique()
    f12_encode = dict(zip(f12_unique, np.eye(len(f12_unique))))
    encode_train = pd.DataFrame([np.nan if k == np.nan else f12_encode[k] for k in df_train['f12']], columns=f12_unique)
    encode_test = pd.DataFrame([np.nan if k == np.nan else f12_encode[k] for k in df_test['f12']], columns=f12_unique)
    return pd.concat([df_train, encode_train], axis=1).drop(['f12'], axis=1), \
           pd.concat([df_test, encode_test], axis=1).drop(['f12'], axis=1), \
           index[1:]

In [57]:
def process_bool(df_train, df_test, index):
    bool_index = index[:-1]
    bool_encode = {'no': 0, 'yes': 1}
    encoding = dict.fromkeys(bool_index, bool_encode)
    return df_train.replace(encoding), \
           df_test.replace(encoding), \
           [index[-1]]

In [58]:
def process_f152(df_train, df_test, index):
    f152_encode = dict(zip(['no data', 'poor', 'satisfactory', 'good', 'excellent'], range(5)))
    encoding = {'f152': f152_encode}
    return df_train.replace(encoding), \
           df_test.replace(encoding), \
           index

In [59]:
def process_categorical(df_train, df_test):
    cat = df_train.dtypes[df_train.dtypes == type(object)]
    cat_index = list(cat.index)
    
    df_train_res, df_test_res, cat_index = process_f1(df_train, df_test, cat_index)
    df_train_res, df_test_res, cat_index = process_f11(df_train_res, df_test_res, cat_index)
    df_train_res, df_test_res, cat_index = process_f12(df_train_res, df_test_res, cat_index)
    df_train_res, df_test_res, cat_index = process_bool(df_train_res, df_test_res, cat_index)
    df_train_res, df_test_res, cat_index = process_f152(df_train_res, df_test_res, cat_index)
    return df_train_res, df_test_res

In [60]:
from sklearn.impute import SimpleImputer, KNNImputer

def impute(df):
    # Setup imputers
    imputer_float = KNNImputer(n_neighbors=10, weights='distance')
    imputer_int = SimpleImputer(strategy='median')

    # Find columns, containing NaN, and split into int/float
    nan_columns = df.columns[df.isna().any()].to_list()
    nan_is_int= df[nan_columns].replace({np.nan: 0}).applymap(lambda x: x.is_integer()).all()
    nan_groups = nan_is_int.groupby(nan_is_int).groups  # Groups of "true" integers and floats
    df_floats, df_ints = df[nan_groups[False]], df[nan_groups[True]]

    # Impute missing values
    imp_float = pd.DataFrame(imputer_float.fit_transform(df_floats), columns=df_floats.columns)
    imp_int = pd.DataFrame(imputer_int.fit_transform(df_ints), columns=df_ints.columns)

    return pd.concat([df.drop(nan_columns, axis=1), imp_float, imp_int], axis=1)

def impute_na(df_train, df_test):
    return impute(df_train), impute(df_test)

In [61]:
from sklearn.preprocessing import Normalizer

def transform(df_train, df_test):
    transformer = Normalizer()
    return pd.DataFrame(transformer.fit_transform(df_train), columns=df_train.columns), \
           pd.DataFrame(transformer.fit_transform(df_test), columns=df_test.columns)

In [64]:
def preprocess(df_train, df_test):
    df_train_res, df_test_res = process_categorical(df_train, df_test)
    df_train_res, df_test_res = impute_na(df_train_res, df_test_res)
    df_train_res, df_test_res = transform(df_train_res, df_test_res)
    return df_train_res, df_test_res

In [65]:
def prepare_dataset(df_train, df_test):
    return df_train.drop(['id', 'target'], axis=1), \
           df_train['target'], \
           df_test.drop('id', axis=1)

In [66]:
def load(train_path, test_path):
    df_train = pd.read_csv(train_path, index_col=False)
    df_test = pd.read_csv(test_path, index_col=False)
    X_train, y_train, X_test = prepare_dataset(df_train, df_test)
    X_train, X_test = preprocess(X_train, X_test)
    return X_train, y_train, X_test

In [67]:
X_train, y_train, X_test = load("input/train.csv", "input/test.csv")

In [68]:
X_train.head()

Unnamed: 0,f2,f13,f14,f15,f16,f17,f19,f20,f22,f23,...,f75,f76,f77,f78,f79,f80,f81,f82,f83,f99
0,3e-06,0.999836,0.006815,5.269468e-09,1.571761e-09,0.000538,4.204987e-07,0.000569,4.96953e-07,3.822715e-08,...,3.822715e-08,3e-06,1.911358e-07,6.4e-05,1.299723e-06,1.142992e-05,1.7e-05,4e-06,3.05435e-05,1.796676e-06
1,2e-06,0.999989,7.1e-05,1.535712e-08,9.694461e-10,5e-06,0.0,5e-06,0.0,0.0,...,0.0,0.0,0.0,8e-06,0.0,5.594666e-08,4e-06,2e-06,6.713599e-07,6.713599e-07
2,7e-06,0.776825,0.01958,1.211579e-08,0.0,0.001095,1.16766e-06,0.001174,1.634724e-06,2.33532e-07,...,0.0,0.0,0.0,4.9e-05,0.0,0.0,3.6e-05,8e-06,5.604768e-06,9.808343e-06
3,5e-06,0.960755,0.012721,2.711932e-08,2.299471e-08,0.000729,3.654134e-07,0.000748,4.872179e-07,0.0,...,0.0,0.0,0.0,4e-05,3.654134e-07,4.872179e-07,2.6e-05,1.1e-05,2.43609e-06,4.872179e-07
4,3e-06,0.997675,0.006765,7.592085e-09,1.761673e-08,0.000424,1.138476e-07,0.000439,1.707714e-07,0.0,...,0.0,0.0,0.0,1.7e-05,0.0,1.138476e-07,8e-06,6e-06,3.244656e-06,1.878485e-06


In [69]:
y_train.head()

0    9500000
1    3837949
2    6250000
3    2000000
4    6700000
Name: target, dtype: int64

In [70]:
X_test.head()

Unnamed: 0,f2,f13,f14,f15,f16,f17,f19,f20,f22,f23,...,f75,f76,f77,f78,f79,f80,f81,f82,f83,f99
0,2e-06,0.999957,0.000157,1.943485e-08,2.788976e-10,1.1e-05,0.0,1e-05,0.0,0.0,...,0.0,0.0,0.0,1.1e-05,0.0,7.831653e-08,5e-06,3e-06,9.397984e-07,9.397984e-07
1,3e-06,0.999973,0.000157,1.943516e-08,2.789021e-10,1.1e-05,0.0,1e-05,0.0,0.0,...,0.0,0.0,0.0,1.1e-05,0.0,7.83178e-08,5e-06,3e-06,9.398136e-07,9.398136e-07
2,3e-06,0.5118,0.005279,4.583622e-09,5.473705e-12,0.000297,2.80154e-07,0.000448,2.80154e-07,7.003851e-08,...,0.0,3.501925e-07,7.003851e-08,4.6e-05,1.8e-05,7.354043e-06,1.1e-05,5e-06,3.992195e-06,3.501925e-07
3,2e-06,0.995239,0.009234,2.009066e-08,2.44335e-09,0.00055,4.753063e-07,0.000565,5.347196e-07,0.0,...,0.0,6.535462e-07,0.0,2.7e-05,3e-06,2.495358e-06,6e-06,1e-05,5.347196e-06,1.544746e-06
4,2.5e-05,0.926716,0.012046,5.140045e-08,0.0,0.000632,9.867259e-07,0.000666,9.867259e-07,0.0,...,0.0,0.0,0.0,4.3e-05,0.0,0.0,2.8e-05,4e-06,1.065664e-05,6.512391e-06


In [73]:
X_train.shape, y_train.shape, X_test.shape

((24376, 438), (24376,), (6095, 438))

Удостоверимся, что после предобработки все пропуски действительно были восстановлены

In [71]:
True in X_train.isna().values

False

In [72]:
True in X_test.isna().values

False

### Определим тренировочный метод

Качество работы алгоритма осуществляется по RMSLE -> определим scorer на основе этой ошибки.  
По умолчанию, алгоритм может выдавать отрицательные предсказания, но RMSLE не допускает значений < -1.  
Значит, необходимо вынести знак "минус" (если есть) из-под логарифма

In [21]:
from sklearn.metrics import make_scorer

def rmsle(y, y_pred, **kwargs):
    # Allow negative predictions by incorporating prediction sign
    terms = [np.sign(pred) * np.log(1 + np.sign(pred) * pred) - np.log(1 + targ) for pred, targ in zip(y_pred, y)]
    return np.sqrt(np.sum(np.power(terms, 2.0)) / len(y))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

В тренировочном методе дополнительно можем определить гиперпараметры модели,  
а также оценить качество её работы по 5-fold cross-validation.

In [22]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

def train(model, X, y, score=True, cv=5, tune=False, params=None, n_iter=10):
    if tune:
        if score:
            print(f"Pre-tune CV scores: {cross_val_score(model, X, y, cv=cv, scoring=rmsle_scorer)}")
        
        rscv = RandomizedSearchCV(model, params, n_iter=n_iter, scoring=rmsle_scorer, refit=True, cv=cv, random_state=3927)
        rscv.fit(X, y)
        print(f"Best params: {rscv.best_params_}")
        model_tr = rscv.best_estimator_

        if score:
            print(f"Post-tune CV scores: {cross_val_score(model_tr, X, y, cv=cv, scoring=rmsle_scorer)}")
    else:
        model_tr = model.fit(X, y)
    
        if score:
            print(f"CV scores: {cross_val_score(model_tr, X, y, cv=cv, scoring=rmsle_scorer)}")

    return model_tr

### Определим тестировочный метод

In [23]:
def test(model, X):
    pred = model.predict(X)
    data = list(zip(range(1, len(pred) + 1), pred))
    return pd.DataFrame(data, columns=['id', 'prediction'])

### Выбор модели

#### Ridge

In [72]:
from sklearn.linear_model import Ridge
from scipy.stats import loguniform, uniform

In [73]:
model = Ridge(random_state=3927)
params = {'alpha': loguniform(1, 10000)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=100)
model

Pre-tune CV scores: [-0.60000799 -0.60348582 -0.60745596 -0.60441781 -0.62379452]
Best params: {'alpha': 1.738157608461319}
Post-tune CV scores: [-0.60021189 -0.6031775  -0.60753366 -0.60417195 -0.62365405]


Ridge(alpha=1.738157608461319, random_state=3927)

In [74]:
model = Ridge(random_state=3927)
params = {'alpha': loguniform(0.001, 1)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=100)
model

Pre-tune CV scores: [-0.60000799 -0.60348582 -0.60745596 -0.60441781 -0.62379452]
Best params: {'alpha': 0.9653009578533324}
Post-tune CV scores: [-0.59999686 -0.60353432 -0.60745569 -0.60444701 -0.62382557]


Ridge(alpha=0.9653009578533324, random_state=3927)

#### ElasticNet

In [75]:
from sklearn.linear_model import ElasticNet
from scipy.stats import loguniform, uniform

In [76]:
model = ElasticNet(max_iter=500, selection='random', random_state=3927)
params = {'alpha': loguniform(0.01, 100), 'l1_ratio': uniform(0.1, 0.8)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.61793116 -0.61371143 -0.62756285 -0.62152099 -0.63506551]
Best params: {'alpha': 0.034339915870309524, 'l1_ratio': 0.7688000396844558}
Post-tune CV scores: [-0.60329539 -0.60547727 -0.61235485 -0.60602077 -0.62584919]


ElasticNet(alpha=0.034339915870309524, l1_ratio=0.7688000396844558,
           max_iter=500, random_state=3927, selection='random')

#### DecisionTree

In [77]:
from sklearn.tree import DecisionTreeRegressor

In [78]:
model = DecisionTreeRegressor(random_state=3927)
params = {"max_depth" : [None, 1, 3, 5, 7, 9],
          "min_samples_leaf": [1, 3, 5, 7, 9],
          "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
          "max_leaf_nodes": [None, 10, 30, 50, 70, 90]}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=20)
model

Pre-tune CV scores: [-0.66235402 -0.66574107 -0.65846784 -0.66679303 -0.66972977]
Best params: {'min_weight_fraction_leaf': 0.1, 'min_samples_leaf': 3, 'max_leaf_nodes': 70, 'max_depth': None}
Post-tune CV scores: [-0.59148096 -0.59375481 -0.60045088 -0.59816698 -0.61848356]


DecisionTreeRegressor(max_leaf_nodes=70, min_samples_leaf=3,
                      min_weight_fraction_leaf=0.1, random_state=3927)

#### SGD

In [80]:
from sklearn.linear_model import SGDRegressor
from scipy.stats import loguniform, uniform

In [81]:
model = SGDRegressor(max_iter=5000, penalty='l2', random_state=3927)
params = {'alpha': uniform(0.001, 1000)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.60184232 -0.60206038 -0.60952616 -0.60570489 -0.62559363]
Best params: {'alpha': 75.64173072984201}
Post-tune CV scores: [-0.62324302 -0.61909415 -0.62862468 -0.6263908  -0.63351724]


SGDRegressor(alpha=75.64173072984201, max_iter=5000, random_state=3927)

In [82]:
model = SGDRegressor(max_iter=5000, penalty='elasticnet', random_state=3927)
params = {'alpha': loguniform(0.001, 1000), 'l1_ratio': uniform(0.1, 0.8)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.6018405  -0.60205898 -0.60952005 -0.60570345 -0.62559216]
Best params: {'alpha': 0.005012444501918985, 'l1_ratio': 0.2919813353920865}
Post-tune CV scores: [-0.60308389 -0.60236847 -0.60717254 -0.60343441 -0.62382585]


SGDRegressor(alpha=0.005012444501918985, l1_ratio=0.2919813353920865,
             max_iter=5000, penalty='elasticnet', random_state=3927)

In [83]:
model = SGDRegressor(max_iter=5000, penalty='elasticnet', random_state=3927)
params = {'alpha': uniform(0.0001, 1), 'l1_ratio': uniform(0.1, 0.8)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.6018405  -0.60205898 -0.60952005 -0.60570345 -0.62559216]
Best params: {'alpha': 0.1340498067109922, 'l1_ratio': 0.7688000396844558}
Post-tune CV scores: [-0.60382075 -0.60341686 -0.61697702 -0.60296978 -0.62210577]


SGDRegressor(alpha=0.1340498067109922, l1_ratio=0.7688000396844558,
             max_iter=5000, penalty='elasticnet', random_state=3927)

In [38]:
model = SGDRegressor(max_iter=5000, penalty='elasticnet', random_state=3927)
params = {'alpha': uniform(0.005, 0.05), 'l1_ratio': uniform(0.1, 0.8)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-train CV scores: [-0.52058201 -0.80294035 -0.52343453 -0.67453236 -0.66563684]
Best params: {'alpha': 0.01169749033554961, 'l1_ratio': 0.7688000396844558}
Post-train CV scores: [-0.51815211 -0.53837724 -0.53557793 -0.54162234 -0.56373763]


SGDRegressor(alpha=0.01169749033554961, l1_ratio=0.7688000396844558,
             max_iter=5000, penalty='elasticnet', random_state=3927)

#### Stacking

In [84]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import StackingRegressor

In [86]:
estimators = [
    ('RIDGE', Ridge(alpha=1, random_state=3927)),
    ('ELNET', ElasticNet(max_iter=500, alpha=0.01, l1_ratio=0.6, selection='random', random_state=3927)),
    ('DTREE', DecisionTreeRegressor(max_leaf_nodes=70, min_samples_leaf=3,
                                    min_weight_fraction_leaf=0.1, random_state=3927)),
    ('SGDR', SGDRegressor(max_iter=5000, penalty='elasticnet', alpha=0.01, l1_ratio=0.6, random_state=3927))
]
model = StackingRegressor(estimators, n_jobs=-1)

model = train(model, X_train, y_train)
model

CV scores: [-27.28249445 -28.71580287 -15.66973739 -22.25587526 -29.54273279]


StackingRegressor(estimators=[('RIDGE', Ridge(alpha=1, random_state=3927)),
                              ('ELNET',
                               ElasticNet(alpha=0.01, l1_ratio=0.6,
                                          max_iter=500, random_state=3927,
                                          selection='random')),
                              ('DTREE',
                               DecisionTreeRegressor(max_leaf_nodes=70,
                                                     min_samples_leaf=3,
                                                     min_weight_fraction_leaf=0.1,
                                                     random_state=3927)),
                              ('SGDR',
                               SGDRegressor(alpha=0.01, l1_ratio=0.6,
                                            max_iter=5000, penalty='elasticnet',
                                            random_state=3927))],
                  n_jobs=-1)

In [87]:
estimators = [
    ('RIDGE', Ridge(alpha=1, random_state=3927)),
    ('ELNET', ElasticNet(max_iter=500, alpha=0.01, l1_ratio=0.6, selection='random', random_state=3927)),
    ('SGDR', SGDRegressor(max_iter=5000, penalty='elasticnet', alpha=0.01, l1_ratio=0.6, random_state=3927))
]
final = DecisionTreeRegressor(max_leaf_nodes=70, min_samples_leaf=3,
                                    min_weight_fraction_leaf=0.1, random_state=3927)
model = StackingRegressor(estimators, final_estimator=final, n_jobs=-1)

model = train(model, X_train, y_train)
model

CV scores: [-0.59551544 -0.5980722  -0.60405917 -0.60086041 -0.61951078]


StackingRegressor(estimators=[('RIDGE', Ridge(alpha=1, random_state=3927)),
                              ('ELNET',
                               ElasticNet(alpha=0.01, l1_ratio=0.6,
                                          max_iter=500, random_state=3927,
                                          selection='random')),
                              ('SGDR',
                               SGDRegressor(alpha=0.01, l1_ratio=0.6,
                                            max_iter=5000, penalty='elasticnet',
                                            random_state=3927))],
                  final_estimator=DecisionTreeRegressor(max_leaf_nodes=70,
                                                        min_samples_leaf=3,
                                                        min_weight_fraction_leaf=0.1,
                                                        random_state=3927),
                  n_jobs=-1)

#### Bagging

In [24]:
from sklearn.ensemble import BaggingRegressor
from scipy.stats import randint

In [102]:
model = BaggingRegressor(n_estimators=10, n_jobs=-1, random_state=3927)
params = {'max_samples': randint(1, 100), 'max_features': randint(1, 20)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=30)
model

Pre-tune CV scores: [-0.47788705 -0.48441964 -0.49308377 -0.50106005 -0.50426299]
Best params: {'max_features': 8, 'max_samples': 96}
Post-tune CV scores: [-0.60069901 -0.59479298 -0.60073053 -0.59615137 -0.62399794]


BaggingRegressor(max_features=8, max_samples=96, n_jobs=-1, random_state=3927)

In [110]:
model = BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.46414895 -0.4710153  -0.48348008 -0.48675993 -0.49328828]


BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=3927)

In [25]:
model = BaggingRegressor(n_estimators=1000, n_jobs=-1, random_state=3927)

model = train(model, X_train, y_train, score=False)
model

BaggingRegressor(n_estimators=1000, n_jobs=-1, random_state=3927)

In [90]:
base = Ridge(alpha=1, random_state=3927)
model = BaggingRegressor(base_estimator=base, n_estimators=20, n_jobs=-1, random_state=3927)
params = {'max_samples': randint(1, 100), 'max_features': randint(1, 20)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.60038772 -0.60371702 -0.60781098 -0.6049524  -0.62425982]
Best params: {'max_features': 17, 'max_samples': 30}
Post-tune CV scores: [-0.61641832 -0.62776915 -0.63349879 -0.62867303 -0.63003017]


BaggingRegressor(base_estimator=Ridge(alpha=1, random_state=3927),
                 max_features=17, max_samples=30, n_estimators=20, n_jobs=-1,
                 random_state=3927)

In [91]:
base = SGDRegressor(max_iter=5000, alpha=0.01, l1_ratio=0.6, penalty='elasticnet', random_state=3927)
model = BaggingRegressor(base_estimator=base, n_estimators=2, n_jobs=-1, random_state=3927)
params = {'max_samples': randint(1, 100), 'max_features': randint(1, 20)}

model = train(model, X_train, y_train, tune=True, params=params, n_iter=10)
model

Pre-tune CV scores: [-0.60772179 -0.60589377 -0.60805996 -0.60706383 -0.62722213]
Best params: {'max_features': 12, 'max_samples': 60}
Post-tune CV scores: [-0.61628121 -0.60219481 -0.61696946 -0.62098255 -0.62340703]


BaggingRegressor(base_estimator=SGDRegressor(alpha=0.01, l1_ratio=0.6,
                                             max_iter=5000,
                                             penalty='elasticnet',
                                             random_state=3927),
                 max_features=12, max_samples=60, n_estimators=2, n_jobs=-1,
                 random_state=3927)

#### AdaBoost

In [92]:
from sklearn.ensemble import AdaBoostRegressor

In [93]:
model = AdaBoostRegressor(n_estimators=10, loss='square', random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.62612785 -0.63610368 -0.70419332 -0.68575288 -0.65371367]


AdaBoostRegressor(loss='square', n_estimators=10, random_state=3927)

In [96]:
base = Ridge(alpha=1, random_state=3927)
model = AdaBoostRegressor(base_estimator=base, n_estimators=2, loss='square', random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.59987581 -0.60262106 -0.60742434 -0.6040483  -0.62469991]


AdaBoostRegressor(base_estimator=Ridge(alpha=1, random_state=3927),
                  loss='square', n_estimators=2, random_state=3927)

In [97]:
base = SGDRegressor(max_iter=5000, alpha=0.01, l1_ratio=0.6, penalty='elasticnet', random_state=3927)
model = AdaBoostRegressor(base_estimator=base, n_estimators=2, loss='square', random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.5987626  -0.60642201 -0.61348093 -0.60559309 -0.63237204]


AdaBoostRegressor(base_estimator=SGDRegressor(alpha=0.01, l1_ratio=0.6,
                                              max_iter=5000,
                                              penalty='elasticnet',
                                              random_state=3927),
                  loss='square', n_estimators=2, random_state=3927)

#### GradientBoosting

In [98]:
from sklearn.ensemble import GradientBoostingRegressor

In [99]:
model = GradientBoostingRegressor(n_estimators=10, random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.58158919 -0.57946082 -0.5900043  -0.58762625 -0.6051725 ]


GradientBoostingRegressor(n_estimators=10, random_state=3927)

In [100]:
model = GradientBoostingRegressor(n_estimators=100, random_state=3927)

model = train(model, X_train, y_train)
model

CV scores: [-0.53934113 -0.53961423 -0.55229335 -0.55318057 -0.56992807]


GradientBoostingRegressor(random_state=3927)

Выводы:  
1. Большинство стандартных моделей показывает примерно одинаковые результаты (score ~ 0.6), кроме SGD (score ~ 0.53)
2. Бустинги оказываются даже хуже SGD:
    - AdaBoost показывал очень плохие результаты (score > 1) при n_estimators > 2; в противном случае всё равно достигался лишь score ~ 0.6
    - GradientBoosting получил score ~ 0.55 (хуже, чем SGD) при тренировке 10 минут (дольше, чем SGD); возможно, стоило понастраивать гиперпараметры
3. Bagging показал наилучший результат на public-е, и даже лучше на private-е:
    - n_estimators = 10 -> public = 0.51010, private = 0.48320 (time ~ 0.15 m)
    - n_estimators = 100 -> public = 0.49475, private = 0.47458 (time ~ 1.5 m)
    - n_estimators = 1000 -> public = 0.49411, private = 0.47381 (time ~ 15 m)
    
    Как видно, при последнем переходе заметного улучшения уже не наблюдается, однако время тренировки возрастает многократно.

### Постобработка

Тестируем

In [26]:
df_pred = test(model, X_test)
df_pred.head()

Unnamed: 0,id,prediction
0,1,6771005.0
1,2,4947073.0
2,3,3652721.0
3,4,7189662.0
4,5,34107940.0


Удостоверимся, что нет отрицательных значений (так как в тренировочных метках все значения положительные);  
при не очень удачной предобработке такие значения могут появиться.

In [74]:
y_train[y_train < 0]

Series([], Name: target, dtype: int64)

In [27]:
df_pred['prediction'][df_pred['prediction'] <= 100000]

Series([], Name: prediction, dtype: float64)

Экспортируем

In [28]:
df_pred.to_csv("output/submission.csv", index=False)