# Imports

In [1]:
import pandas as pd
import numpy as np

# Input

In [2]:
df = pd.read_csv("input/train.csv", index_col=False)

In [3]:
df.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014-01-29,69,38.0,7.0,10.0,1.0,2001.0,2.0,11.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014-04-07,55,,2.0,1.0,4.0,,2.0,10.0,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012-05-18,30,16.0,2.0,,,,,,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013-02-08,44,43.0,1.0,,,,,,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014-01-10,45,28.0,3.0,5.0,2.0,1960.0,2.0,5.0,...,20,2,0,4,16,1,4,47,5,6700000


In [4]:
df.shape

(24376, 292)

# Preprocessing

## Check types

In [5]:
df.dtypes.value_counts()

int64      157
float64    119
object      16
dtype: int64

## Get notnums w/ indices

In [6]:
notnum = df.dtypes[df.dtypes == type(object)]
notnum

f1      object
f11     object
f12     object
f29     object
f33     object
f34     object
f35     object
f36     object
f37     object
f38     object
f39     object
f40     object
f106    object
f114    object
f118    object
f152    object
dtype: object

In [7]:
notnum_index = list(notnum.index)
print(notnum_index)

['f1', 'f11', 'f12', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118', 'f152']


In [8]:
df[notnum_index].head()

Unnamed: 0,f1,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014-01-29,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014-04-07,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012-05-18,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013-02-08,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014-01-10,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f1

In [9]:
ymd = df['f1'].str.split('-', expand=True).astype('int64').rename(columns={0: 'f1y', 1: 'f1m', 2: 'f1d'})
ymd

Unnamed: 0,f1y,f1m,f1d
0,2014,1,29
1,2014,4,7
2,2012,5,18
3,2013,2,8
4,2014,1,10
...,...,...,...
24371,2014,12,21
24372,2012,5,10
24373,2013,12,18
24374,2014,12,8


In [10]:
df_f1 = df.drop(['f1'], axis=1)
df_f1.insert(1, 'f1y', ymd['f1y'])
df_f1.insert(2, 'f1m', ymd['f1m'])
df_f1.insert(3, 'f1d', ymd['f1d'])

In [11]:
notnum_index = ['f1y', 'f1m', 'f1d'] + notnum_index[1:]
print(notnum_index)

['f1y', 'f1m', 'f1d', 'f11', 'f12', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118', 'f152']


In [12]:
df_f1[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f11

In [13]:
f11_unique = df_f1['f11'].unique()
f11_unique

array(['Investment', 'OwnerOccupier'], dtype=object)

In [14]:
f11_encode = dict(zip(f11_unique, range(len(f11_unique))))
f11_encode

{'Investment': 0, 'OwnerOccupier': 1}

In [15]:
df_f11 = df_f1.replace({'f11': f11_encode})
df_f11[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,1,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,0,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,0,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,0,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f12

In [16]:
f12_unique = df_f11['f12'].unique()
f12_unique

array(['Juzhnoe Butovo', 'Poselenie Filimonkovskoe', 'Lomonosovskoe',
       'Juzhnoe Tushino', 'Ochakovo-Matveevskoe', 'Poselenie Sosenskoe',
       'Orehovo-Borisovo Juzhnoe', 'Hovrino', 'Sokol',
       'Nagatinskij Zaton', "Kon'kovo", 'Obruchevskoe',
       'Poselenie Vnukovskoe', 'Preobrazhenskoe', 'Poselenie Moskovskij',
       'Bibirevo', 'Jasenevo', 'Mitino', 'Birjulevo Vostochnoe',
       'Koptevo', 'Beskudnikovskoe', 'Butyrskoe', 'Losinoostrovskoe',
       'Novo-Peredelkino', 'Caricyno', 'Veshnjaki', 'Tverskoe', 'Perovo',
       'Jaroslavskoe', 'Strogino', 'Nekrasovka', 'Horoshevo-Mnevniki',
       'Filevskij Park', 'Bogorodskoe', 'Savelki',
       'Poselenie Desjonovskoe', "Chertanovo Central'noe",
       'Timirjazevskoe', 'Zapadnoe Degunino', 'Pokrovskoe Streshnevo',
       'Severnoe Butovo', 'Kapotnja', 'Sviblovo', 'Danilovskoe',
       'Sokolinaja Gora', 'Vostochnoe Izmajlovo', 'Matushkino',
       'Krjukovo', "Krasnosel'skoe", 'Ajeroport', 'Taganskoe',
       'Cheremushki

In [17]:
f12_encode = dict(zip(f12_unique, range(len(f12_unique))))
f12_encode

{'Juzhnoe Butovo': 0,
 'Poselenie Filimonkovskoe': 1,
 'Lomonosovskoe': 2,
 'Juzhnoe Tushino': 3,
 'Ochakovo-Matveevskoe': 4,
 'Poselenie Sosenskoe': 5,
 'Orehovo-Borisovo Juzhnoe': 6,
 'Hovrino': 7,
 'Sokol': 8,
 'Nagatinskij Zaton': 9,
 "Kon'kovo": 10,
 'Obruchevskoe': 11,
 'Poselenie Vnukovskoe': 12,
 'Preobrazhenskoe': 13,
 'Poselenie Moskovskij': 14,
 'Bibirevo': 15,
 'Jasenevo': 16,
 'Mitino': 17,
 'Birjulevo Vostochnoe': 18,
 'Koptevo': 19,
 'Beskudnikovskoe': 20,
 'Butyrskoe': 21,
 'Losinoostrovskoe': 22,
 'Novo-Peredelkino': 23,
 'Caricyno': 24,
 'Veshnjaki': 25,
 'Tverskoe': 26,
 'Perovo': 27,
 'Jaroslavskoe': 28,
 'Strogino': 29,
 'Nekrasovka': 30,
 'Horoshevo-Mnevniki': 31,
 'Filevskij Park': 32,
 'Bogorodskoe': 33,
 'Savelki': 34,
 'Poselenie Desjonovskoe': 35,
 "Chertanovo Central'noe": 36,
 'Timirjazevskoe': 37,
 'Zapadnoe Degunino': 38,
 'Pokrovskoe Streshnevo': 39,
 'Severnoe Butovo': 40,
 'Kapotnja': 41,
 'Sviblovo': 42,
 'Danilovskoe': 43,
 'Sokolinaja Gora': 44,
 'V

In [18]:
df_f12 = df_f11.replace({'f12': f12_encode})
df_f12[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,1,1,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,0,2,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,0,3,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,0,4,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f29 - f118

In [19]:
notnum_bools = notnum_index[5:-1]
print(notnum_bools)

['f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118']


In [20]:
bools_unique = set(np.array([df_f12[col].unique() for col in notnum_bools]).flatten())
bools_unique

{'no', 'yes'}

In [21]:
bools_encode = dict(zip(bools_unique, range(len(bools_unique))))
bools_encode

{'yes': 0, 'no': 1}

In [22]:
df_bools = df_f12.replace(dict.fromkeys(notnum_bools, bools_encode))
df_bools[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,1,1,1,1,1,1,1,1,1,1,1,1,good
1,2014,4,7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,no data
2,2012,5,18,0,2,1,1,1,1,1,1,1,1,1,1,1,1,satisfactory
3,2013,2,8,0,3,1,1,1,1,1,1,1,1,1,1,1,1,poor
4,2014,1,10,0,4,1,0,1,1,1,1,1,1,1,1,1,1,satisfactory


## Process f152

In [23]:
f152_unique = df_bools['f152'].unique()
f152_unique

array(['good', 'no data', 'satisfactory', 'poor', 'excellent'],
      dtype=object)

In [24]:
f152_encode = dict(zip(['no data', 'poor', 'satisfactory', 'good', 'excellent'], range(5)))
f152_encode

{'no data': 0, 'poor': 1, 'satisfactory': 2, 'good': 3, 'excellent': 4}

In [25]:
df_f152 = df_bools.replace({'f152': f152_encode})
df_f152[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,1,1,1,1,1,1,1,1,1,1,1,1,3
1,2014,4,7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2,2012,5,18,0,2,1,1,1,1,1,1,1,1,1,1,1,1,2
3,2013,2,8,0,3,1,1,1,1,1,1,1,1,1,1,1,1,1
4,2014,1,10,0,4,1,0,1,1,1,1,1,1,1,1,1,1,2


## Obtain result

In [26]:
df_final = df_f152.copy()
df_final.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014,1,29,69,38.0,7.0,10.0,1.0,2001.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014,4,7,55,,2.0,1.0,4.0,,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012,5,18,30,16.0,2.0,,,,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013,2,8,44,43.0,1.0,,,,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014,1,10,45,28.0,3.0,5.0,2.0,1960.0,...,20,2,0,4,16,1,4,47,5,6700000


## Check NaNs

In [27]:
nans_cols = df_final.isna().sum()
nans_cols[nans_cols > 0].count()

51

In [28]:
df_final[nans_cols[nans_cols > 0].index].dtypes.unique()

array([dtype('float64')], dtype=object)

In [29]:
nans_rows = df_final.isna().sum(axis=1)
nans_rows[nans_rows >= 40]

172      40
554      40
785      40
1052     40
1359     40
         ..
22921    40
23466    40
23550    40
24236    40
24316    41
Length: 155, dtype: int64

In [30]:
print(list(nans_rows[nans_rows >= 40].index))

[172, 554, 785, 1052, 1359, 1495, 1646, 1812, 1997, 2133, 2249, 2302, 2529, 2542, 2678, 2684, 2781, 2801, 2861, 3251, 3254, 3699, 3904, 3959, 4208, 4365, 4544, 4656, 4834, 4929, 4941, 5151, 5159, 5277, 5474, 5621, 5634, 5691, 5805, 6092, 6191, 6256, 6347, 6388, 6524, 6862, 6876, 7066, 7202, 7259, 7506, 7508, 7575, 7618, 7625, 7694, 7796, 7980, 8119, 8184, 8484, 8673, 8834, 9290, 9632, 9693, 9786, 10247, 10427, 10572, 10639, 11016, 11281, 11394, 11648, 11857, 12249, 12556, 12625, 13322, 13330, 13498, 13592, 13696, 13772, 13911, 13996, 14014, 14062, 14331, 14345, 14518, 14925, 14999, 15026, 15086, 15313, 15359, 15493, 15679, 15812, 15916, 16036, 16129, 16532, 16820, 16965, 17116, 17534, 17574, 17971, 18011, 18037, 18053, 18256, 18311, 18334, 18429, 18475, 18532, 18630, 18632, 18691, 19155, 19187, 19253, 19429, 19523, 19792, 20051, 20087, 20279, 20435, 20650, 20927, 21346, 21410, 21508, 21570, 21735, 22029, 22096, 22195, 22210, 22235, 22258, 22324, 22356, 22477, 22608, 22921, 23466, 23550

## Remove too dirty entries

In [31]:
df_nans = df_final.drop(list(nans_rows[nans_rows >= 40].index))
df_nans.shape

(24221, 294)

## Impute missing values with KNN

In [32]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(weights='distance')

In [33]:
df_imp = imputer.fit_transform(df_nans)

In [34]:
df_imp2 = pd.DataFrame(df_imp, columns=df_final.columns)
df_imp2.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1.0,2014.0,1.0,29.0,69.0,38.0,7.0,10.0,1.0,2001.0,...,1.0,0.0,0.0,2.0,8.0,1.0,0.0,19.0,2.0,9500000.0
1,2.0,2014.0,4.0,7.0,55.0,48.13174,2.0,1.0,4.0,1591.150833,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,3837949.0
2,3.0,2012.0,5.0,18.0,30.0,16.0,2.0,7.2937,3.732457,1961.927529,...,46.0,9.0,2.0,11.0,38.0,1.0,8.0,97.0,11.0,6250000.0
3,4.0,2013.0,2.0,8.0,44.0,43.0,1.0,8.78533,1.428214,1970.661667,...,17.0,4.0,1.0,12.0,12.0,0.0,1.0,55.0,7.0,2000000.0
4,5.0,2014.0,1.0,10.0,45.0,28.0,3.0,5.0,2.0,1960.0,...,20.0,2.0,0.0,4.0,16.0,1.0,4.0,47.0,5.0,6700000.0


In [35]:
imputed = nans_cols[nans_cols > 0]
df_full = df_nans.copy()
df_full.reset_index(drop=True, inplace=True)
df_full.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014,1,29,69,38.0,7.0,10.0,1.0,2001.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014,4,7,55,,2.0,1.0,4.0,,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012,5,18,30,16.0,2.0,,,,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013,2,8,44,43.0,1.0,,,,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014,1,10,45,28.0,3.0,5.0,2.0,1960.0,...,20,2,0,4,16,1,4,47,5,6700000


In [36]:
df_full[-3:]

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
24218,24374,2013,12,18,85,,16.0,17.0,1.0,1.0,...,3,1,0,3,9,0,1,10,2,9609180
24219,24375,2014,12,8,47,27.0,2.0,10.0,2.0,1958.0,...,217,86,11,118,176,1,64,146,13,12900000
24220,24376,2015,4,30,85,60.0,10.0,22.0,4.0,2015.0,...,1,1,0,2,12,0,0,9,0,10371441


In [37]:
df_full[imputed.index] = df_imp2[imputed.index]
df_full.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014,1,29,69,38.0,7.0,10.0,1.0,2001.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014,4,7,55,48.13174,2.0,1.0,4.0,1591.150833,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012,5,18,30,16.0,2.0,7.2937,3.732457,1961.927529,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013,2,8,44,43.0,1.0,8.78533,1.428214,1970.661667,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014,1,10,45,28.0,3.0,5.0,2.0,1960.0,...,20,2,0,4,16,1,4,47,5,6700000


In [38]:
True in df_full.isna().values

False

## Preprocess methods

In [61]:
def process_f1(df, index):
    ymd = df['f1'].str.split('-', expand=True).astype('int64').rename(columns={0: 'f1y', 1: 'f1m', 2: 'f1d'})

    df_res = df.drop(['f1'], axis=1)
    df_res.insert(1, 'f1y', ymd['f1y'])
    df_res.insert(2, 'f1m', ymd['f1m'])
    df_res.insert(3, 'f1d', ymd['f1d'])
    return df_res, ['f1y', 'f1m', 'f1d'] + index[1:]

In [62]:
def process_f11(df, index):
    f11_unique = df['f11'].unique()
    f11_encode = dict(zip(f11_unique, range(len(f11_unique))))
    return df.replace({'f11': f11_encode}), index

In [63]:
def process_f12(df, index):
    f12_unique = df['f12'].unique()
    f12_encode = dict(zip(f12_unique, range(len(f12_unique))))
    return df.replace({'f12': f12_encode}), index

In [64]:
def process_bool(df, index):
    bool_index = index[5:-1]
    bool_unique = set(np.array([df[col].unique() for col in bool_index]).flatten())
    bool_encode = dict(zip(bool_unique, range(len(bool_unique))))
    return df.replace(dict.fromkeys(bool_index, bool_encode)), index

In [65]:
def process_f152(df, index):
    f152_encode = dict(zip(['no data', 'poor', 'satisfactory', 'good', 'excellent'], range(5)))
    return df.replace({'f152': f152_encode}), index

In [66]:
def process_categorical(df):
    cat = df.dtypes[df.dtypes == type(object)]
    cat_index = list(cat.index)
    
    df_res, cat_index = process_f1(df, cat_index)
    df_res, cat_index = process_f11(df_res, cat_index)
    df_res, cat_index = process_f12(df_res, cat_index)
    df_res, cat_index = process_bool(df_res, cat_index)
    df_res, cat_index = process_f152(df_res, cat_index)
    return df_res

In [67]:
def drop_na(df):
    na_rows = df.isna().sum(axis=1)
    return df.drop(list(na_rows[na_rows >= 40].index))

In [68]:
def impute_na(df):
    imputer = KNNImputer(weights='distance')
    imp = imputer.fit_transform(df)
    df_imp = pd.DataFrame(imp, columns=df.columns)
    
    na_cols = df.isna().sum()
    imp_cols = na_cols[na_cols > 0]

    df_res = df.copy()
    df_res.reset_index(drop=True, inplace=True)
    df_res[imp_cols.index] = df_imp[imp_cols.index]
    return df_res

In [75]:
def process_na(df, drop, impute):
    df_res = df.copy()
    if drop:
        df_res = drop_na(df_res)
    if impute:
        df_res = impute_na(df_res)
    return df_res

In [76]:
from sklearn.impute import KNNImputer

def preprocess(df, drop_na=False, impute_na=True):
    df_res = process_categorical(df)
    df_res = process_na(df_res, drop_na, impute_na)
    return df_res

In [5]:
df_full = preprocess(df, drop_na=True)

In [6]:
df_full.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014,1,29,69,38.0,7.0,10.0,1.0,2001.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014,4,7,55,48.13174,2.0,1.0,4.0,1591.150833,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012,5,18,30,16.0,2.0,7.2937,3.732457,1961.927529,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013,2,8,44,43.0,1.0,8.78533,1.428214,1970.661667,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014,1,10,45,28.0,3.0,5.0,2.0,1960.0,...,20,2,0,4,16,1,4,47,5,6700000


In [9]:
True in df_full.isna().values

False

# Perform regression

## Prepare train-test data

In [71]:
X_train, y_train = df_full.drop(['id', 'target'], axis=1), df_full['target']

In [72]:
X_train.shape, y_train.shape

((24221, 292), (24221,))

In [73]:
df_test = pd.read_csv("input/test.csv", index_col=False)
df_test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290
0,1,2014-03-29,63,63.0,11.0,17.0,1.0,,2.0,1.0,...,5,1,1,0,2,12,0,1,10,0
1,2,2013-02-19,64,,2.0,,,,,,...,5,1,1,0,2,12,0,0,9,0
2,3,2013-09-18,40,,4.0,17.0,1.0,,1.0,1.0,...,636,371,141,26,150,249,2,105,203,13
3,4,2014-07-04,37,1.0,25.0,1.0,1.0,1.0,1.0,1.0,...,28,17,6,2,4,16,0,0,47,3
4,5,2014-09-24,127,58.0,11.0,20.0,1.0,2006.0,3.0,33.0,...,95,37,5,1,5,33,1,6,85,5


In [77]:
df_full_test = preprocess(df_test)

In [78]:
df_full_test.head()

Unnamed: 0,id,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,...,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290
0,1,2014,3,29,63,63.0,11.0,17.0,1.0,1589.595957,...,5,1,1,0,2,12,0,1,10,0
1,2,2013,2,19,64,42.571823,2.0,14.789893,1.511149,166.854366,...,5,1,1,0,2,12,0,0,9,0
2,3,2013,9,18,40,37.626005,4.0,17.0,1.0,1679.704588,...,636,371,141,26,150,249,2,105,203,13
3,4,2014,7,4,37,1.0,25.0,1.0,1.0,1.0,...,28,17,6,2,4,16,0,0,47,3
4,5,2014,9,24,127,58.0,11.0,20.0,1.0,2006.0,...,95,37,5,1,5,33,1,6,85,5


In [79]:
df_full_test.shape

(6095, 293)

In [80]:
True in df_full_test.isna().values

False

In [81]:
X_test = df_full_test.drop('id', axis=1)

In [82]:
X_test.shape

(6095, 292)

In [83]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

scaler = StandardScaler()
regressor = Ridge()

## Normalize data

In [84]:
train_scaled = scaler.fit_transform(X_train)
X_train_tr = pd.DataFrame(train_scaled, columns=X_train.columns)

In [85]:
X_train_tr.head()

Unnamed: 0,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,f8,...,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290
0,0.566728,-1.634526,1.444801,0.359608,0.013203,-0.133291,-0.405307,-0.634696,-0.007172,0.197705,...,-0.440754,-0.424177,-0.38109,-0.328027,-0.450456,-0.473003,0.906896,-0.421537,-0.737525,-0.826355
1,0.566728,-0.780984,-1.086324,0.018403,0.190439,-1.073551,-1.868144,1.609667,-0.010243,0.197705,...,-0.504868,-0.437717,-0.38109,-0.328027,-0.51874,-0.556991,-0.732107,-0.421537,-1.104437,-1.236324
2,-1.511976,-0.49647,0.179239,-0.590892,-0.371646,-1.073551,-0.845182,1.409512,-0.007465,-0.942421,...,0.400745,0.185136,-0.065478,0.039327,-0.143176,0.156909,0.906896,-0.035204,0.945951,1.01851
3,-0.472624,-1.350012,-0.971273,-0.249687,0.100669,-1.261603,-0.602736,-0.31434,-0.0074,0.197705,...,-0.128197,-0.207532,-0.240818,-0.14435,-0.109034,-0.389015,-0.732107,-0.373246,0.039464,0.19857
4,0.566728,-1.634526,-0.74117,-0.225315,-0.161729,-0.885499,-1.217994,0.113425,-0.007479,0.197705,...,-0.200326,-0.166911,-0.310954,-0.328027,-0.382171,-0.305026,0.906896,-0.228371,-0.133201,-0.2114


In [86]:
test_scaled = scaler.fit_transform(X_test)
X_test_tr = pd.DataFrame(test_scaled, columns=X_test.columns)

In [87]:
X_test_tr.head()

Unnamed: 0,f1y,f1m,f1d,f2,f3,f4,f5,f6,f7,f8,...,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290
0,0.575667,-1.067951,1.441431,0.386926,1.286445,0.640468,0.708663,-0.597697,-0.39546,0.10512,...,-0.473551,-0.423854,-0.345272,-0.327407,-0.447353,-0.382809,-0.716461,-0.372181,-0.927383,-1.216347
1,-0.465487,-1.350848,0.291584,0.43044,0.289938,-1.061213,0.354855,-0.213653,-3.211091,-0.350922,...,-0.473551,-0.423854,-0.345272,-0.327407,-0.447353,-0.382809,-0.716461,-0.421476,-0.949138,-1.216347
2,-0.465487,0.629427,0.176599,-0.613908,0.048676,-0.683062,0.708663,-0.597697,-0.217133,-1.1691,...,4.680397,4.667975,4.624286,4.507128,4.708969,4.700043,2.565636,4.754473,3.271247,1.435897
3,0.575667,0.063634,-1.433187,-0.744451,-1.737977,3.287528,-1.852719,-0.597697,-3.53932,-1.1691,...,-0.285689,-0.203666,-0.167787,0.04448,-0.377673,-0.297023,-0.716461,-0.421476,-0.122464,-0.60429
4,0.575667,0.629427,0.866508,3.171853,1.04254,0.640468,1.188922,-0.597697,0.428611,1.37934,...,0.261561,0.071568,-0.203284,-0.141464,-0.342833,0.06757,0.924588,-0.125708,0.704209,-0.196253


## Check cross val scores

In [88]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import minmax_scale

def rmsle(y, y_pred, **kwargs):
    return np.sqrt(mean_squared_log_error(y, minmax_scale(y_pred, feature_range=(np.min(y), np.max(y)))))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

scores = cross_val_score(regressor, X_train_tr, y_train, cv=5, scoring=rmsle_scorer)
scores

array([-0.75812569, -0.76538143, -1.28762622, -1.20150868, -1.10974069])

## Search params

In [89]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

params = {'alpha': loguniform(1, 10000)}
rscv = RandomizedSearchCV(regressor, params, n_iter=20, scoring=rmsle_scorer, refit=True, cv=5, random_state=3927)

In [90]:
rscv.fit(X_train_tr, y_train)

RandomizedSearchCV(cv=5, estimator=Ridge(), n_iter=20,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000275C713FFA0>},
                   random_state=3927,
                   scoring=make_scorer(rmsle, greater_is_better=False))

In [91]:
rscv.best_params_

{'alpha': 623.7301793522852}

In [92]:
scores = cross_val_score(rscv.best_estimator_, X_train_tr, y_train, cv=5, scoring=rmsle_scorer)
scores

array([-0.71884237, -0.71580498, -1.33434919, -0.95926213, -1.15974216])

## Predict

In [93]:
pred = rscv.predict(X_test_tr)
pred

array([ 5102263.11317858,  2747641.67303342,    37712.61901112, ...,
        9366572.32340748,  4554081.11103063, 11487477.43251889])

In [95]:
data = list(zip(range(1, len(pred) + 1), pred))
df_pred = pd.DataFrame(data, columns=['id', 'prediction'])
df_pred.head()

Unnamed: 0,id,prediction
0,1,5102263.0
1,2,2747642.0
2,3,37712.62
3,4,4454173.0
4,5,18749360.0


## Export to .csv

In [96]:
df_pred.to_csv("output/submission.csv", index=False)