# Imports

In [27]:
import pandas
import numpy as np

# Input

In [2]:
# Load the training set. index_col=False tells pandas not to use the first
# CSV column as the index, so a default integer index is created instead.
train_csv = "input/train.csv"
df = pandas.read_csv(train_csv, index_col=False)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,1,2014-01-29,69,38.0,7.0,10.0,1.0,2001.0,2.0,11.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2,2014-04-07,55,,2.0,1.0,4.0,,2.0,10.0,...,0,0,0,0,4,0,0,2,0,3837949
2,3,2012-05-18,30,16.0,2.0,,,,,,...,46,9,2,11,38,1,8,97,11,6250000
3,4,2013-02-08,44,43.0,1.0,,,,,,...,17,4,1,12,12,0,1,55,7,2000000
4,5,2014-01-10,45,28.0,3.0,5.0,2.0,1960.0,2.0,5.0,...,20,2,0,4,16,1,4,47,5,6700000


# Preprocessing

## Check types

In [4]:
# Tally how many columns there are of each dtype.
dtype_counts = df.dtypes.value_counts()
dtype_counts

int64      157
float64    119
object      16
dtype: int64

## Get non-numeric columns and their names

In [5]:
# Select the dtypes of the columns stored with the generic ``object`` dtype.
# The comparison is made against ``object`` itself: ``type(object)`` evaluates
# to the metaclass ``type``, which only compared equal because NumPy treats
# arbitrary Python classes as the object dtype.
notnum = df.dtypes[df.dtypes == object]
notnum

f1      object
f11     object
f12     object
f29     object
f33     object
f34     object
f35     object
f36     object
f37     object
f38     object
f39     object
f40     object
f106    object
f114    object
f118    object
f152    object
dtype: object

In [6]:
# Keep just the names of the non-numeric columns as a plain Python list.
notnum_index = notnum.index.tolist()
print(notnum_index)

['f1', 'f11', 'f12', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118', 'f152']


In [7]:
# Preview only the non-numeric columns.
df.loc[:, notnum_index].head()

Unnamed: 0,f1,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014-01-29,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014-04-07,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012-05-18,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013-02-08,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014-01-10,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f1

In [8]:
# Split the dash-separated f1 strings into three integer columns named
# f1y, f1m and f1d.
f1_parts = df['f1'].str.split('-', expand=True)
ymd = f1_parts.astype('int64').rename(columns={0: 'f1y', 1: 'f1m', 2: 'f1d'})
ymd

Unnamed: 0,f1y,f1m,f1d
0,2014,1,29
1,2014,4,7
2,2012,5,18
3,2013,2,8
4,2014,1,10
...,...,...,...
24371,2014,12,21
24372,2012,5,10
24373,2013,12,18
24374,2014,12,8


In [9]:
# Swap the raw f1 string column for the three numeric parts, placed at the
# front of the frame in f1y, f1m, f1d order.
df_f1 = df.drop(columns=['f1'])
for position, part in enumerate(('f1y', 'f1m', 'f1d')):
    df_f1.insert(position, part, ymd[part])

In [10]:
# Names of the originally non-numeric columns, with f1 replaced by its three
# parts. The list is rebuilt from `notnum` rather than from the previous
# `notnum_index`, so re-running this cell always yields the same list (the
# self-referential form dropped 'f1y' and stacked the parts again on every
# re-run) and it no longer relies on 'f1' being the first entry.
notnum_index = ['f1y', 'f1m', 'f1d'] + [col for col in notnum.index if col != 'f1']
print(notnum_index)

['f1y', 'f1m', 'f1d', 'f11', 'f12', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118', 'f152']


In [11]:
# Preview the tracked columns after the f1 split.
df_f1.loc[:, notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,Investment,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,OwnerOccupier,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,Investment,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,Investment,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,Investment,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f11

In [12]:
# Distinct values present in f11.
f11_unique = df_f1.loc[:, 'f11'].unique()
f11_unique

array(['Investment', 'OwnerOccupier'], dtype=object)

In [13]:
# Give each f11 value an integer code equal to its position in f11_unique.
f11_encode = {category: code for code, category in enumerate(f11_unique)}
f11_encode

{'Investment': 0, 'OwnerOccupier': 1}

In [14]:
# Encode f11 with its integer codes, producing a new frame.
# Series.map is used instead of DataFrame.replace: replace only yields an
# integer column by silently downcasting the object column afterwards, a
# behaviour pandas 2.2 deprecates (FutureWarning). map builds the numeric
# column directly.
# NOTE: a value missing from f11_encode would become NaN; f11_encode is built
# from this column's own unique values, so none are expected.
df_f11 = df_f1.assign(f11=df_f1['f11'].map(f11_encode))
df_f11[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,Juzhnoe Butovo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,1,Poselenie Filimonkovskoe,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,0,Lomonosovskoe,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,0,Juzhnoe Tushino,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,0,Ochakovo-Matveevskoe,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f12

In [17]:
# Distinct values present in f12.
f12_unique = df_f11.loc[:, 'f12'].unique()
f12_unique

array(['Juzhnoe Butovo', 'Poselenie Filimonkovskoe', 'Lomonosovskoe',
       'Juzhnoe Tushino', 'Ochakovo-Matveevskoe', 'Poselenie Sosenskoe',
       'Orehovo-Borisovo Juzhnoe', 'Hovrino', 'Sokol',
       'Nagatinskij Zaton', "Kon'kovo", 'Obruchevskoe',
       'Poselenie Vnukovskoe', 'Preobrazhenskoe', 'Poselenie Moskovskij',
       'Bibirevo', 'Jasenevo', 'Mitino', 'Birjulevo Vostochnoe',
       'Koptevo', 'Beskudnikovskoe', 'Butyrskoe', 'Losinoostrovskoe',
       'Novo-Peredelkino', 'Caricyno', 'Veshnjaki', 'Tverskoe', 'Perovo',
       'Jaroslavskoe', 'Strogino', 'Nekrasovka', 'Horoshevo-Mnevniki',
       'Filevskij Park', 'Bogorodskoe', 'Savelki',
       'Poselenie Desjonovskoe', "Chertanovo Central'noe",
       'Timirjazevskoe', 'Zapadnoe Degunino', 'Pokrovskoe Streshnevo',
       'Severnoe Butovo', 'Kapotnja', 'Sviblovo', 'Danilovskoe',
       'Sokolinaja Gora', 'Vostochnoe Izmajlovo', 'Matushkino',
       'Krjukovo', "Krasnosel'skoe", 'Ajeroport', 'Taganskoe',
       'Cheremushki

In [18]:
# Give each f12 value an integer code equal to its position in f12_unique.
f12_encode = {district: code for code, district in enumerate(f12_unique)}
f12_encode

{'Juzhnoe Butovo': 0,
 'Poselenie Filimonkovskoe': 1,
 'Lomonosovskoe': 2,
 'Juzhnoe Tushino': 3,
 'Ochakovo-Matveevskoe': 4,
 'Poselenie Sosenskoe': 5,
 'Orehovo-Borisovo Juzhnoe': 6,
 'Hovrino': 7,
 'Sokol': 8,
 'Nagatinskij Zaton': 9,
 "Kon'kovo": 10,
 'Obruchevskoe': 11,
 'Poselenie Vnukovskoe': 12,
 'Preobrazhenskoe': 13,
 'Poselenie Moskovskij': 14,
 'Bibirevo': 15,
 'Jasenevo': 16,
 'Mitino': 17,
 'Birjulevo Vostochnoe': 18,
 'Koptevo': 19,
 'Beskudnikovskoe': 20,
 'Butyrskoe': 21,
 'Losinoostrovskoe': 22,
 'Novo-Peredelkino': 23,
 'Caricyno': 24,
 'Veshnjaki': 25,
 'Tverskoe': 26,
 'Perovo': 27,
 'Jaroslavskoe': 28,
 'Strogino': 29,
 'Nekrasovka': 30,
 'Horoshevo-Mnevniki': 31,
 'Filevskij Park': 32,
 'Bogorodskoe': 33,
 'Savelki': 34,
 'Poselenie Desjonovskoe': 35,
 "Chertanovo Central'noe": 36,
 'Timirjazevskoe': 37,
 'Zapadnoe Degunino': 38,
 'Pokrovskoe Streshnevo': 39,
 'Severnoe Butovo': 40,
 'Kapotnja': 41,
 'Sviblovo': 42,
 'Danilovskoe': 43,
 'Sokolinaja Gora': 44,
 'V

In [19]:
# Encode f12 with its integer codes, producing a new frame.
# Series.map is used instead of DataFrame.replace: replace only yields an
# integer column by silently downcasting the object column afterwards, a
# behaviour pandas 2.2 deprecates (FutureWarning). map builds the numeric
# column directly.
# NOTE: a value missing from f12_encode would become NaN; f12_encode is built
# from this column's own unique values, so none are expected.
df_f12 = df_f11.assign(f12=df_f11['f12'].map(f12_encode))
df_f12[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2014,4,7,1,1,no,no,no,no,no,no,no,no,no,no,no,no,no data
2,2012,5,18,0,2,no,no,no,no,no,no,no,no,no,no,no,no,satisfactory
3,2013,2,8,0,3,no,no,no,no,no,no,no,no,no,no,no,no,poor
4,2014,1,10,0,4,no,yes,no,no,no,no,no,no,no,no,no,no,satisfactory


## Process f29 - f118 (yes/no columns)

In [23]:
# The yes/no columns are everything in notnum_index after its first five
# entries (f1y, f1m, f1d, f11, f12) and before its last one (f152).
notnum_bools = notnum_index[5:len(notnum_index) - 1]
print(notnum_bools)

['f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f106', 'f114', 'f118']


In [30]:
# Collect every distinct value found across the yes/no columns.
# A plain set union is used instead of stacking the per-column arrays with
# np.array: stacking needs every column to have the same number of distinct
# values (e.g. it breaks if one column contains only 'no'), while the union
# accepts arrays of any length and also handles an empty column list.
bools_unique = set().union(*(df_f12[col].unique() for col in notnum_bools))
bools_unique

{'no', 'yes'}

In [31]:
# Map each distinct value to an integer code.
# The values are sorted first because a set of strings has no stable
# iteration order between interpreter runs (string hashing is randomised), so
# zipping the raw set could flip which value receives 0. With the values
# 'no' and 'yes', alphabetical order gives 'no' -> 0 and 'yes' -> 1 every run.
bools_encode = {value: code for code, value in enumerate(sorted(bools_unique))}
bools_encode

{'no': 0, 'yes': 1}

In [34]:
# Encode every yes/no column with the shared codes, producing a new frame.
# Series.map is applied per column instead of DataFrame.replace: replace only
# yields integer columns by silently downcasting the object columns
# afterwards, a behaviour pandas 2.2 deprecates (FutureWarning).
# NOTE: a value missing from bools_encode would become NaN; bools_encode is
# built from the distinct values of these same columns, so none are expected.
df_bools = df_f12.assign(
    **{col: df_f12[col].map(bools_encode) for col in notnum_bools}
)
df_bools[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,good
1,2014,4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,no data
2,2012,5,18,0,2,0,0,0,0,0,0,0,0,0,0,0,0,satisfactory
3,2013,2,8,0,3,0,0,0,0,0,0,0,0,0,0,0,0,poor
4,2014,1,10,0,4,0,1,0,0,0,0,0,0,0,0,0,0,satisfactory


## Process f152

In [35]:
# Distinct values present in f152.
f152_unique = df_bools.loc[:, 'f152'].unique()
f152_unique

array(['good', 'no data', 'satisfactory', 'poor', 'excellent'],
      dtype=object)

In [36]:
# Hand-ordered codes for f152: each label gets its position in this tuple,
# from 'no data' (0) through 'excellent' (4).
f152_levels = ('no data', 'poor', 'satisfactory', 'good', 'excellent')
f152_encode = {level: rank for rank, level in enumerate(f152_levels)}
f152_encode

{'no data': 0, 'poor': 1, 'satisfactory': 2, 'good': 3, 'excellent': 4}

In [37]:
# Encode f152 with its ordinal codes, producing a new frame.
# Series.map is used instead of DataFrame.replace: replace only yields an
# integer column by silently downcasting the object column afterwards, a
# behaviour pandas 2.2 deprecates (FutureWarning).
# NOTE: unlike replace, map turns any label that is not a key of f152_encode
# into NaN instead of leaving it unchanged.
df_f152 = df_bools.assign(f152=df_bools['f152'].map(f152_encode))
df_f152[notnum_index].head()

Unnamed: 0,f1y,f1m,f1d,f11,f12,f29,f33,f34,f35,f36,f37,f38,f39,f40,f106,f114,f118,f152
0,2014,1,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1,2014,4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2012,5,18,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2013,2,8,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1
4,2014,1,10,0,4,0,1,0,0,0,0,0,0,0,0,0,0,2


## Obtain result

In [38]:
# Take an independent (deep) copy of the encoded frame as the final result
# and preview it.
df_final = df_f152.copy(deep=True)
df_final.head()

Unnamed: 0,f1y,f1m,f1d,id,f2,f3,f4,f5,f6,f7,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,target
0,2014,1,29,1,69,38.0,7.0,10.0,1.0,2001.0,...,1,0,0,2,8,1,0,19,2,9500000
1,2014,4,7,2,55,,2.0,1.0,4.0,,...,0,0,0,0,4,0,0,2,0,3837949
2,2012,5,18,3,30,16.0,2.0,,,,...,46,9,2,11,38,1,8,97,11,6250000
3,2013,2,8,4,44,43.0,1.0,,,,...,17,4,1,12,12,0,1,55,7,2000000
4,2014,1,10,5,45,28.0,3.0,5.0,2.0,1960.0,...,20,2,0,4,16,1,4,47,5,6700000
