# Feature engineering

## Import libraries

In [40]:
import pandas as pd
from collections import Counter

## Get the datasets

In [41]:
# Load the train/test splits saved as pickles under ../datasets.
train, test = (
    pd.read_pickle('../datasets/train_01.pkl'),
    pd.read_pickle('../datasets/test_01.pkl'),
)

## Engineering

### Clean the column `features`

In [42]:
# Characters stripped from every feature name: square brackets, both quote
# styles and spaces.
_FEATURE_NAME_JUNK = str.maketrans('', '', '[]\'" ')


def clean_feature_names(names):
    """Return a new list of feature names with brackets, quotes and spaces removed.

    Parameters
    ----------
    names : iterable of str
        Feature names belonging to a single row.

    Returns
    -------
    list of str
        The same names, in the same order, with every ``[``, ``]``, ``'``,
        ``"`` and space character deleted. Letter case is left untouched.
    """
    return [name.translate(_FEATURE_NAME_JUNK) for name in names]


# Clean BOTH splits with the same function. The one-hot cell further down tests
# the cleaned train names for membership in test['features'] as well, so the
# two columns must be normalised identically. The cleaning is idempotent, so it
# is harmless if a split was already cleaned.
train['features'] = train['features'].apply(clean_feature_names)
test['features'] = test['features'].apply(clean_feature_names)

In [43]:
# Inspect the cleaned feature lists; pandas abbreviates the middle rows of the Series.
train.loc[:, 'features']

4         [DiningRoom, Pre-War, LaundryinBuilding, Dishw...
6         [Doorman, Elevator, LaundryinBuilding, Dishwas...
9         [Doorman, Elevator, LaundryinBuilding, Laundry...
10                                                       []
15        [Doorman, Elevator, FitnessCenter, LaundryinBu...
                                ...                        
124000               [Elevator, Dishwasher, HardwoodFloors]
124002    [CommonOutdoorSpace, CatsAllowed, DogsAllowed,...
124004    [DiningRoom, Elevator, Pre-War, LaundryinBuild...
124008    [Pre-War, LaundryinUnit, Dishwasher, NoFee, Ou...
124009    [DiningRoom, Elevator, LaundryinBuilding, Dish...
Name: features, Length: 48343, dtype: object

### Let's count the top 20 features

In [44]:
# Flatten the per-row feature lists into one long list (duplicates kept so
# that occurrences can be counted afterwards).
all_features = [
    name
    for row_features in train['features']
    for name in row_features
]

In [45]:
# Number of distinct feature names across the training split.
unique_feature_count = len(set(all_features))
print("Number of unique features: ", unique_feature_count)

Number of unique features:  1529


In [46]:
# Tally how many times each feature name occurs, then show the 20 most frequent.
cnt = Counter()
cnt.update(all_features)
cnt.most_common(20)

[('Elevator', 25375),
 ('HardwoodFloors', 23146),
 ('CatsAllowed', 23135),
 ('DogsAllowed', 21652),
 ('Doorman', 20479),
 ('Dishwasher', 20081),
 ('NoFee', 17793),
 ('LaundryinBuilding', 16082),
 ('FitnessCenter', 12989),
 ('Pre-War', 8971),
 ('LaundryinUnit', 8437),
 ('RoofDeck', 6417),
 ('OutdoorSpace', 5132),
 ('DiningRoom', 4890),
 ('HighSpeedInternet', 4223),
 ('Balcony', 2898),
 ('SwimmingPool', 2643),
 ('LaundryInBuilding', 2564),
 ('NewConstruction', 2504),
 ('Terrace', 2177)]

### Creating the one-hot tables

In [47]:
# Names of the 20 most frequent training features, most common first.
top_features = [name for name, _ in cnt.most_common(20)]

# Add one 0/1 indicator column per top feature to both splits: 1 when the
# row's feature list contains that name, 0 otherwise.
# NOTE(review): the top-20 list contains both 'LaundryinBuilding' and
# 'LaundryInBuilding'; they differ only by case and become separate columns.
for feature in top_features:
    for frame in (train, test):
        frame[feature] = frame['features'].apply(
            lambda names: int(feature in names)
        )

In [48]:
# Model inputs: the two room-count columns followed by the one-hot feature columns.
feature_list = ['bathrooms', 'bedrooms', *top_features]

In [49]:
# Keep only the input columns plus 'price' in each split.
selected_columns = feature_list + ['price']
oneHot_train = train[selected_columns]
oneHot_test = test[selected_columns]

## Saving the resulting tables

In [50]:
# Persist both one-hot tables as pickles.
pd.to_pickle(oneHot_train, '../datasets/train_02_onehot.pkl')
pd.to_pickle(oneHot_test, '../datasets/test_02_onehot.pkl')