# Feature engineering

## Import libraries

In [23]:
import pandas as pd
from collections import Counter

## Get the datasets

In [24]:
# Load the inputs for this stage. The train frame comes from a pickle
# (presumably written by an earlier notebook stage — TODO confirm); the test
# frame is read directly from JSON.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
train = pd.read_pickle('../datasets/train_01.pkl')
test = pd.read_json('../datasets/test.json')

## Engineering

### Clean the column `features`

In [27]:
# Characters removed from every feature name: square brackets, single and
# double quotes, and spaces (so e.g. "Dining Room" becomes "DiningRoom").
_FEATURE_STRIP_TABLE = str.maketrans('', '', '[]\'" ')


def normalize_feature_names(items):
    """Return the feature names in `items` with brackets, quotes and spaces removed.

    Parameters
    ----------
    items : iterable of str
        Feature names of a single listing.

    Returns
    -------
    list of str
        The same names, in the same order, with the characters
        ``[ ] ' "`` and the space character deleted.
    """
    return [item.translate(_FEATURE_STRIP_TABLE) for item in items]


# Apply the SAME normalization to both frames. The one-hot columns built later
# test exact-string membership of the cleaned train names, so an uncleaned
# test frame would silently get 0 for every name that contained a space or quote.
# Assumes test['features'] holds lists of strings like train['features'] — TODO confirm.
# NOTE(review): names differing only by letter case (e.g. 'LaundryinBuilding'
# vs 'LaundryInBuilding') are still treated as distinct features.
train['features'] = train['features'].apply(normalize_feature_names)
test['features'] = test['features'].apply(normalize_feature_names)

In [14]:
# Spot-check the `features` column of the train frame after cleaning.
train['features']

4         [DiningRoom, Pre-War, LaundryinBuilding, Dishw...
6         [Doorman, Elevator, LaundryinBuilding, Dishwas...
9         [Doorman, Elevator, LaundryinBuilding, Laundry...
10                                                       []
15        [Doorman, Elevator, FitnessCenter, LaundryinBu...
                                ...                        
124000               [Elevator, Dishwasher, HardwoodFloors]
124002    [CommonOutdoorSpace, CatsAllowed, DogsAllowed,...
124004    [DiningRoom, Elevator, Pre-War, LaundryinBuild...
124008    [Pre-War, LaundryinUnit, Dishwasher, NoFee, Ou...
124009    [DiningRoom, Elevator, LaundryinBuilding, Dish...
Name: features, Length: 49352, dtype: object

### Let's count the top 20 features

In [28]:
# Flatten the per-listing feature lists into one long list of feature names.
# Duplicates are kept on purpose so that occurrences can be counted.
all_features = [
    name
    for listing_features in train['features']
    for name in listing_features
]

In [29]:
# Distinct feature names across all train listings.
unique_feature_names = set(all_features)
print("Number of unique features: ", len(unique_feature_names))

Number of unique features:  1546


In [30]:
# Tally how often each feature name occurs, then show the 20 most frequent.
cnt = Counter()
cnt.update(all_features)
cnt.most_common(20)

[('Elevator', 25915),
 ('CatsAllowed', 23540),
 ('HardwoodFloors', 23527),
 ('DogsAllowed', 22035),
 ('Doorman', 20898),
 ('Dishwasher', 20426),
 ('NoFee', 18062),
 ('LaundryinBuilding', 16344),
 ('FitnessCenter', 13252),
 ('Pre-War', 9148),
 ('LaundryinUnit', 8738),
 ('RoofDeck', 6542),
 ('OutdoorSpace', 5268),
 ('DiningRoom', 5136),
 ('HighSpeedInternet', 4299),
 ('Balcony', 2992),
 ('SwimmingPool', 2730),
 ('LaundryInBuilding', 2593),
 ('NewConstruction', 2559),
 ('Terrace', 2283)]

### Creating one-hot tables

In [33]:
# Names of the 20 most frequent features, most frequent first.
top_features = [name for name, _count in cnt.most_common(20)]

# Add one 0/1 indicator column per top feature to both frames.
# NOTE(review): membership is an exact string match, so test['features'] must
# be normalized the same way as train['features'] for these columns to be
# comparable between the two frames.
for feature in top_features:
    train[feature] = train['features'].apply(lambda names: int(feature in names))
    test[feature] = test['features'].apply(lambda names: int(feature in names))

In [34]:
# Model inputs: the two room-count columns followed by the one-hot feature columns.
feature_list = ['bathrooms', 'bedrooms', *top_features]

In [None]:
# Columns kept in the exported tables: every model input plus `price`.
output_columns = [*feature_list, 'price']

oneHot_train = train[output_columns]
oneHot_test = test[output_columns]

## Saving the resulting tables

In [38]:
# Persist both one-hot tables as pickles next to the other dataset files.
for frame, path in (
    (oneHot_train, '../datasets/train_02_onehot.pkl'),
    (oneHot_test, '../datasets/test_02_onehot.pkl'),
):
    frame.to_pickle(path)