Reference: https://www.kaggle.com/rakhlin/two-sigma-connect-rental-listing-inquiries/another-python-version-of-it-is-lit-by-branden/comments

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import product
from sklearn.model_selection import StratifiedKFold

### Load data

In [4]:
# Read the raw train/test listings from JSON and order the rows of each frame by listing_id.
X_train = pd.read_json("../Data//train.json").sort_values(by="listing_id")
X_test = pd.read_json("../Data//test.json").sort_values(by="listing_id")

In [5]:
# Display the (rows, columns) shape of both frames.
X_train.shape, X_test.shape

((49352, 15), (74659, 14))

### Create functions

In [6]:
def add_features(df):
    """Add count features and price/room ratio features to a listings frame.

    The new columns are written onto ``df`` itself (the argument is mutated);
    the returned frame is a cleaned copy in which every NaN and every
    +/-infinity has been replaced by the sentinel -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``photos``, ``street_address``,
        ``display_address``, ``description``, ``price``, ``bedrooms`` and
        ``bathrooms``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the engineered columns, with NaN/inf replaced by -1.
    """
    def fmt(s):
        # The u"" prefix matters on Python 2: there a plain "\u00a0" literal is
        # the six-character escape text, so the no-break space was never removed.
        return s.replace(u"\u00a0", "").strip().lower()

    df["photo_count"] = df["photos"].apply(len)
    df["street_address"] = df["street_address"].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    # NOTE: despite the column name, this is len() of the description string,
    # i.e. a character count. The name is kept for downstream compatibility.
    df["desc_wordcount"] = df["description"].apply(len)

    rooms = df["bedrooms"] + df["bathrooms"]
    df["pricePerBed"] = df["price"] / df["bedrooms"]
    df["pricePerBath"] = df["price"] / df["bathrooms"]
    df["pricePerRoom"] = df["price"] / rooms
    df["bedPerBath"] = df["bedrooms"] / df["bathrooms"]
    df["bedBathDiff"] = df["bedrooms"] - df["bathrooms"]
    df["bedBathSum"] = rooms
    df["bedsPerc"] = df["bedrooms"] / rooms

    # Division by zero yields inf (x/0) or NaN (0/0); map all of them,
    # including -inf, to the same -1 sentinel.
    df = df.fillna(-1).replace(np.inf, -1).replace(-np.inf, -1)
    return df

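### Factorize: integer-encode a categorical column (similar to scikit-learn's `LabelEncoder`, except that codes follow order of first appearance rather than sorted order)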

In [7]:
def factorize(df1, df2, column):
    """Replace ``column`` in both frames with integer codes shared across them.

    The values of ``df1[column]`` followed by ``df2[column]`` are factorized
    together, so equal values receive the same code in both frames. Both frames
    are modified in place and also returned.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)``, the same objects that were passed in.
    """
    # pd.concat instead of Series.append, which was removed in pandas 2.0.
    combined = pd.concat([df1[column], df2[column]])
    codes = combined.factorize()[0]
    n_first = len(df1)
    # codes is positional: the first len(df1) entries belong to df1, the rest to df2.
    df1[column] = codes[:n_first]
    df2[column] = codes[n_first:]
    return df1, df2

In [17]:
# Inspect what Series.factorize returns: element 0 holds the integer codes
# (one per row), element 1 the unique values those codes index into.
# Factorize once and use print() calls so the cell also runs on Python 3.
bld_codes, bld_uniques = X_train['building_id'].factorize()
print(bld_codes)
print(bld_uniques)
print("{} {}".format(bld_codes.shape, bld_uniques.shape))

[   0    1    2 ..., 5867   47 1359]
Index([u'2a21319016fe50100e0b8ebb5a4f9cf0',
       u'104bfeddd65a0890b071c3a09cf81704',
       u'8775706158cbc96d12dd441d42e11deb',
       u'bc4e62116277654d4df66ab77a1152f8',
       u'18c5b031bad8cef779efa7e2398a42a3', u'0',
       u'a5c5a4cc6ef64d40bda51b452e3af4ea',
       u'b6e23dcf448fcaef1ded4a4fd869b82d',
       u'9a66a4682c43ff01758518b6a23c4dde',
       u'696917fb3bd7ea448cdf7fc79aeb78c2',
       ...
       u'ab0bd3836225c0acb3cf3077f0b9e9a9',
       u'45f2ab6987b543ae051bd40d5cb875f8',
       u'b4313c13d1141438c0a80bd494ca1b83',
       u'361fd472fb724af8354ed2f097645e60',
       u'ca82e7287cecb2e8e077e88d04a8a18e',
       u'cb87da191382fa9fecad08f8f1337417',
       u'517ee72f38f6a21c19d8c25607c95300',
       u'1bb8124c124498219af05125c4058bf9',
       u'a81dafc1ae99df742983bf320922054b',
       u'bd255b014cba65c3586be50477567f2d'],
      dtype='object', length=7585)
(49352,) (7585,)


In [16]:
# Shape of the array of distinct building_id values in the training frame.
X_train['building_id'].unique().shape

(7585,)

In [19]:
# Largest and smallest integer code that factorize assigns to building_id.
X_train['building_id'].factorize()[0].max(), X_train['building_id'].factorize()[0].min()

(7584, 0)

In [38]:
def designate_single_observations(df1, df2, column):
    """Overwrite values of ``column`` that occur at most once with -1.

    Occurrences are counted over ``df1[column]`` and ``df2[column]`` combined,
    so a value seen once in each frame is kept. Both frames are modified in
    place and also returned.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the column to process.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)``, the same objects that were passed in.
    """
    # pd.concat replaces Series.append (removed in pandas 2.0). value_counts + map
    # replaces the to_frame/rename/join round trip, which depended on the
    # grouped size Series being unnamed.
    counts = pd.concat([df1[column], df2[column]]).value_counts()
    for frame in (df1, df2):
        occurrences = frame[column].map(counts)
        # Missing values map to NaN, and NaN <= 1 is False, so they are left alone.
        frame.loc[occurrences <= 1, column] = -1
    return df1, df2

In [27]:
# Show that grouping a Series by itself and taking size() gives the same
# counts as value_counts(). print() calls keep the cell valid on Python 3.
temp = X_train['building_id']
print(temp.groupby(temp).size().sort_values(ascending=False).head(10))
print(temp.value_counts().head(10))  # same counts as the groupby line above

building_id
0                                   8286
96274288c84ddd7d5c5d8e425ee75027     275
11e1dec9d14b1a9e528386a2504b3afc     215
80a120d6bc3aba97f40fee8c2204524b     213
bb8658a3e432fb62a440615333376345     212
f68bf347f99df026f4faad43cc604048     191
c94301249b8c09429d329864d58e5b82     167
ce6d18bf3238e668b2bf23f4110b7b67     165
57ef86c28a8ae482dc3a3c3af28e8e48     159
128d4af0683efc5e1eded8dc8044d5e3     153
dtype: int64
0                                   8286
96274288c84ddd7d5c5d8e425ee75027     275
11e1dec9d14b1a9e528386a2504b3afc     215
80a120d6bc3aba97f40fee8c2204524b     213
bb8658a3e432fb62a440615333376345     212
f68bf347f99df026f4faad43cc604048     191
c94301249b8c09429d329864d58e5b82     167
ce6d18bf3238e668b2bf23f4110b7b67     165
57ef86c28a8ae482dc3a3c3af28e8e48     159
128d4af0683efc5e1eded8dc8044d5e3     153
Name: building_id, dtype: int64


In [29]:
# Per-value counts as a one-column frame; the rename maps a column labelled 0 to "size".
temp.groupby(temp).size().to_frame().rename(columns={0: "size"}).head(4)

Unnamed: 0_level_0,size
building_id,Unnamed: 1_level_1
0,8286
00005cb939f9986300d987652c933e15,1
00024d77a43f0606f926e2312513845c,3
000ae4b7db298401cdae2b0ba1ea8146,1


In [45]:
def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """
    Encode a high-cardinality categorical column as a smoothed target mean.

    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca

    For every category of ``variable`` seen in ``train_df`` the encoded value is
    ``lambda * mean(target) + (1 - lambda) * prior_prob`` with
    ``lambda = 1 / (g + exp((k - size) / f))``, where ``size`` is the number of
    training rows in the category. Rows of ``test_df`` whose category is absent
    from ``train_df`` receive ``prior_prob``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Rows used to compute per-category size and target mean.
    test_df : pandas.DataFrame
        Rows to encode.
    variable : str
        Name of the categorical column.
    target : str
        Name of the target column in ``train_df``.
    prior_prob : float
        Value blended in for small categories and used for unseen ones.
    k, f, g : numbers
        Parameters of the ``lambda`` formula above (with ``g=1`` lambda is 0.5
        when a category has exactly ``k`` training rows).
    r_k : float or None
        If truthy, each encoded value is multiplied by noise drawn from
        ``np.random.uniform(1 - r_k, 1 + r_k)`` (global NumPy RNG).
    update_df : pandas.DataFrame or None
        Frame that receives the column ``hcc_<variable>_<target>``; defaults to
        ``test_df``. Values are written by index alignment via ``DataFrame.update``.

    Returns
    -------
    None
        The result is written into ``update_df`` in place.
    """
    hcc_name = "_".join(["hcc", variable, target])

    # List form of agg: the dict-renaming form on a grouped Series raises
    # SpecificationError on pandas >= 1.0. Columns come out named "size" and "mean".
    grouped = train_df.groupby(variable)[target].agg(["size", "mean"])
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    # Join only the encoded column so helper columns cannot clash with `variable`.
    df = test_df[[variable]].join(grouped[[hcc_name]], on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k:
        # Add uniform noise. Not mentioned in original paper
        df *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))

    if update_df is None:
        update_df = test_df
    if hcc_name not in update_df.columns:
        update_df[hcc_name] = np.nan
    update_df.update(df)
    return

### Encode the target as integers, one-hot encode it, and compute the class priors

In [31]:
# Make target integer, one hot encoded, calculate target priors
X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
X_train = X_train.join(pd.get_dummies(X_train["interest_level"], prefix="pred").astype(int))
prior_0, prior_1, prior_2 = X_train[["pred_0", "pred_1", "pred_2"]].mean()

### Add common features

In [32]:
# Apply the same feature engineering to both frames and keep the cleaned frames it returns.
X_train = add_features(X_train)
X_test = add_features(X_test)

In [36]:
# Raise the pandas column display limit to 100, then preview the first two rows.
pd.options.display.max_columns = 100
X_train.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,pred_0,pred_1,pred_2,photo_count,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc
111817,1.0,1,2a21319016fe50100e0b8ebb5a4f9cf0,2016-04-01 22:12:41,X-LARGE Flex 1BR Loft! ~~ PRIME Greenwich Vill...,astor place,"[Doorman, Elevator, Laundry In Building]",2,40.7302,6811957,-73.9924,f07272f8ceb99db4c1a7cbbd9ae7b75b,[https://photos.renthop.com/2/6811957_3dad56e8...,3195,1 astor place,0,0,1,5,94,3195.0,3195.0,1597.5,1.0,0.0,2.0,0.5
117995,1.0,0,104bfeddd65a0890b071c3a09cf81704,2016-04-01 22:56:00,"This Enormous Studio Features: Harwood Floors,...",east 54th street,"[Cats Allowed, Dogs Allowed, No Fee, Laundry I...",1,40.7576,6811965,-73.9677,3b630ec9cb6eee53b92cfac7f42e3bf4,[https://photos.renthop.com/2/6811965_b8f942e6...,2000,230 east 54th street,0,1,0,2,357,-1.0,2000.0,2000.0,0.0,-1.0,1.0,0.0


In [42]:
# Shape of the training frame after the engineered columns were added.
X_train.shape

(49352, 27)

### Special designation for building_ids, manager_ids, display_address with only 1 observation

In [39]:
# For each of these columns, overwrite values seen at most once across train+test with -1.
for col in ('building_id', 'manager_id', 'display_address'):
    X_train, X_test = designate_single_observations(X_train, X_test, col)

In [44]:
# Preview training rows whose building_id was replaced by the -1 marker.
X_train[X_train.building_id==-1].head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,pred_0,pred_1,pred_2,photo_count,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc
114382,1.0,0,-1,2016-04-02 01:39:01,,west 83rd street,[Furnished],0,40.7844,6812685,-73.9728,62b685cc0d876c3a1a51d63a0d6a8082,[],2995,67 west 83rd street,1,0,0,0,0,-1.0,2995.0,2995.0,0.0,-1.0,1.0,0.0
115406,1.0,2,-1,2016-04-02 01:40:30,Be the first to live in this brand new Complet...,-1,"[Prewar, Dining Room, Hardwood Floors, High Ce...",1,40.7701,6812744,-73.9067,-1,[https://photos.renthop.com/2/6812744_592039b5...,2495,42-09 23rd avenue,0,1,0,5,928,1247.5,2495.0,831.666667,2.0,1.0,3.0,0.666667
94607,1.0,2,-1,2016-04-02 02:10:08,24 story luxury building in Manhattan’s Murray...,"e 39th st,","[Doorman, Elevator, Fitness Center, Stainless ...",0,40.7492,6812815,-73.9789,9d9e32a8be582241565541ec8073a1c8,[https://photos.renthop.com/2/6812815_cd9257bf...,3199,"112 e 39th st,",1,0,0,5,324,1599.5,3199.0,1066.333333,2.0,1.0,3.0,0.666667


### High-Cardinality Categorical encoding

In [46]:
skf = StratifiedKFold(n_splits=5)
target_priors = zip(("pred_1", "pred_2"), (prior_1, prior_2))
attributes = product(("building_id", "manager_id"), target_priors)
for variable, target_prior in attributes:
    target, prior = target_prior
    # Test rows: encode from the full training frame, without noise.
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    # Training rows: encode each fold from the other folds only (with 1% noise),
    # writing the values back into X_train by index.
    fold_indices = skf.split(np.zeros(len(X_train)), X_train['interest_level'])
    for train, test in fold_indices:
        fit_rows = X_train.iloc[train]
        apply_rows = X_train.iloc[test]
        hcc_encode(fit_rows, apply_rows, variable, target, prior, k=5, r_k=0.01, update_df=X_train)

### Factorize building_id, display_address, manager_id, street_address

In [48]:
# Replace each of these columns with integer codes shared between train and test.
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    X_train, X_test = factorize(X_train, X_test, col)

In [49]:
# Preview the first three training rows after factorizing.
X_train.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,pred_0,pred_1,pred_2,photo_count,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc,hcc_building_id_pred_1,hcc_building_id_pred_2,hcc_manager_id_pred_1,hcc_manager_id_pred_2
111817,1.0,1,0,2016-04-01 22:12:41,X-LARGE Flex 1BR Loft! ~~ PRIME Greenwich Vill...,0,"[Doorman, Elevator, Laundry In Building]",2,40.7302,6811957,-73.9924,0,[https://photos.renthop.com/2/6811957_3dad56e8...,3195,0,0,0,1,5,94,3195.0,3195.0,1597.5,1.0,0.0,2.0,0.5,0.212999,5.354268e-14,0.43172,0.212433
117995,1.0,0,1,2016-04-01 22:56:00,"This Enormous Studio Features: Harwood Floors,...",1,"[Cats Allowed, Dogs Allowed, No Fee, Laundry I...",1,40.7576,6811965,-73.9677,1,[https://photos.renthop.com/2/6811965_b8f942e6...,2000,1,0,1,0,2,357,-1.0,2000.0,2000.0,0.0,-1.0,1.0,0.0,0.215991,0.07402877,0.330009,0.498483
114617,2.0,3,2,2016-04-01 22:57:15,--- East 31st St & Lexington Avenue --- This S...,2,"[Common Outdoor Space, Cats Allowed, Private O...",2,40.7388,6811966,-73.9851,1,[https://photos.renthop.com/2/6811966_8b83c24d...,5850,2,0,0,1,7,411,1950.0,2925.0,1170.0,1.5,1.0,5.0,0.6,0.466949,0.1988932,0.333489,0.501509


### Create binarized features

#### Understand what MultiLabelBinarizer does ...

In [50]:
# Normalise every feature tag: drop no-break spaces, trim, lower-case, and join
# words with underscores. The u"" prefix matters on Python 2, where a plain
# "\u00a0" literal is the six-character escape text and would never match.
fmt = lambda feat: [s.replace(u"\u00a0", "").strip().lower().replace(" ", "_") for s in feat]  # format features
X_train["features"] = X_train["features"].apply(fmt)
X_test["features"] = X_test["features"].apply(fmt)

# Count each tag over train and test together and keep the frequent ones.
features = [f for f_list in list(X_train["features"]) + list(X_test["features"]) for f in f_list]
ps = pd.Series(features)
grouped = ps.groupby(ps).size()
features = grouped[grouped >= 10].index.sort_values().values    # limit to features with >=10 observations

# Fitting on a single "sample" that contains every kept tag fixes the class list.
mlb = MultiLabelBinarizer().fit([features])
columns = ['feature_' + s for s in mlb.classes_]

# Membership against a set is a hash lookup; testing against the classes_ array
# compared each tag with every class.
known_features = set(mlb.classes_)
flt = lambda l: [i for i in l if i in known_features]     # filter out features not present in MultiLabelBinarizer
X_train = X_train.join(pd.DataFrame(data=mlb.transform(X_train["features"].apply(flt)), columns=columns, index=X_train.index))
X_test = X_test.join(pd.DataFrame(data=mlb.transform(X_test["features"].apply(flt)), columns=columns, index=X_test.index))

In [52]:
# Preview the first five training rows, now including the feature_* indicator columns.
X_train.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,pred_0,pred_1,pred_2,photo_count,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc,hcc_building_id_pred_1,hcc_building_id_pred_2,hcc_manager_id_pred_1,hcc_manager_id_pred_2,feature_1_month_free,feature_24/7_concierge,feature_24/7_doorman,feature_24/7_doorman_concierge,feature_actual_apt._photos,feature_air_conditioning,feature_all_pets_ok,feature_all_utilities_included,feature_assigned-parking-space,feature_attended_lobby,feature_backyard,feature_balcony,feature_basement_storage,feature_basketball_court,feature_bike_room,feature_bike_storage,feature_billiards_room,feature_billiards_table_and_wet_bar,feature_brand_new,...,feature_shares_ok,feature_short_term_allowed,feature_simplex,feature_skylight,feature_skylight_atrium,feature_southern_exposure,feature_spa_services,feature_ss_appliances,feature_stainless_steel,feature_stainless_steel_appliances,feature_state-of-the-art_fitness_center,feature_storage,feature_storage_available,feature_storage_facilities_available,feature_storage_room,feature_sublet,feature_subway,feature_sundeck,feature_swimming_pool,feature_tenant_lounge,feature_terrace,feature_terraces_/_balconies,feature_tons_of_natural_light,feature_valet,feature_valet_parking,feature_valet_service,feature_valet_services,feature_valet_services_including_dry_cleaning,feature_video_intercom,feature_view,feature_virtual_doorman,feature_virtual_tour,feature_walk-in_closet,feature_walk-up,feature_walk_in_closet,feature_walk_in_closet(s),feature_washer/dryer,feature_washer/dryer_hookup,feature_washer/dryer_in-unit,feature_washer/dryer_in_building,feature_washer/dryer_in_unit,feature_washer_&_dryer,feature_washer_in_unit,feature_wheelchair_access,feature_wheelchair_ramp,feature_wifi,feature_wifi_access,feature_wood-burning_fireplace,feature_yard,feature_yo
ga_classes
111817,1.0,1,0,2016-04-01 22:12:41,X-LARGE Flex 1BR Loft! ~~ PRIME Greenwich Vill...,0,"[doorman, elevator, laundry_in_building]",2,40.7302,6811957,-73.9924,0,[https://photos.renthop.com/2/6811957_3dad56e8...,3195,0,0,0,1,5,94,3195.0,3195.0,1597.5,1.0,0.0,2.0,0.5,0.212999,5.354268e-14,0.43172,0.212433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117995,1.0,0,1,2016-04-01 22:56:00,"This Enormous Studio Features: Harwood Floors,...",1,"[cats_allowed, dogs_allowed, no_fee, laundry_i...",1,40.7576,6811965,-73.9677,1,[https://photos.renthop.com/2/6811965_b8f942e6...,2000,1,0,1,0,2,357,-1.0,2000.0,2000.0,0.0,-1.0,1.0,0.0,0.215991,0.07402877,0.330009,0.498483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
114617,2.0,3,2,2016-04-01 22:57:15,--- East 31st St & Lexington Avenue --- This S...,2,"[common_outdoor_space, cats_allowed, private_o...",2,40.7388,6811966,-73.9851,1,[https://photos.renthop.com/2/6811966_8b83c24d...,5850,2,0,0,1,7,411,1950.0,2925.0,1170.0,1.5,1.0,5.0,0.6,0.466949,0.1988932,0.333489,0.501509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117474,1.0,1,3,2016-04-01 23:26:07,Reduced Fee!! Priced To Rent!\rLarge Newly Upd...,3,"[common_outdoor_space, cats_allowed, dogs_allo...",1,40.7939,6811973,-73.9738,2,[https://photos.renthop.com/2/6811973_c87c8f6d...,2745,3,0,1,0,5,816,2745.0,2745.0,1372.5,1.0,0.0,2.0,0.5,0.237941,0.06869109,0.223406,0.001406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103891,1.0,1,4,2016-04-02 00:48:13,Phenomenal deal of the century!! This spacious...,4,"[cats_allowed, dogs_allowed, doorman, elevator...",1,40.7784,6811975,-73.9491,3,[https://photos.renthop.com/2/6811975_370cb787...,2400,4,0,1,0,7,536,2400.0,2400.0,1200.0,1.0,0.0,2.0,0.5,0.133603,0.0660451,0.529925,0.104448,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Save data

In [None]:
# The save step is kept inside a string literal so that running the cell writes nothing.
# The test frame now goes to its own file next to the train file; it previously
# targeted "data/train_rakhlin.csv" (wrong directory and the train file name).
'''
X_train = X_train.sort_index(axis=1).sort_values(by="listing_id")
X_test = X_test.sort_index(axis=1).sort_values(by="listing_id")
columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "description", "features", "created"]
X_train.drop([c for c in X_train.columns if c in columns_to_drop], axis=1).\
    to_csv("../Data/train_rakhlin.csv", index=False, encoding='utf-8')
X_test.drop([c for c in X_test.columns if c in columns_to_drop], axis=1).\
    to_csv("../Data/test_rakhlin.csv", index=False, encoding='utf-8')
'''