In [14]:
import warnings
# Install a global "ignore" filter so that no warnings are shown in any of the
# cells below (presumably to keep the cell outputs short).
# NOTE(review): this also hides deprecation warnings from pandas / xgboost that
# may point at real problems — consider narrowing the filter to a category.
warnings.filterwarnings('ignore')

In [15]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [16]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost ``multi:softprob`` model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and integer class labels; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is added to the watchlist
        and training stops early once its mlogloss has not improved for 30
        rounds. When omitted, exactly ``num_rounds`` rounds are trained.
    feature_names : sequence of str, optional
        Names attached to the columns of both DMatrix objects. ``None`` (the
        default) lets xgboost assign its own names.
    seed_val : int
        Value of the ``seed`` booster parameter.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model)
        ``pred_test_y`` is the output of ``model.predict`` on ``test_X``;
        ``model`` is the trained booster returned by ``xgb.train``.
    """
    # Booster configuration; insertion order is kept as it was originally.
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': seed_val,
    }
    plst = list(param.items())

    # feature_names used to be accepted but never forwarded; pass it through so
    # callers that supply names actually get them on the trained model.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test data: monitor it and stop early on stalled mlogloss.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=30)
    else:
        # No labels: train for the full number of rounds, no monitoring.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    # NOTE(review): after early stopping, whether predict() uses the best
    # iteration or every trained round depends on the xgboost version — confirm.
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

Let us read the train and test files and store them as DataFrames.

In [17]:
# Prefix prepended to the file names; the empty string means the files are
# looked up in the current working directory.
data_path = ""
train_file = "{}train.json".format(data_path)
test_file = "{}test.json".format(data_path)

# Load both splits, then report their (rows, columns) shapes, one per line.
train_df, test_df = (pd.read_json(json_path) for json_path in (train_file, test_file))
print(train_df.shape, test_df.shape, sep="\n")

(49352, 15)
(74659, 14)


We do not need any pre-processing for the numerical features, so we simply create a list of them.

In [18]:
# Numeric columns that are passed to the model unchanged. Later cells append
# the engineered and label-encoded columns to this list.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
]

Now let us create some new features from the given features.

In [19]:
# Derive the count- and date-based columns. The same steps are applied to the
# train and the test frame, so they run once per frame inside a loop.
for frame in (train_df, test_df):
    # Number of photos and number of listed "features" per listing.
    frame["num_photos"] = frame["photos"].apply(len)
    frame["num_features"] = frame["features"].apply(len)

    # Number of single-space-separated chunks in the description text.
    frame["num_description_words"] = frame["description"].apply(
        lambda text: len(text.split(" "))
    )

    # Parse the creation time so its calendar components can be extracted.
    frame["created"] = pd.to_datetime(frame["created"])
    for part in ("year", "month", "day", "hour"):
        frame["created_" + part] = getattr(frame["created"].dt, part)

# Register the new columns, together with the raw listing_id, as model inputs.
features_to_use += [
    "num_photos",
    "num_features",
    "num_description_words",
    "created_year",
    "created_month",
    "created_day",
    "listing_id",
    "created_hour",
]

We have 4 categorical features in our data

 - display_address
 - manager_id
 - building_id
 - street_address

So let us label encode these features.

In [20]:
# Replace each string-valued categorical column with integer codes. The encoder
# is fitted on the train and test values together, so every value seen in
# either split has a code.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for col in categorical:
    # Columns that are not object-typed are left alone and not registered.
    if train_df[col].dtype != 'object':
        continue

    encoder = preprocessing.LabelEncoder()
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    encoder.fit(train_values + test_values)

    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)
    features_to_use.append(col)

In [21]:
# Show the column index of each split (train first, then test).
for frame in (train_df, test_df):
    print(frame.columns)

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'num_photos', 'num_features', 'num_description_words',
       'created_year', 'created_month', 'created_day', 'created_hour'],
      dtype='object')
Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'num_photos',
       'num_features', 'num_description_words', 'created_year',
       'created_month', 'created_day', 'created_hour'],
      dtype='object')


In [22]:
# Per-listing image timestamps, read from a separate CSV file.
image_date = pd.read_csv("listing_image_time.csv")

# rename columns so you can join tables later on
image_date.columns = ["listing_id", "time_stamp"]

# reassign the only one timestamp from April, all others from Oct/Nov
# (the row is addressed by its index label, 80240)
image_date.loc[80240, "time_stamp"] = 1478129766

# time_stamp holds seconds since the Unix epoch.
image_date["img_date"] = pd.to_datetime(image_date["time_stamp"], unit="s")
img_dt = image_date["img_date"].dt

# Whole days between each image and the most recent image in the file.
# `.dt.days` replaces `.astype("timedelta64[D]").astype(int)`, which raises on
# pandas >= 2.0 because day resolution is no longer a supported timedelta unit.
image_date["img_days_passed"] = (image_date["img_date"].max() - image_date["img_date"]).dt.days
image_date["img_date_month"] = img_dt.month

# `Series.dt.week` was removed in pandas 2.0; use the ISO calendar week where
# available and fall back to the old accessor only on pandas that lacks it.
if hasattr(img_dt, "isocalendar"):
    image_date["img_date_week"] = img_dt.isocalendar().week.astype("int64")
else:
    image_date["img_date_week"] = img_dt.week

image_date["img_date_day"] = img_dt.day
image_date["img_date_dayofweek"] = img_dt.dayofweek
image_date["img_date_dayofyear"] = img_dt.dayofyear
image_date["img_date_hour"] = img_dt.hour

# Bucket the day of month: 1 for days 1-9, 2 for days 10-19, 3 for day 20 on.
image_date["img_date_monthBeginMidEnd"] = image_date["img_date_day"].apply(
    lambda x: 1 if x < 10 else 2 if x < 20 else 3
)

# Left joins keep every listing, including any without an image timestamp.
# NOTE: re-running this cell merges a second time and produces suffixed
# duplicate columns; restart the kernel before re-running.
train_df = pd.merge(train_df, image_date, on="listing_id", how="left")
test_df = pd.merge(test_df, image_date, on="listing_id", how="left")

In [23]:
# Show the column index of each split after the image-date merge.
for frame in (train_df, test_df):
    print(frame.columns)

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'num_photos', 'num_features', 'num_description_words',
       'created_year', 'created_month', 'created_day', 'created_hour',
       'time_stamp', 'img_date', 'img_days_passed', 'img_date_month',
       'img_date_week', 'img_date_day', 'img_date_dayofweek',
       'img_date_dayofyear', 'img_date_hour', 'img_date_monthBeginMidEnd'],
      dtype='object')
Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'num_photos',
       'num_features', 'num_description_words', 'created_year',
       'created_month', 'created_day', 'created_hour', 'time_stamp',
       'img_date', 'img_days_passed', 'img_date_month', 

In [24]:
# NOTE: this cell is disabled, so the img_* columns merged into train_df and
# test_df are NOT added to features_to_use and are not used by the model.
# NOTE(review): the [2:] slice starts at `img_date`, a datetime column —
# presumably it has to be excluded before these can be stacked; verify.
# for f in image_date.columns.tolist()[2:]:
#     features_to_use.append(f)
# features_to_use

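The `features` column holds a list of strings for each listing. We first replace the spaces inside each feature with underscores (so that a multi-word feature stays a single token), join the features of a listing into one space-separated string, and then apply a count vectorizer on top of it.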

In [25]:
def _as_token_string(feature_list):
    """Join the phrases into one space-separated string, replacing the spaces
    inside each phrase with underscores so every phrase becomes one token."""
    return " ".join(phrase.replace(" ", "_") for phrase in feature_list)


# Overwrite the list-valued column with its single-string form in both splits.
train_df['features'] = train_df["features"].apply(_as_token_string)
test_df['features'] = test_df["features"].apply(_as_token_string)
print(train_df["features"].head())

# Token counts limited to 200 vocabulary entries. Despite its name, `tfidf`
# holds a CountVectorizer. The vocabulary is learned from the training split
# only and then reused to transform the test split.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

0                                                     
1    Doorman Elevator Fitness_Center Cats_Allowed D...
2    Laundry_In_Building Dishwasher Hardwood_Floors...
3                               Hardwood_Floors No_Fee
4                                              Pre-War
Name: features, dtype: object


Now let us stack both the dense and sparse features into a single dataset and also get the target variable.

In [26]:
# Confirm that every selected model input column is numeric.
test_df.loc[:, features_to_use].dtypes

bathrooms                float64
bedrooms                   int64
latitude                 float64
longitude                float64
price                      int64
num_photos                 int64
num_features               int64
num_description_words      int64
created_year               int64
created_month              int64
created_day                int64
listing_id                 int64
created_hour               int64
display_address            int64
manager_id                 int64
building_id                int64
street_address             int64
dtype: object

In [27]:
# Place the dense model columns next to the token-count matrix and convert the
# result to CSR; the same construction is used for the train and test split.
train_X, test_X = (
    sparse.hstack([frame[features_to_use], token_counts]).tocsr()
    for frame, token_counts in ((train_df, tr_sparse), (test_df, te_sparse))
)

# Integer class codes for the target. The codes 0/1/2 match the column order
# ("high", "medium", "low") used when the predictions are written out.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda level: target_num_map[level]))
print(train_X.shape, test_X.shape)

(49352, 217) (74659, 217)


Now let us do some cross validation to check the scores. 

To save time, only the first of the five folds is run here (note the `break` at the end of the loop). Remove the `break` and run it locally to get the full cross-validation scores.

In [28]:
# Validation check on a shuffled 5-fold split. Only the first fold is actually
# trained because of the `break` at the end of the loop body.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
n_rows = train_X.shape[0]
for dev_index, val_index in kf.split(range(n_rows)):
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    # Passing val_y makes runXGB watch the validation fold and stop early.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)

    # Stop after one fold; delete this line to score all five folds.
    break

[0]	train-mlogloss:1.04102	test-mlogloss:1.04206
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 30 rounds.
[1]	train-mlogloss:0.98849	test-mlogloss:0.990561
[2]	train-mlogloss:0.943702	test-mlogloss:0.94669
[3]	train-mlogloss:0.904819	test-mlogloss:0.908604
[4]	train-mlogloss:0.870021	test-mlogloss:0.874652
[5]	train-mlogloss:0.839307	test-mlogloss:0.844943
[6]	train-mlogloss:0.813479	test-mlogloss:0.819983
[7]	train-mlogloss:0.790236	test-mlogloss:0.797448
[8]	train-mlogloss:0.770182	test-mlogloss:0.778248
[9]	train-mlogloss:0.751936	test-mlogloss:0.760885
[10]	train-mlogloss:0.7354	test-mlogloss:0.745048
[11]	train-mlogloss:0.722037	test-mlogloss:0.732237
[12]	train-mlogloss:0.708419	test-mlogloss:0.719473
[13]	train-mlogloss:0.697253	test-mlogloss:0.70915
[14]	train-mlogloss:0.686692	test-mlogloss:0.699255
[15]	train-mlogloss:0.676662	test-mlogloss:0.690078
[16]	train-mlogloss:0.667788	test-

[155]	train-mlogloss:0.450641	test-mlogloss:0.555883
[156]	train-mlogloss:0.450034	test-mlogloss:0.555679
[157]	train-mlogloss:0.449586	test-mlogloss:0.555602
[158]	train-mlogloss:0.448978	test-mlogloss:0.555536
[159]	train-mlogloss:0.448257	test-mlogloss:0.555338
[160]	train-mlogloss:0.447706	test-mlogloss:0.555221
[161]	train-mlogloss:0.446962	test-mlogloss:0.555187
[162]	train-mlogloss:0.446253	test-mlogloss:0.555089
[163]	train-mlogloss:0.445537	test-mlogloss:0.554996
[164]	train-mlogloss:0.444998	test-mlogloss:0.554969
[165]	train-mlogloss:0.444134	test-mlogloss:0.554862
[166]	train-mlogloss:0.443509	test-mlogloss:0.554728
[167]	train-mlogloss:0.442758	test-mlogloss:0.55467
[168]	train-mlogloss:0.442086	test-mlogloss:0.554563
[169]	train-mlogloss:0.441512	test-mlogloss:0.554377
[170]	train-mlogloss:0.44108	test-mlogloss:0.554378
[171]	train-mlogloss:0.440594	test-mlogloss:0.554207
[172]	train-mlogloss:0.439963	test-mlogloss:0.553982
[173]	train-mlogloss:0.439448	test-mlogloss:0.55

[311]	train-mlogloss:0.36978	test-mlogloss:0.54852
[312]	train-mlogloss:0.369256	test-mlogloss:0.548568
[313]	train-mlogloss:0.368811	test-mlogloss:0.548615
[314]	train-mlogloss:0.368329	test-mlogloss:0.548648
[315]	train-mlogloss:0.368023	test-mlogloss:0.54864
[316]	train-mlogloss:0.367469	test-mlogloss:0.548639
[317]	train-mlogloss:0.366897	test-mlogloss:0.548632
[318]	train-mlogloss:0.366648	test-mlogloss:0.54866
[319]	train-mlogloss:0.366329	test-mlogloss:0.548652
[320]	train-mlogloss:0.365928	test-mlogloss:0.548758
[321]	train-mlogloss:0.36534	test-mlogloss:0.548657
[322]	train-mlogloss:0.364877	test-mlogloss:0.548706
[323]	train-mlogloss:0.36455	test-mlogloss:0.548812
[324]	train-mlogloss:0.36411	test-mlogloss:0.548762
[325]	train-mlogloss:0.363691	test-mlogloss:0.548728
[326]	train-mlogloss:0.363181	test-mlogloss:0.548649
[327]	train-mlogloss:0.36287	test-mlogloss:0.548637
[328]	train-mlogloss:0.362528	test-mlogloss:0.548697
[329]	train-mlogloss:0.362216	test-mlogloss:0.548751
[

Now let us build the final model and get the predictions on the test set.

In [29]:
# Fit on the whole training set. No test labels are passed, so runXGB trains
# for exactly 400 rounds without early stopping.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)

# One probability column per class, in the order of the target codes 0, 1, 2,
# plus the listing identifier; written to disk without the row index.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb.csv", index=False)


I hope this serves as a good starting point for Python users.