In [19]:
import pandas as pd
import numpy as np
import scipy.sparse


import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import FeatureHasher
import scipy.sparse
from sklearn.pipeline import Pipeline


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [59]:
train = pd.read_json('input/train.json', orient='columns')
train.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10.0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000.0,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue


## Helper Functions

In [3]:
def createSamples(X_train, y_train, X_test, y_test, num_train= 1000, num_test=400):
    
    nums = np.arange(X_train.shape[0])
    np.random.shuffle(nums)
    X_train_samp = X_train.iloc[nums[:num_train]]
    y_train_samp = y_train.iloc[nums[:num_train]]


    test_nums = np.arange(X_test.shape[0])
    np.random.shuffle(test_nums)

    X_test_samp = X_test.iloc[test_nums[:num_test]]
    y_test_samp = y_test.iloc[test_nums[:num_test]]
    return X_train_samp, y_train_samp, X_test_samp, y_test_samp

def get_accuracy(preds, y_test_samp):
    return (preds == y_test_samp).sum() / preds.shape[0] 

In [4]:
def subAndSample(subset, num_train=20000, num_test=2000, comp=False):
    X = subset.drop('enc_interest', axis=1)
    y = subset.enc_interest
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    if comp:
        return X_train, y_train, X_test, y_test 
    else:
        return createSamples(X_train, y_train, X_test, y_test, num_train=num_train, num_test=num_test)

In [8]:
def standardize(X_train, X_test):
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test

In [16]:
def hashFeature(X_train, X_test, feature):
    h = FeatureHasher(input_type='string')
    hashed = h.fit_transform(X_train[feature])
    test_hashed = h.transform(X_test[feature])
    return hashed, test_hashed

In [60]:
train['enc_interest'] = train.interest_level.map({'low': 0, 'medium': 1, 'high':2})

train['created'] = pd.to_datetime(train.created)
train['hour'] = train.created.dt.hour
train['feat_str'] = train.features.apply(lambda x: " ".join(x))

train['num_photos'] = train.photos.apply(lambda x: len(x))
train['no_photos'] = train.num_photos == 0
train['no_photos'] = train.no_photos.astype(int)

In [61]:
X_train, y_train, X_test, y_test = subAndSample(train, comp=True)

tr_build_id, test_build_id = hashFeature(X_train, X_test, 'building_id')
tr_man_id, test_man_id = hashFeature(X_train, X_test, 'manager_id')



text_proc_feat = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])
text_proc_desc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])

text_proc_feat.fit(X_train.feat_str.values)
tr_feat_text = text_proc_feat.transform(X_train.feat_str.values)
test_feat_text = text_proc_feat.transform(X_test.feat_str.values)


text_proc_desc.fit(X_train.description.values)
tr_desc_mat = text_proc_desc.transform(X_train.description.values)
test_desc_mat = text_proc_desc.transform(X_test.description.values)


In [62]:
to_drop = ['building_id', 'created', 'description',
       'display_address', 'features', 'feat_str', 'interest_level',
       'listing_id',  'manager_id', 'photos', 'street_address']


X_train = X_train.drop(to_drop, axis=1)
X_test = X_test.drop(to_drop, axis=1)


In [63]:
X_train.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,hour,num_photos,no_photos
19544.0,1.0,2,40.7492,-73.9789,3090,3,8,0
62173.0,2.0,2,40.7084,-74.0057,5075,6,5,0
106499.0,1.0,2,40.753,-73.9695,4000,6,6,0
93075.0,1.0,0,40.7338,-73.9865,2500,6,3,0
49075.0,2.0,3,40.7252,-73.9947,5750,6,6,0


In [64]:
X_train, X_test = standardize(X_train,X_test)

X_train_concat = scipy.sparse.hstack((X_train,
                                      tr_build_id, 
                                      tr_man_id,
                                      tr_feat_text, 
                                      tr_desc_mat))

X_test_concat = scipy.sparse.hstack((X_test, 
                                     test_build_id, 
                                     test_man_id,
                                     test_feat_text, 
                                     test_desc_mat))


In [65]:
dtrain = xgb.DMatrix(X_train_concat, label= y_train)
dval = xgb.DMatrix(X_test_concat, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 20
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.295256	val-merror:0.296221
[1]	train-merror:0.293635	val-merror:0.294398
[2]	train-merror:0.284415	val-merror:0.287408
[3]	train-merror:0.280439	val-merror:0.287914
[4]	train-merror:0.276462	val-merror:0.284571
[5]	train-merror:0.272713	val-merror:0.282038
[6]	train-merror:0.269598	val-merror:0.282747
[7]	train-merror:0.268914	val-merror:0.281836
[8]	train-merror:0.26666	val-merror:0.278594
[9]	train-merror:0.262683	val-merror:0.2791
[10]	train-merror:0.260834	val-merror:0.277682
[11]	train-merror:0.259163	val-merror:0.27829
[12]	train-merror:0.256326	val-merror:0.274035
[13]	train-merror:0.254553	val-merror:0.274947
[14]	train-merror:0.252425	val-merror:0.27363
[15]	train-merror:0.250424	val-merror:0.273225
[16]	train-merror:0.248677	val-merror:0.272515
[17]	train-merror:0.246929	val-merror:0.273123
[18]	train-merror:0.245586	val-merror:0.273529
[19]	train-merror:0.243408	val-merror:0.273022


In [66]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.611140264479


0.5554388867362321