In [302]:
import pandas as pd
import numpy as np
import scipy.sparse


import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:

train = pd.read_json('input/train.json', orient='columns')
train.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10.0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000.0,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue


- start with bath, beds, manager id, build id, price
- transform manager id and building id into average interest level
- standardize inputs
- split into train/validation stratfied
- take sample from both train and validation
- train lr

## Helper Functions

In [3]:
def createSamples(X_train, y_train, X_test, y_test, num_train= 1000, num_test=400):
    
    nums = np.arange(X_train.shape[0])
    np.random.shuffle(nums)
    X_train_samp = X_train.iloc[nums[:num_train]]
    y_train_samp = y_train.iloc[nums[:num_train]]


    test_nums = np.arange(X_test.shape[0])
    np.random.shuffle(test_nums)

    X_test_samp = X_test.iloc[test_nums[:num_test]]
    y_test_samp = y_test.iloc[test_nums[:num_test]]
    return X_train_samp, y_train_samp, X_test_samp, y_test_samp

def get_accuracy(preds, y_test_samp):
    return (preds == y_test_samp).sum() / preds.shape[0] 

In [366]:
def subAndSample(subset, num_train=20000, num_test=2000, comp=False):
    X = subset.drop('enc_interest', axis=1)
    y = subset.enc_interest
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    if comp:
        return X_train, y_train, X_test, y_test 
    else:
        return createSamples(X_train, y_train, X_test, y_test, num_train=num_train, num_test=num_test)

In [7]:
def standardize(X_train, X_test):
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test

In [8]:
train['enc_interest'] = train.interest_level.map({'low': 0, 'medium': 1, 'high':2})

## only baths, beds, price

In [508]:
subset = train[['bathrooms', 'bedrooms', 'price', 'enc_interest']]

In [510]:
X_train, y_train, X_test, y_test = subAndSample(subset, comp=True)

X_train, X_test = standardize(X_train, X_test)

In [511]:
dtrain = xgb.DMatrix(X_train, label= y_train)
dval = xgb.DMatrix(X_test, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 50
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.29979	val-merror:0.30078
[1]	train-merror:0.299055	val-merror:0.300577
[2]	train-merror:0.299055	val-merror:0.300577
[3]	train-merror:0.299005	val-merror:0.300375
[4]	train-merror:0.298802	val-merror:0.300172
[5]	train-merror:0.298549	val-merror:0.300172
[6]	train-merror:0.29789	val-merror:0.300071
[7]	train-merror:0.297738	val-merror:0.300071
[8]	train-merror:0.297738	val-merror:0.300071
[9]	train-merror:0.297612	val-merror:0.29997
[10]	train-merror:0.297333	val-merror:0.29997
[11]	train-merror:0.297333	val-merror:0.29997
[12]	train-merror:0.297333	val-merror:0.29997
[13]	train-merror:0.297333	val-merror:0.29997
[14]	train-merror:0.297409	val-merror:0.300172
[15]	train-merror:0.297384	val-merror:0.300172
[16]	train-merror:0.297156	val-merror:0.300577
[17]	train-merror:0.297181	val-merror:0.300274
[18]	train-merror:0.297485	val-merror:0.300274
[19]	train-merror:0.296801	val-merror:0.30078
[20]	train-merror:0.297282	val-merror:0.300679
[21]	train-merror:0.296674	val-m

In [512]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.703851058787


0.6799519294229075

In [373]:
gs.fit(X_train_concat, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=2 .....................................................
[CV] ............................ max_depth=2, score=0.688623 -  51.5s
[CV] max_depth=2 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.5s remaining:    0.0s


[CV] ............................ max_depth=2, score=0.698198 -  49.8s
[CV] max_depth=2 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s


[CV] ............................ max_depth=2, score=0.692192 -  49.6s
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.687126 - 1.0min
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.707207 - 1.0min
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.686186 - 1.1min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.685629 - 1.3min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.695195 - 1.2min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.675676 - 1.2min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multiclass:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=3)

In [226]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')

lr.fit(X_train, y_train)

preds = lr.predict(X_test)

print(get_accuracy(preds, y_test))
probs = lr.predict_proba(X_test)
get_accuracy(lr.predict(X_train), y_train)
print(log_loss(y_test, probs))

0.694357207983
0.726617534243


# Features nlp

In [9]:
train['feat_str'] = train.features.apply(lambda x: " ".join(x))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train.feat_str)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(train_counts)
train_tf = tf_transformer.transform(train_counts)


In [15]:
TfidfTransformer?

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
clf = MultinomialNB()

In [25]:
clf.fit(train_tf, train.enc_interest)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
preds.sum()

1070

In [30]:
preds = clf.predict_proba(train_tf)

In [31]:
log_loss(train.enc_interest, preds)

0.7584492522367966

In [28]:
get_accuracy(train.enc_interest, preds)

0.6948046685038094

In [77]:
subset = train[['feat_str', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])

X_train_concat = np.array(np.concatenate([a for a in X_train.values]))
X_test_concat = np.array(np.concatenate([a for a in X_test.values]))

text_clf.fit(X_train_concat, y_train.values)
preds = text_clf.predict(X_test_concat)
probs = text_clf.predict_proba(X_test_concat)
print(get_accuracy(y_test, preds))
log_loss(y_test,probs)

0.691723229663


0.76581763436551764

In [81]:
subset = train[['feat_str', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', MultinomialNB())])

X_train_concat = np.array(np.concatenate([a for a in X_train.values]))
X_test_concat = np.array(np.concatenate([a for a in X_test.values]))

text_clf.fit(X_train_concat, y_train.values)
preds = text_clf.predict(X_test_concat)
probs = text_clf.predict_proba(X_test_concat)
print(get_accuracy(y_test, preds))
log_loss(y_test,probs)

0.693344139398


0.76033893198013003

In [85]:
subset = train[['feat_str', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', MultinomialNB())])

X_train_concat = np.array(np.concatenate([a for a in X_train.values]))
X_test_concat = np.array(np.concatenate([a for a in X_test.values]))

text_clf.fit(X_train_concat, y_train.values)
preds = text_clf.predict(X_test_concat)
probs = text_clf.predict_proba(X_test_concat)
print(get_accuracy(y_test, preds))
log_loss(y_test,probs)

0.694154594266


0.76523884493207528

In [172]:
subset = train[['feat_str', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', MultinomialNB())])

X_train_concat = np.array(np.concatenate([a for a in X_train.values]))
X_test_concat = np.array(np.concatenate([a for a in X_test.values]))

text_clf.fit(X_train_concat, y_train.values)
preds = text_clf.predict(X_test_concat)
probs = text_clf.predict_proba(X_test_concat)
print(get_accuracy(y_test, preds))
log_loss(y_test,probs)

0.692128457097


0.79231916508995459

In [90]:
from sklearn.linear_model import SGDClassifier

In [171]:
subset = train[['feat_str', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-4, n_iter=10)) ])

X_train_concat = np.array(np.concatenate([a for a in X_train.values]))
X_test_concat = np.array(np.concatenate([a for a in X_test.values]))

text_clf.fit(X_train_concat, y_train.values)
preds = text_clf.predict(X_test_concat)
train_preds = text_clf.predict(X_train_concat)

print(get_accuracy(y_test, preds))
print(get_accuracy(y_train, train_preds))



0.695674197143
0.696486917758


- split to train and test
- fit svm on train
- predict svm on test
- add predictions as feature to classifier
- get accuracy

In [227]:
subset = train[['feat_str','bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)


from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-5, n_iter=5)) ])


text_clf.fit(X_train.feat_str.values, y_train.values)

preds = text_clf.predict(X_test.feat_str.values)

train_preds = text_clf.predict(X_train.feat_str.values)

X_test['svm_preds'] = preds
X_train['svm_preds'] = train_preds

X_train = X_train.drop('feat_str', axis=1)
X_test= X_test.drop('feat_str', axis=1)

X_train, X_test = standardize(X_train, X_test)

In [228]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [229]:
preds = lr.predict(X_test)
probs = lr.predict_proba(X_test)
print(get_accuracy(preds, y_test))


print(log_loss(y_test, probs))

0.69851078918
0.722678949691


In [205]:
subset = train[['feat_str','bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)


from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', MultinomialNB())])


text_clf.fit(X_train.feat_str.values, y_train.values)

preds = text_clf.predict(X_test.feat_str.values)

train_preds = text_clf.predict(X_train.feat_str.values)

X_test['svm_preds'] = preds
X_train['svm_preds'] = train_preds

X_train = X_train.drop('feat_str', axis=1)
X_test= X_test.drop('feat_str', axis=1)

X_train, X_test = standardize(X_train, X_test)

In [206]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr.fit(X_train, y_train)


preds = lr.predict(X_test)

print(get_accuracy(preds, y_test))

get_accuracy(lr.predict(X_train), y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [183]:
lr.coef_

array([[ -1.39618363e-01,  -4.36800222e-01,   5.95738072e-04,
         -4.06530331e-01],
       [  3.69696279e-02,   2.33803701e-03,   3.88956992e-05,
          1.70429378e-01],
       [  1.02648735e-01,   4.34462185e-01,  -6.34633771e-04,
          2.36100953e-01]])

In [None]:
clf', SGDClassifier(loss='hinge', penalty='l2',
...                                            alpha=1e-3, n_iter=5, random_state=42))

### Concatenate text and regular matrices

In [277]:
import scipy.sparse

from sklearn.pipeline import Pipeline
text_proc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])


subset = train[['feat_str','bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

text_proc.fit(X_train.feat_str.values)

tr_text_mat = text_proc.transform(X_train.feat_str.values)

test_text_mat = text_proc.transform(X_test.feat_str.values)

X_train =X_train.drop('feat_str', axis=1)
X_test =X_test.drop('feat_str', axis=1)
X_train, X_test = standardize(X_train, X_test)

X_train_concat = scipy.sparse.hstack((X_train,tr_text_mat))

X_test_concat = scipy.sparse.hstack((X_test, test_text_mat))

In [278]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr.fit(X_train_concat, y_train)

preds = lr.predict(X_test_concat)

print(get_accuracy(preds, y_test))

get_accuracy(lr.predict(X_train_concat), y_train)

0.706716644717


0.71573668346799724

###  Xgboost with concatenated features

In [297]:
import scipy.sparse

from sklearn.pipeline import Pipeline
text_proc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])


subset = train[['feat_str','bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

text_proc.fit(X_train.feat_str.values)

tr_text_mat = text_proc.transform(X_train.feat_str.values)

test_text_mat = text_proc.transform(X_test.feat_str.values)

X_train =X_train.drop('feat_str', axis=1)
X_test =X_test.drop('feat_str', axis=1)
X_train, X_test = standardize(X_train, X_test)

X_train_concat = scipy.sparse.hstack((X_train,tr_text_mat))

X_test_concat = scipy.sparse.hstack((X_test, test_text_mat))

In [298]:
dtrain = xgb.DMatrix(X_train_concat, label= y_train)
dval = xgb.DMatrix(X_test_concat, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 15
bst = xgb.train(param, dtrain, num_round, watchlist)

preds = bst.predict(dval)

print(log_loss(y_test, preds))

log_loss(y_train, bst.predict(dtrain))

[0]	train-merror:0.294344	val-merror:0.293385
[1]	train-merror:0.292622	val-merror:0.292574
[2]	train-merror:0.289405	val-merror:0.291764
[3]	train-merror:0.288189	val-merror:0.291257
[4]	train-merror:0.285606	val-merror:0.291662
[5]	train-merror:0.282237	val-merror:0.289535
[6]	train-merror:0.281224	val-merror:0.288927
[7]	train-merror:0.279856	val-merror:0.288623
[8]	train-merror:0.279046	val-merror:0.288623
[9]	train-merror:0.278286	val-merror:0.288319
[10]	train-merror:0.277222	val-merror:0.286901
[11]	train-merror:0.275044	val-merror:0.286496
[12]	train-merror:0.274107	val-merror:0.2868
[13]	train-merror:0.273524	val-merror:0.286394
[14]	train-merror:0.272967	val-merror:0.285989


Settings that seem best so far:
- use_idf =True
- no stopwords
- ngram range 2 or 3

Xgboost on text features concatenated with regular features seems like the way to go

### Description

In [307]:
from sklearn.pipeline import Pipeline
text_proc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])



In [308]:
subset = train[['description','bathrooms', 'bedrooms', 'price', 'enc_interest']]

In [309]:
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)

text_proc.fit(X_train.description.values)

tr_text_mat = text_proc.transform(X_train.description.values)

test_text_mat = text_proc.transform(X_test.description.values)

X_train =X_train.drop('description', axis=1)
X_test =X_test.drop('description', axis=1)
X_train, X_test = standardize(X_train, X_test)

X_train_concat = scipy.sparse.hstack((X_train,tr_text_mat))

X_test_concat = scipy.sparse.hstack((X_test, test_text_mat))

In [314]:
dtrain = xgb.DMatrix(X_train_concat, label= y_train)
dval = xgb.DMatrix(X_test_concat, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 15
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.297688	val-merror:0.294803
[1]	train-merror:0.295307	val-merror:0.295715
[2]	train-merror:0.293584	val-merror:0.29612
[3]	train-merror:0.28943	val-merror:0.294803
[4]	train-merror:0.285758	val-merror:0.293283
[5]	train-merror:0.284162	val-merror:0.293081
[6]	train-merror:0.283174	val-merror:0.29146
[7]	train-merror:0.279628	val-merror:0.292676
[8]	train-merror:0.277551	val-merror:0.290852
[9]	train-merror:0.275474	val-merror:0.289535
[10]	train-merror:0.272764	val-merror:0.288623
[11]	train-merror:0.2717	val-merror:0.287813
[12]	train-merror:0.269978	val-merror:0.28913
[13]	train-merror:0.268154	val-merror:0.288826
[14]	train-merror:0.264887	val-merror:0.287711


In [315]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.664866422157


0.6263888617674026

### Features and Description

In [316]:
import scipy.sparse

from sklearn.pipeline import Pipeline
text_proc_feat = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])
text_proc_desc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])


subset = train[['feat_str', 'description', 'bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, X_test, y_train, y_test = subAndSample(subset, comp=True)



text_proc_feat.fit(X_train.feat_str.values)
tr_feat_text = text_proc.transform(X_train.feat_str.values)
test_feat_text = text_proc.transform(X_test.feat_str.values)


text_proc_desc.fit(X_train.description.values)
tr_desc_mat = text_proc_desc.transform(X_train.description.values)
test_desc_mat = text_proc_desc.transform(X_test.description.values)


X_train = X_train.drop(['feat_str', 'description'], axis=1)
X_test = X_test.drop(['feat_str', 'description'], axis=1)
X_train, X_test = standardize(X_train, X_test)


X_train_concat = scipy.sparse.hstack((X_train,tr_feat_text, tr_desc_mat))
X_test_concat = scipy.sparse.hstack((X_test, test_feat_text, test_desc_mat))

In [321]:
dtrain = xgb.DMatrix(X_train_concat, label= y_train)
dval = xgb.DMatrix(X_test_concat, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 10
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.294952	val-merror:0.294296
[1]	train-merror:0.293534	val-merror:0.293283
[2]	train-merror:0.291431	val-merror:0.291865
[3]	train-merror:0.286416	val-merror:0.289738
[4]	train-merror:0.283655	val-merror:0.287813
[5]	train-merror:0.276817	val-merror:0.285179
[6]	train-merror:0.274157	val-merror:0.28295
[7]	train-merror:0.27127	val-merror:0.282646
[8]	train-merror:0.268965	val-merror:0.282747
[9]	train-merror:0.26742	val-merror:0.280215


In [322]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.644885487823


0.6205642350065651

In [323]:
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 15
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.294952	val-merror:0.294296
[1]	train-merror:0.293534	val-merror:0.293283
[2]	train-merror:0.291431	val-merror:0.291865
[3]	train-merror:0.286416	val-merror:0.289738
[4]	train-merror:0.283655	val-merror:0.287813
[5]	train-merror:0.276817	val-merror:0.285179
[6]	train-merror:0.274157	val-merror:0.28295
[7]	train-merror:0.27127	val-merror:0.282646
[8]	train-merror:0.268965	val-merror:0.282747
[9]	train-merror:0.26742	val-merror:0.280215
[10]	train-merror:0.26514	val-merror:0.280924
[11]	train-merror:0.263646	val-merror:0.280417
[12]	train-merror:0.261645	val-merror:0.280113
[13]	train-merror:0.259365	val-merror:0.277378
[14]	train-merror:0.25744	val-merror:0.276872


In [324]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.63915561909


0.6025585486170203

In [326]:
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 25
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.294952	val-merror:0.294296
[1]	train-merror:0.293534	val-merror:0.293283
[2]	train-merror:0.291431	val-merror:0.291865
[3]	train-merror:0.286416	val-merror:0.289738
[4]	train-merror:0.283655	val-merror:0.287813
[5]	train-merror:0.276817	val-merror:0.285179
[6]	train-merror:0.274157	val-merror:0.28295
[7]	train-merror:0.27127	val-merror:0.282646
[8]	train-merror:0.268965	val-merror:0.282747
[9]	train-merror:0.26742	val-merror:0.280215
[10]	train-merror:0.26514	val-merror:0.280924
[11]	train-merror:0.263646	val-merror:0.280417
[12]	train-merror:0.261645	val-merror:0.280113
[13]	train-merror:0.259365	val-merror:0.277378
[14]	train-merror:0.25744	val-merror:0.276872
[15]	train-merror:0.256123	val-merror:0.276264
[16]	train-merror:0.254831	val-merror:0.274643
[17]	train-merror:0.252982	val-merror:0.274339
[18]	train-merror:0.251159	val-merror:0.274643
[19]	train-merror:0.249842	val-merror:0.274643
[20]	train-merror:0.24893	val-merror:0.274238
[21]	train-merror:0.246473	va

In [327]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.634272339866


0.5737917812620611

In [333]:
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 55
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.294952	val-merror:0.294296
[1]	train-merror:0.293534	val-merror:0.293283
[2]	train-merror:0.291431	val-merror:0.291865
[3]	train-merror:0.286416	val-merror:0.289738
[4]	train-merror:0.283655	val-merror:0.287813
[5]	train-merror:0.276817	val-merror:0.285179
[6]	train-merror:0.274157	val-merror:0.28295
[7]	train-merror:0.27127	val-merror:0.282646
[8]	train-merror:0.268965	val-merror:0.282747
[9]	train-merror:0.26742	val-merror:0.280215
[10]	train-merror:0.26514	val-merror:0.280924
[11]	train-merror:0.263646	val-merror:0.280417
[12]	train-merror:0.261645	val-merror:0.280113
[13]	train-merror:0.259365	val-merror:0.277378
[14]	train-merror:0.25744	val-merror:0.276872
[15]	train-merror:0.256123	val-merror:0.276264
[16]	train-merror:0.254831	val-merror:0.274643
[17]	train-merror:0.252982	val-merror:0.274339
[18]	train-merror:0.251159	val-merror:0.274643
[19]	train-merror:0.249842	val-merror:0.274643
[20]	train-merror:0.24893	val-merror:0.274238
[21]	train-merror:0.246473	va

In [334]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.634050668408


0.51333511762367889

### Cross Val Xgboost

In [335]:
from xgboost import XGBClassifier

In [342]:
xgbc = XGBClassifier(objective='multiclass:softprob')

In [343]:
xgbc.fit?

In [348]:
from sklearn.grid_search import GridSearchCV



In [369]:
import scipy.sparse

from sklearn.pipeline import Pipeline
text_proc_feat = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])
text_proc_desc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ('tfidf', TfidfTransformer(use_idf=True)) ])


subset = train[['feat_str', 'description', 'bathrooms', 'bedrooms', 'price', 'enc_interest']]
X_train, y_train, X_test, y_test = subAndSample(subset, num_train=2000)


In [371]:
text_proc_feat.fit(X_train.feat_str.values)
tr_feat_text = text_proc.transform(X_train.feat_str.values)



text_proc_desc.fit(X_train.description.values)
tr_desc_mat = text_proc_desc.transform(X_train.description.values)

X_train = X_train.drop(['feat_str', 'description'], axis=1)

X_train = StandardScaler().fit_transform(X_train)


X_train_concat = scipy.sparse.hstack((X_train,tr_feat_text, tr_desc_mat))


In [372]:
gs = GridSearchCV(estimator=XGBClassifier(objective='multiclass:softprob'), param_grid=params, verbose=3)

In [373]:
gs.fit(X_train_concat, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=2 .....................................................
[CV] ............................ max_depth=2, score=0.688623 -  51.5s
[CV] max_depth=2 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.5s remaining:    0.0s


[CV] ............................ max_depth=2, score=0.698198 -  49.8s
[CV] max_depth=2 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s


[CV] ............................ max_depth=2, score=0.692192 -  49.6s
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.687126 - 1.0min
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.707207 - 1.0min
[CV] max_depth=4 .....................................................
[CV] ............................ max_depth=4, score=0.686186 - 1.1min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.685629 - 1.3min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.695195 - 1.2min
[CV] max_depth=6 .....................................................
[CV] ............................ max_depth=6, score=0.675676 - 1.2min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multiclass:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=3)

In [374]:
params = {'max_depth':[6,8,10]}

In [375]:
gs = GridSearchCV(estimator=XGBClassifier(objective='multiclass:softprob'), param_grid=params, verbose=3)

## Location

In [386]:
train['address_lower'] = train.display_address.apply(lambda x: x.lower())

In [388]:
train.address_lower.nunique()

8630

In [393]:
addr = train.groupby('address_lower').enc_interest.count().sort_values(ascending=False)

In [408]:
addr[:300].sum()

24450

In [462]:
addr_df = pd.DataFrame(np.zeros((train.shape[0],50)), index=train.index, columns = addr.index.values[:50])

In [463]:
num_ops = train.shape[0] * 50

In [464]:
num_ops

2467600

In [468]:
i = 0
for r,v in train.iterrows():
    i+=1
    if i % 1000 == 0:
        print(i)
    for col in addr_df.columns:
        if v[-1] == col:
            addr_df.loc[r,col] = 1


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000


In [471]:
addr_sparse = scipy.sparse.csr_matrix(addr_df.values)

In [478]:
train_concat = pd.concat([train, addr_df], axis=1)

In [None]:
train_concat = train.concat

In [479]:
train.columns

to_drop = ['building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'street_address', 'feat_str', 'address_lower']

train_concat = train_concat.drop(to_drop, axis=1)

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'enc_interest', 'feat_str', 'address_lower'],
      dtype='object')

In [484]:
train_concat.shape

(49352, 54)

In [521]:
subset= train[['bathrooms', 'bedrooms', 'price','enc_interest', 'address_lower']]

In [522]:
X_train, y_train, X_test, y_test = subAndSample(subset, comp=True)

In [524]:
h = FeatureHasher(input_type='string')

hashed = h.fit_transform(X_train.address_lower)


In [526]:
test_hashed = h.transform(X_test.address_lower)

In [530]:
X_train = X_train.drop('address_lower', axis=1)
X_test = X_test.drop('address_lower', axis=1)

In [532]:
X_train, X_test = standardize(X_train, X_test)

In [535]:
X_train_concat = scipy.sparse.hstack((X_train,hashed))

In [536]:
X_test_concat = scipy.sparse.hstack((X_test, test_hashed))

In [542]:
dtrain = xgb.DMatrix(X_train_concat, label= y_train)
dval = xgb.DMatrix(X_test_concat, label=y_test)

# specify parameters via map
watchlist = [(dtrain,'train'), (dval,'val')]
param = {'max_depth':3, 'eta':1, 'silent':1, 'num_class': 3, 'objective':'multi:softprob' }
num_round = 20
bst = xgb.train(param, dtrain, num_round, watchlist)


[0]	train-merror:0.299511	val-merror:0.300071
[1]	train-merror:0.296649	val-merror:0.299058
[2]	train-merror:0.296269	val-merror:0.298551
[3]	train-merror:0.2948	val-merror:0.296525
[4]	train-merror:0.294775	val-merror:0.297032
[5]	train-merror:0.294116	val-merror:0.296626
[6]	train-merror:0.294192	val-merror:0.29612
[7]	train-merror:0.29328	val-merror:0.296323
[8]	train-merror:0.293052	val-merror:0.297842
[9]	train-merror:0.293255	val-merror:0.298146
[10]	train-merror:0.292774	val-merror:0.29845
[11]	train-merror:0.291786	val-merror:0.29845
[12]	train-merror:0.291431	val-merror:0.299159
[13]	train-merror:0.290798	val-merror:0.298551
[14]	train-merror:0.290646	val-merror:0.297842
[15]	train-merror:0.289658	val-merror:0.298247
[16]	train-merror:0.289836	val-merror:0.298247
[17]	train-merror:0.289506	val-merror:0.298754
[18]	train-merror:0.288443	val-merror:0.297842
[19]	train-merror:0.287759	val-merror:0.298146


In [543]:
preds = bst.predict(dval)
print(log_loss(y_test, preds))
log_loss(y_train, bst.predict(dtrain))

0.689737723406


0.66608877414496059