In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

from scipy import sparse
import string
from bs4 import BeautifulSoup

from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train = pd.read_json('data/train.json')
test = pd.read_json('data/test.json')

In [3]:
train[:1]

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue


### Generate features 

In [4]:
def clear_description(desc):
    soup = BeautifulSoup(desc, 'html.parser')
    return soup.get_text(separator=' ')

def generate_features(data):
    data['num_photos'] = data['photos'].apply(len)
    data['num_features'] = data['features'].apply(len)
    data['num_description_words'] = data['description'].apply(lambda x: len(x.split(" ")))
    data['created'] = pd.to_datetime(data['created'])

    ### Separate date and time
    data['created_year'] = data['created'].dt.year
    data['created_month'] = data['created'].dt.month
    data['created_day'] = data['created'].dt.day
    data['created_dow'] = data['created'].dt.dayofweek
    data['created_hour'] = data['created'].dt.hour
    
    ## Clean description from html tags
    data['description'] = data['description'].apply(clear_description)
    
    ## Prepare `features` column
    data['features'] = data["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
    
    if 'interest_level' in data.columns:
        target_num_map = {'high':0, 'medium':1, 'low':2}
        data.interest_level = data.interest_level.apply(lambda x: target_num_map[x])

In [5]:
%%time
generate_features(test)
generate_features(train)

  'Beautiful Soup.' % markup)


CPU times: user 28.3 s, sys: 335 ms, total: 28.6 s
Wall time: 28.8 s


#### Encode categorical  

In [6]:
encoder = LabelEncoder()
categorical = ["building_id", "display_address", "manager_id", "street_address"]

for c in categorical:
    encoder.fit(list(train[c].values) + list(test[c].values))
    train[c] = encoder.transform(train[c])
    test[c] = encoder.transform(test[c])

#### Vectorize description and features 

In [94]:
%%time
vectorizer = CountVectorizer(stop_words='english', max_features=200)
tr_desc = vectorizer.fit_transform(train['description'])
te_desc = vectorizer.transform(test['description'])

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_features = tfidf.fit_transform(train["features"])
te_features = tfidf.transform(test["features"])

CPU times: user 12.9 s, sys: 665 ms, total: 13.5 s
Wall time: 13.5 s


In [70]:
features_to_use = ['bathrooms', 'bedrooms', 'building_id', 'display_address', 
                'latitude', 'longitude', 'manager_id', 'price', 'num_photos',
                'num_features', 'num_description_words', 'created_month',
                'created_day', 'created_dow', 'created_hour']

In [9]:
train[features_to_use].head()

Unnamed: 0,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,num_photos,num_features,num_description_words,created_month,created_day,created_dow,created_hour
10,1.5,3,3797,12282,40.7145,-73.9425,1568,3000,5,0,95,6,24,4,7
10000,1.0,2,8986,9080,40.7947,-73.9667,1988,5465,11,5,9,6,12,6,12
100004,1.0,1,8889,13719,40.7388,-74.0018,3733,2850,8,4,94,4,17,6,3
100007,1.0,1,1848,10866,40.7539,-73.9677,282,3275,3,2,80,4,18,0,2
100013,1.0,4,0,15072,40.8241,-73.9493,2618,3350,3,1,68,4,28,3,1


#### Prepare data to fit 

Merge sparse matrix with other features

In [101]:
train_X = sparse.hstack([train[features_to_use], tr_features]).tocsr()
test_X = sparse.hstack([test[features_to_use], te_features]).tocsr()

# train_X = train[features_to_use]
# test_X = test[features_to_use]

In [102]:
xtr, xcv, ytr, ycv = train_test_split(train_X, train.interest_level, test_size = 0.3, random_state=42)

### 1. Random Forest 

In [121]:
%%time
rf = RandomForestClassifier(n_estimators=1000, oob_score=True)
scores = -cross_val_score(rf, train_X, train.interest_level, scoring='neg_log_loss', cv=3)

CPU times: user 2min 11s, sys: 3.47 s, total: 2min 14s
Wall time: 2min 16s


In [122]:
scores.mean()

0.608686444494284

***Best:*** `0.608686444494284` 

#### Predict 

In [108]:
%%time
rf.fit(xtr, ytr)
rf_preds = rf.predict_proba(test[num_feats])

CPU times: user 1min 5s, sys: 3.19 s, total: 1min 8s
Wall time: 1min 8s


In [109]:
rf_preds[:3]

array([[ 0.065,  0.348,  0.587],
       [ 0.167,  0.426,  0.407],
       [ 0.051,  0.192,  0.757]])

### 2. Logistic Regression on vectorized `description` and `features`  columns

In [77]:
lr = LogisticRegression(
    penalty='l2',
    dual=True,
    solver='liblinear'
)

ovr = OneVsRestClassifier(lr)

In [22]:
%%time
scores_desc = -cross_val_score(ovr, tr_desc, train.interest_level, scoring='neg_log_loss', cv=3)

CPU times: user 13.7 s, sys: 190 ms, total: 13.9 s
Wall time: 13.9 s


In [23]:
scores_desc.mean()

0.79238603515857242

### 3. XGBoost 

In [106]:
# Set params
import xgboost as xgb

params = {"objective": "multi:softprob",
          "num_class": 3,
          "booster": "gbtree",
          "eta": 0.02,
          "max_depth": 6,
          "subsample": 0.9,
          "min_child_weight": 1,
          "colsample_bytree": 0.7,
          "silent": 1,
          "eval_metric": 'mlogloss',
          "seed": 0
          }
num_trees = 2000
stop = 20

In [103]:
dtrain = xgb.DMatrix(xtr, label=ytr)
dvalid = xgb.DMatrix(xcv, label=ycv)
dtest = xgb.DMatrix(test_X)

In [107]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

[0]	train-mlogloss:1.08697	eval-mlogloss:1.08727
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.07468	eval-mlogloss:1.07539
[2]	train-mlogloss:1.06295	eval-mlogloss:1.06397
[3]	train-mlogloss:1.05223	eval-mlogloss:1.05358
[4]	train-mlogloss:1.04177	eval-mlogloss:1.04349
[5]	train-mlogloss:1.03148	eval-mlogloss:1.03345
[6]	train-mlogloss:1.02175	eval-mlogloss:1.024
[7]	train-mlogloss:1.01224	eval-mlogloss:1.01476
[8]	train-mlogloss:1.00233	eval-mlogloss:1.00508
[9]	train-mlogloss:0.992665	eval-mlogloss:0.995631
[10]	train-mlogloss:0.983381	eval-mlogloss:0.986627
[11]	train-mlogloss:0.974509	eval-mlogloss:0.978037
[12]	train-mlogloss:0.965581	eval-mlogloss:0.969343
[13]	train-mlogloss:0.95666	eval-mlogloss:0.96064
[14]	train-mlogloss:0.948341	eval-mlogloss:0.952541
[15]	train-mlogloss:0.94077	eval-mlogloss:0.94523
[16]	train-mlogloss:0.933062	eval-mlogloss:0.937769

***Best***: `(train-auc:0.353312, eval-auc:0.550563 [max_depth:6])` - ***leaderboard***: `0.56061`

`train-mlogloss:0.344899	eval-mlogloss:0.54977`

In [91]:
xgb_preds = gbm.predict(dtest)
indices = xgb_preds < 0
xgb_preds[indices] = 0
xgb_preds[:-5]

array([[  6.25390559e-02,   2.98715234e-01,   6.38745725e-01],
       [  8.87128562e-02,   1.43057719e-01,   7.68229425e-01],
       [  1.33674983e-02,   1.35355309e-01,   8.51277173e-01],
       ..., 
       [  6.11754193e-04,   1.94060132e-01,   8.05328071e-01],
       [  1.64191779e-02,   9.16121379e-02,   8.91968727e-01],
       [  8.36444451e-05,   2.17441167e-03,   9.97741938e-01]], dtype=float32)

### Submit 

In [92]:
submit = pd.DataFrame(columns=['listing_id', 'high', 'medium', 'low'])
submit['listing_id'] = test['listing_id']
submit[['high', 'medium', 'low']] = xgb_preds
submit.to_csv("data/submits/xgb_with_features_100_best.csv", index=False)

In [93]:
submit.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.062539,0.298715,0.638746
1,7210040,0.088713,0.143058,0.768229
100,7103890,0.013367,0.135355,0.851277
1000,7143442,0.042551,0.356995,0.600454
100000,6860601,0.031233,0.202682,0.766085
