In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict class probabilities for ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training matrix and labels, handed to ``xgb.DMatrix``.
        NOTE(review): labels are presumably integers in {0, 1, 2} because
        ``num_class`` is 3 -- confirm against the caller.
    test_X
        Matrix to predict on.
    test_y
        Optional labels for ``test_X``. When given, ``test_X`` doubles as a
        validation set: the eval metric is logged every 25 rounds and training
        stops early after 20 rounds without improvement.
    feature_names
        Optional list of column names forwarded to both DMatrix objects.
        ``None`` (the default) keeps XGBoost's automatic ``f0, f1, ...`` names.
    seed_val
        Random seed for XGBoost. The default ``0`` keeps the seed 8088 that
        this function has always used, so existing results are unchanged;
        any non-zero value is used as given.
    num_rounds
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the output of ``model.predict`` on
        ``test_X`` and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        # Previously hard-coded to 8088 regardless of seed_val.
        'seed': seed_val if seed_val else 8088,
    }
    plst = list(param.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test set: use it as a watchlist entry so early stopping can
        # monitor the eval metric on it.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=20, verbose_eval=25)
    else:
        # No labels: train for the full number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
# Read the train and test listings from JSON and report their shapes.
data_path = "../input/"
train_file = os.path.join(data_path, "train.json")
test_file = os.path.join(data_path, "test.json")

train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)

for loaded_frame in (train_df, test_df):
    print(loaded_frame.shape)

In [4]:
# Columns taken from the listings as-is; later cells append engineered ones.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
]

In [5]:
# Build the same engineered columns on both frames so train and test stay in sync.
for frame in (train_df, test_df):
    # Sizes of the list-valued columns.
    frame["num_photos"] = frame["photos"].apply(len)
    frame["num_features"] = frame["features"].apply(len)

    # Number of pieces the description splits into on single spaces.
    frame["num_description_words"] = frame["description"].apply(
        lambda text: len(text.split(" "))
    )

    # Parse the timestamp once, then pull out its calendar parts.
    frame["created"] = pd.to_datetime(frame["created"])
    for part in ("year", "month", "day", "hour"):
        frame["created_" + part] = getattr(frame["created"].dt, part)

    # Bedroom count is clipped to at least 1 so zero-bedroom rows do not divide by zero.
    frame["price_per_bed"] = frame["price"] / frame["bedrooms"].clip(lower=1)

    # Timestamp truncated to day resolution and stored as float32.
    frame["created_date"] = np.array(
        frame.created.values, dtype='datetime64[D]'
    ).astype(np.float32)

# Register the new columns (plus listing_id) as model inputs.
features_to_use += [
    "num_photos", "num_features", "num_description_words", "created_year",
    "created_month", "created_day", "listing_id", "created_hour",
    "price_per_bed", "created_date",
]

In [6]:
# Integer-encode the string-valued columns with one encoder per column,
# fitted on train and test values together so both share the same codes.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if train_df[f].dtype != 'object':
        # Not an object column: leave it alone and do not register it.
        continue
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_df[f].tolist() + test_df[f].tolist())
    train_df[f] = lbl.transform(train_df[f].tolist())
    test_df[f] = lbl.transform(test_df[f].tolist())
    features_to_use.append(f)

In [7]:
def _feature_tags_to_text(tags):
    """Join a listing's tags into one string, one token per tag.

    Spaces inside a tag become underscores so a multi-word tag stays a
    single whitespace-delimited token.
    """
    return " ".join(tag.replace(" ", "_") for tag in tags)


train_df['features'] = train_df["features"].apply(_feature_tags_to_text)
test_df['features'] = test_df["features"].apply(_feature_tags_to_text)
print(train_df["features"].head())

# Despite the variable name this is a CountVectorizer, fitted on train only.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tfidf.fit(train_df["features"])

In [8]:
# Keep every registered feature whose name does not contain 'manager_id'.
features_to_use = [name for name in features_to_use if 'manager_id' not in name]

In [9]:
# Hold out part of the training data, then turn each frame's feature text
# into count vectors with the fitted vectorizer.
train_df_tr, train_df_te = train_test_split(train_df, random_state=0)
tr_sparse_tr, tr_sparse_te, te_sparse = (
    tfidf.transform(frame["features"])
    for frame in (train_df_tr, train_df_te, test_df)
)

In [10]:
# Per-manager interest-level statistics, computed from the training fold only.
manager_stat_cols = ['high_frac', 'low_frac', 'medium_frac', 'manager_skill']

# Fraction of each interest level per manager. add_suffix renames by name
# (high -> high_frac, ...), so it does not depend on the dummy column order.
temp = pd.concat(
    [train_df_tr.manager_id, pd.get_dummies(train_df_tr.interest_level)], axis=1
).groupby('manager_id').mean().add_suffix('_frac')

# Listings per manager. size() counts rows directly, so it does not depend
# on which column sits at a given position or on that column having no nulls.
temp['count'] = train_df_tr.groupby('manager_id').size()

temp['manager_skill'] = temp['high_frac'] * 2 + temp['medium_frac']

# Managers with fewer than 20 listings get the average statistics of the
# managers that have at least 20.
unranked_managers_ixes = temp['count'] < 20
ranked_managers_ixes = ~unranked_managers_ixes
mean_values = temp.loc[ranked_managers_ixes, manager_stat_cols].mean()
temp.loc[unranked_managers_ixes, manager_stat_cols] = mean_values.values


def _attach_manager_stats(frame, stats, fallback):
    """Left-merge per-manager ``stats`` onto ``frame`` by ``manager_id``.

    Columns left over from an earlier run of this cell are dropped first, so
    re-running it does not produce ``_x``/``_y`` suffixed duplicates. Rows
    whose manager is absent from ``stats`` get the ``fallback`` values.

    Returns ``(merged_frame, unseen_mask)``, where ``unseen_mask`` flags the
    rows that had no match before the fallback was applied.
    """
    stale = [col for col in stats.columns if col in frame.columns]
    merged = frame.drop(columns=stale).merge(
        stats.reset_index(), how='left', on='manager_id')
    unseen = merged['high_frac'].isnull()
    if unseen.any():
        merged.loc[unseen, manager_stat_cols] = fallback.values
    return merged, unseen


train_df_tr, _ = _attach_manager_stats(train_df_tr, temp, mean_values)
train_df_te, new_manager_ixes = _attach_manager_stats(train_df_te, temp, mean_values)
test_df, new_manager_ixes = _attach_manager_stats(test_df, temp, mean_values)

In [11]:
# Register the per-manager statistics as model inputs.
features_to_use += ['high_frac', 'low_frac', 'medium_frac', 'manager_skill']

In [12]:
# Explicit, ordered list of model inputs. This replaces whatever the earlier
# cells accumulated in features_to_use.
features_to_use = [
    # raw listing attributes
    'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price',
    # size counts
    'num_photos', 'num_features', 'num_description_words',
    # calendar parts of the creation time, plus the listing id
    'created_year', 'created_month', 'created_day', 'listing_id', 'created_hour',
    # derived numerics
    'price_per_bed', 'created_date',
    # label-encoded categoricals
    'display_address', 'manager_id', 'building_id', 'street_address',
    # per-manager statistic
    'manager_skill',
]

In [13]:
def _stack_features(frame, bag_of_words):
    """Place the selected columns of ``frame`` next to its count vectors, as CSR."""
    return sparse.hstack([frame[features_to_use], bag_of_words]).tocsr()


train_X_tr = _stack_features(train_df_tr, tr_sparse_tr)
train_X_te = _stack_features(train_df_te, tr_sparse_te)
test_X = _stack_features(test_df, te_sparse)

# One placeholder name per count-vector column.
features_sparse = ['features_' + str(col) for col in range(tr_sparse_tr.shape[1])]

# Encode the interest level as an integer class id.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y_tr = np.array(train_df_tr['interest_level'].apply(target_num_map.__getitem__))
train_y_te = np.array(train_df_te['interest_level'].apply(target_num_map.__getitem__))
print(train_X_tr.shape, train_X_te.shape, test_X.shape)

In [14]:
# Fit on the training fold, predict the held-out fold, and print its log loss.
preds, model = runXGB(train_X_tr, train_y_tr, test_X=train_X_te, test_y=train_y_te)
print(log_loss(train_y_te, preds))

In [15]:
# Table of the scores reported by the booster, keyed by its own feature ids.
fscores = model.get_fscore()
df_features = pd.DataFrame(fscores, index=['score']).T.reset_index()

# The ids are parsed as one prefix character followed by the column position.
df_features['f_index'] = df_features['index'].map(lambda key: int(key[1:]))

# Map positions back to readable names: dense columns first, then count-vector columns.
all_feature_names = np.array(features_to_use + features_sparse)
df_features['f_name'] = all_feature_names[df_features['f_index'].values]

# Bar chart of the 30 highest-scoring features.
ranked_scores = pd.Series(
    data=df_features['score'].values,
    index=df_features['f_name'].values,
).sort_values()
ranked_scores[-30:].plot(kind='bar')

In [16]:
# Disabled submission step: the code below sits inside a string literal, so
# it is not executed; the cell only evaluates to that string.
# NOTE(review): it references train_X and train_y, which no visible cell
# defines -- they would have to be built before this could be re-enabled.
'''preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("0315.csv", index=False)'''

In [29]:
# Keep the rows whose name starts with 'features', highest score first.
is_tag_feature = df_features.f_name.str.startswith('features')
df_ff = (
    df_features[is_tag_feature]
    .sort_values(['score'], ascending=False)
    .reset_index(drop=True)
)


In [33]:
# Look up the vocabulary token behind each count-vector column.
# Newer scikit-learn exposes get_feature_names_out; fall back to
# get_feature_names where the newer method is not available.
_vocab_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# Count-vector columns sit after the dense columns in the stacked matrix, so
# the offset is the number of dense features (previously hard-coded as 20).
# This assumes features_to_use is unchanged since the matrix was built.
_token_positions = (df_ff.f_index - len(features_to_use)).values
df_ff['f_text'] = np.array(_vocab_getter())[_token_positions]
df_ff.head(30)['f_text'].values