In [None]:
import os
import sys
import pandas as pd

In [None]:
# Read the train and test listing tables from the competition input directory.
train_df, test_df = (
    pd.read_json(json_path)
    for json_path in ("../input/train.json", "../input/test.json")
)
# Report the (rows, columns) shape of each frame: train first, then test, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Preview the first five training rows (rendered as the cell's output).
train_df.head(n=5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Number of training listings per interest level.
int_level = train_df['interest_level'].value_counts()

plt.figure(figsize=(8,4))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.barplot(x=int_level.index, y=int_level.values, alpha = 0.8, color = sns.color_palette()[1])
plt.ylabel('Number of Occurrences', fontsize = 12)
plt.xlabel('Interest level', fontsize = 12)
plt.show()

In [None]:
# Listing counts per bedroom count, with one bar per interest level.
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x='bedrooms', hue='interest_level')
plt.xlabel('bedrooms', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
import numpy as np
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
import xgboost as xgb

In [None]:
# Numeric columns fed to the model as-is.
features_to_use = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0;
# it returns the same NumPy array and is available on old and new pandas alike.
train_X = train_df[features_to_use].values
test_X = test_df[features_to_use].values

# Encode the target as class ids; the order here fixes the column order of the
# predicted probability matrix (column 0 = high, 1 = medium, 2 = low).
target_num_map = {'high':0, 'medium':1, 'low':2}
# An interest level missing from the map raises KeyError rather than producing NaN.
train_y = np.array(train_df['interest_level'].apply(lambda x:target_num_map[x]))
print(train_X.shape, test_X.shape)

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost ``multi:softprob`` model and predict on ``test_X``.

    Args:
        train_X: Training feature matrix, passed to ``xgb.DMatrix``.
        train_y: Training labels, passed as the ``DMatrix`` label.
        test_X: Feature matrix to predict on.
        test_y: Optional labels for ``test_X``. When given, ``test_X`` is also
            used as an evaluation set and training stops early after 20 rounds
            without improvement; otherwise exactly ``num_rounds`` rounds run.
        feature_names: Optional list of column names attached to both the
            training and the test ``DMatrix``.
        seed_val: Value for the booster's ``seed`` parameter.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        A ``(pred_test_y, model)`` tuple: the result of ``model.predict`` on
        ``test_X`` and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        # `verbosity` supersedes the deprecated `silent` flag, which current
        # XGBoost releases report as an unused parameter on every call.
        'verbosity': 0,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    # The same feature names go on both matrices so that training and
    # prediction agree on the column naming.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        # The last watchlist entry ('test') is the one early stopping monitors.
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(param, xgtrain, num_boost_round=num_rounds,
                          evals=watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(param, xgtrain, num_boost_round=num_rounds)

    # NOTE(review): after early stopping, whether predict() uses the best
    # iteration or every trained round depends on the installed XGBoost
    # version - confirm before relying on the validation score.
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [None]:
# 5-fold shuffled split with a fixed seed; only the first fold is evaluated
# because the loop exits after one iteration.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2014)
cv_scores = []
for dev_index, val_index in kf.split(train_X):
    # Development (fit) rows and held-out validation rows for this fold.
    dev_X, dev_y = train_X[dev_index], train_y[dev_index]
    val_X, val_y = train_X[val_index], train_y[val_index]
    # Passing the validation labels makes runXGB monitor this fold while training.
    preds, model = runXGB(dev_X, dev_y, val_X, test_y=val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    break

In [None]:
# Fit on the full training set for a fixed 400 rounds and predict the test set.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
# One probability column per class, followed by the listing id of each test row.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"]).assign(
    listing_id=test_df["listing_id"].values
)
out_df.to_csv("xgb.starter.csv", index=False)