In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import preprocessing

In [None]:
# Load the training listings. The file is parsed as ONE JSON document with
# json.load, so it works whether the document is minified onto a single line or
# pretty-printed over many lines (parsing line-by-line only worked for the former).
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform default.
with open("../input/train.json", encoding = "utf-8") as f:
    df_train = pd.DataFrame(json.load(f))

In [None]:
# Load the test listings. The file is parsed as ONE JSON document with json.load,
# so it works whether the document is minified onto a single line or pretty-printed
# over many lines (parsing line-by-line only worked for the former).
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform default.
with open("../input/test.json", encoding = "utf-8") as f:
    df_test = pd.DataFrame(json.load(f))

In [None]:
# Encoder that maps the interest-level strings to integer class ids.
# NOTE: LabelEncoder stores its classes in sorted order, so the ids follow the
# alphabetical order of the labels (see le.classes_), not the order listed here.
interest_levels = ["low", "medium", "high"]
le = preprocessing.LabelEncoder()
le.fit(interest_levels)

In [None]:
def build_features(df, has_label = True):
    """Derive the numeric model features from a raw listings frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``listing_id``, ``bathrooms``, ``bedrooms``,
        ``price`` and ``photos`` (each ``photos`` entry is sized with ``len``).
        ``interest_level`` is also required when ``has_label`` is true.
    has_label : bool, default True
        Keep the ``interest_level`` target column in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``listing_id``, ``bathrooms``, ``bedrooms``,
        ``nb_photos``, ``price``, optionally ``interest_level``, followed by
        ``price_per_bathroom``, ``price_per_bedroom``, ``at_least_one_bathroom``
        and ``at_least_one_bedroom``. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    kept = ["listing_id", "bathrooms", "bedrooms", "price"]
    if has_label:
        kept.append("interest_level")

    # Work on an explicit copy: assigning new columns onto a plain column slice
    # triggers SettingWithCopyWarning, and writing nb_photos onto `df` itself
    # would silently alter the caller's frame.
    features = df[kept].copy()
    # Insert before "price" so the column order matches the documented layout.
    features.insert(3, "nb_photos", df["photos"].apply(len))

    # The +0.01 offset keeps the ratios finite when a listing has zero rooms.
    features["price_per_bathroom"] = features["price"] / (features["bathrooms"] + 0.01)
    features["price_per_bedroom"] = features["price"] / (features["bedrooms"] + 0.01)

    # 0/1 indicator flags.
    features["at_least_one_bathroom"] = (features["bathrooms"] > 0).astype("int64")
    features["at_least_one_bedroom"] = (features["bedrooms"] > 0).astype("int64")
    return features

In [None]:
# Working aliases for the loaded frames (plain name bindings; no copy is made).
train, test = df_train, df_test

In [None]:
# Engineer features for both sets; only the training frame carries the target.
train = build_features(train, has_label = True)
x_test = build_features(test, has_label = False)

# Encode the target as integer class ids, then separate it from the predictors.
y_train = le.transform(train["interest_level"])
x_train = train.drop("interest_level", axis = 1)

In [None]:
# Hold out 30% of the labelled rows for validation. Pinning random_state makes the
# split, and therefore every downstream metric, reproducible across kernel restarts.
# NOTE: this cell rebinds x_train / y_train, so re-running it without re-running the
# previous cell splits the already-reduced training set again.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, train_size = 0.7, random_state = 42
)

In [None]:
# listing_id is carried along only as an identifier, so it is removed before the
# frames are wrapped as DMatrix objects for xgboost.
train_features = x_train.drop("listing_id", axis = 1)
valid_features = x_valid.drop("listing_id", axis = 1)

d_train = xgb.DMatrix(train_features, label = y_train)
d_valid = xgb.DMatrix(valid_features, label = y_valid)

In [None]:
# Quick visual check of the first rows of the training features.
x_train.head(n = 5)

In [None]:
# Booster configuration: 3-class problem, per-class probabilities as output,
# evaluated with multi-class log loss.
params = {
    "eta": 0.2,
    "objective": "multi:softprob",
    "num_class": 3,
    "eval_metric": "mlogloss",
}

# Both sets are reported every 10 rounds; training runs for at most 50 rounds and
# stops early after 20 rounds without improvement.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(
    params,
    d_train,
    num_boost_round = 50,
    evals = watchlist,
    early_stopping_rounds = 20,
    verbose_eval = 10,
)

In [None]:
# Feature importance as split counts ("weight"), which is what get_fscore reports.
bst.get_score(importance_type = "weight")

In [None]:
# Score the test listings; listing_id is an identifier and is excluded from the
# model inputs, mirroring how the training matrices were built.
test_features = x_test.drop("listing_id", axis = 1)
d_test = xgb.DMatrix(test_features)
p_test = bst.predict(d_test)

In [None]:
# Column k of p_test is the probability of class id k, and the ids come from the
# LabelEncoder, which stores its classes in sorted order (high=0, low=1, medium=2).
# Take the names from le.classes_ so each probability column gets the right label.
# Hard-coding ["low", "medium", "high"] here mislabelled every column.
p_test_df = pd.DataFrame(p_test, columns = list(le.classes_), index = x_test.index)

In [None]:
# Assemble the submission: listing ids plus one probability column per class,
# in the order listing_id, high, medium, low. Columns are aligned on the index.
sub = x_test[["listing_id"]].assign(
    high = p_test_df["high"],
    medium = p_test_df["medium"],
    low = p_test_df["low"],
)
sub.to_csv('basic_submission.csv', index=False)

In [None]:
# Preview only the first rows instead of dumping the whole submission frame
# into the notebook output.
sub.head()