In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which data files are available by listing the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def preprocess(data):
    """Add simple engineered columns to a listings DataFrame.

    The frame is modified in place and the same object is returned, so both
    ``df = preprocess(df)`` and ``preprocess(df)[cols]`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide:
        * ``photos``   - values supporting ``len``.
        * ``features`` - values supporting ``len`` and iteration over strings.
        * ``created``  - values parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with these columns added:
        ``num_photos``, ``num_features`` - element counts of the two columns.
        ``features_filled`` - ``features``, with empty entries replaced by
        ``['null']``.
        ``features_word`` - one string per row: each feature has its spaces
        replaced by underscores, and features are joined by single spaces.
        ``created_year``, ``created_month``, ``created_day`` - date parts of
        ``created``.

    Notes
    -----
    No bag-of-words matrix is built here; ``features_word`` is left in a form
    that a text vectorizer can consume directly.
    """
    data['num_photos'] = data['photos'].apply(len)
    data['num_features'] = data['features'].apply(len)

    # Only the `features` column is needed to decide on the placeholder, so a
    # Series.apply is used instead of a much slower row-wise DataFrame.apply.
    data['features_filled'] = data['features'].apply(
        lambda feats: feats if len(feats) > 0 else ['null']
    )

    # Collapse each multi-word feature into a single underscore-joined token
    # so that a whitespace tokenizer keeps it intact.
    data['features_word'] = data['features_filled'].apply(
        lambda feats: " ".join(feat.replace(" ", "_") for feat in feats)
    )

    # Parse the timestamp column once and reuse it for every date part.
    created = pd.to_datetime(data['created'])
    data['created_year'] = created.dt.year
    data['created_month'] = created.dt.month
    data['created_day'] = created.dt.day

    return data

In [None]:
# --- Training data ---------------------------------------------------------
train = pd.read_json('../input/train.json')

# Integer class codes used as the training target, plus the inverse mapping
# used further down to name the output columns.
label_num_map = { 'high': 0, 'medium': 1, 'low': 2 }
num_label_map = {num: name for name, num in label_num_map.items()}
train['label'] = train['interest_level'].apply(lambda x: label_num_map[x])

# One-hot copy of the target, appended to `train` as extra columns.
interest_level = pd.get_dummies(train['interest_level'])
train = pd.concat([train, interest_level], axis = 1)

train = preprocess(train)
use_features = ['bedrooms', 'bathrooms', 'price', 'num_features', 'num_photos']
train_X = train[use_features]
train_y = train['label']

# --- Model -----------------------------------------------------------------
# Fit exactly once: without warm_start a second fit() on the same data just
# rebuilds the same ensemble (random_state is fixed) at twice the cost.
gradientB_model = GradientBoostingClassifier(
    n_estimators=20, learning_rate=1.0, max_depth=1, random_state=0
).fit(train_X, train_y)

# --- Prediction ------------------------------------------------------------
test = pd.read_json('../input/test.json')
listing_id = test['listing_id']
test = preprocess(test)[use_features]
result = gradientB_model.predict(test)

# --- Output file -----------------------------------------------------------
# NOTE(review): this writes hard 0/1 class indicators. If the consumer of
# result.csv expects class probabilities (log_loss is imported but unused),
# gradientB_model.predict_proba(test) would be the output to write - confirm.
result_df = pd.DataFrame({ 'level': result }, index = test.index)
il = pd.get_dummies(result_df['level'])
result_df = (
    # get_dummies only creates columns for classes that were actually
    # predicted; reindex guarantees all three exist (all-zero if never
    # predicted) instead of failing with a KeyError on selection.
    il.reindex(columns = [0, 1, 2], fill_value = 0)
    # Dummy dtype differs between pandas versions (bool vs uint8); force
    # integers so the CSV always contains 0/1.
    .astype(int)
    .rename(columns = num_label_map)
    # Both frames carry the index of the test frame, so this aligns row by row.
    .join(listing_id)
    .set_index('listing_id')
)
result_df.to_csv('result.csv')