**При решении использовался кернел [XGB starter in Python](https://www.kaggle.com/sudalairajkumar/xgb-starter-in-python).**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the train and test listings from the zipped JSON files of the
# competition input directory, in that order.
train_data, test_data = (
    pd.read_json(json_path)
    for json_path in (
        '../input/two-sigma-connect-rental-listing-inquiries/train.json.zip',
        '../input/two-sigma-connect-rental-listing-inquiries/test.json.zip',
    )
)

In [None]:
def _append_token_counts(frame, token_counts, token_columns):
    """Return ``frame`` with the token-count matrix appended as dense columns."""
    tokens = pd.DataFrame(token_counts.toarray(), columns=token_columns)
    # reset_index() renumbers the rows 0..n-1 so they line up with ``tokens``;
    # because drop=False, the previous index is kept as a regular column.
    return pd.concat([frame.reset_index(), tokens], axis=1)


def data_transform(train_data, test_data):
    """Build model-ready feature frames from the raw train and test listings.

    Train and test rows are stacked so that derived columns and label
    encodings are computed consistently for both, then split back apart.

    Steps performed:
      * split ``created`` into year / month / day / hour columns;
      * add ``bedroom_price``, ``room_count``, ``description_word_count``,
        ``features_word_count`` (number of entries in the ``features`` list)
        and ``photo_count``;
      * label-encode ``building_id``, ``display_address``, ``manager_id`` and
        ``street_address`` (encoders are fitted on train + test together);
      * append bag-of-words counts of the ``features`` lists (vocabulary of at
        most 200 tokens, fitted on the train rows only);
      * drop ``description``, ``photos``, ``created`` and ``features``.

    Args:
        train_data: DataFrame of training listings, including the
            ``interest_level`` target column.
        test_data: DataFrame of test listings with the same feature columns.

    Returns:
        Tuple ``(X_train, X_test, Y_train)``: the two feature frames (each
        with a fresh 0..n-1 index and the previous index kept as a column)
        and the ``interest_level`` column of ``train_data``.
    """
    Y_train = train_data['interest_level']
    train_data = train_data.drop('interest_level', axis=1)
    n_train = len(train_data)

    dt = pd.concat([train_data, test_data])

    # time columns definition
    dt['created'] = pd.to_datetime(dt['created'])
    created = dt['created'].dt
    dt['year_created'] = created.year
    dt['month_created'] = created.month
    dt['day_created'] = created.day
    dt['hour_created'] = created.hour

    # bedroom_price column definition: price per bedroom, or the full price
    # for listings with 0 bedrooms. Swapping the zero divisor for 1 yields the
    # same values as "divide, then overwrite the zero-bedroom rows" but never
    # produces inf and needs no label-aligned assignment on the stacked frame.
    dt['bedroom_price'] = dt['price'] / dt['bedrooms'].replace(0, 1)

    # room_count column definition
    dt['room_count'] = dt['bedrooms'] + dt['bathrooms']

    # description_word_count column definition (pieces between single spaces)
    dt['description_word_count'] = dt['description'].apply(lambda desc: len(desc.split(' ')))

    # features_word_count column definition; must run while 'features' still
    # holds lists, i.e. before it is flattened to a string below.
    dt['features_word_count'] = dt['features'].apply(len)

    # photo_count column definition
    dt['photo_count'] = dt['photos'].apply(len)

    # columns encoding
    for column_name in ['building_id', 'display_address', 'manager_id', 'street_address']:
        dt[column_name] = LabelEncoder().fit_transform(dt[column_name].values)

    # features transform: every feature phrase becomes a single token
    # ("Hardwood Floors" -> "Hardwood_Floors"), phrases are joined by spaces.
    dt['features'] = dt['features'].apply(
        lambda phrases: ' '.join(phrase.replace(' ', '_') for phrase in phrases)
    )

    X_train = dt.iloc[:n_train]
    X_test = dt.iloc[n_train:]

    count_vec = CountVectorizer(max_features=200)
    train_features_token_counts = count_vec.fit_transform(X_train['features'])
    test_features_token_counts = count_vec.transform(X_test['features'])

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on old releases.
    if hasattr(count_vec, 'get_feature_names_out'):
        token_columns = list(count_vec.get_feature_names_out())
    else:
        token_columns = count_vec.get_feature_names()

    # columns drop
    dropped_columns = ['description', 'photos', 'created', 'features']
    X_train = _append_token_counts(
        X_train.drop(dropped_columns, axis=1), train_features_token_counts, token_columns
    )
    X_test = _append_token_counts(
        X_test.drop(dropped_columns, axis=1), test_features_token_counts, token_columns
    )

    return X_train, X_test, Y_train


# Run the feature pipeline, then report the shapes of the training
# features and of the target.
X_train, X_test, Y_train = data_transform(train_data, test_data)
for fitted_part in (X_train, Y_train):
    print(fitted_part.shape)

In [None]:
# Hyper-parameters of the multi-class classifier.
xgb_params = dict(
    verbosity=1,
    eval_metric='mlogloss',
    eta=0.1,
    objective='multi:softprob',
    n_estimators=1000,
    max_depth=6,
    colsample_bytree=0.7,
    subsample=0.7,
    min_child_weight=1,
)
xgbc = xgb.XGBClassifier(**xgb_params)

# The first 45000 rows are used for fitting; the remaining rows are only
# evaluated. Both sets are reported during training, the held-out one last.
fit_rows, holdout_rows = slice(None, 45000), slice(45000, None)
fit_set = (X_train[fit_rows], Y_train[fit_rows])
holdout_set = (X_train[holdout_rows], Y_train[holdout_rows])
xgbc.fit(*fit_set, eval_set=[fit_set, holdout_set], early_stopping_rounds=40)

In [None]:
# Predict class probabilities for the test listings and write the submission.
Y_test = xgbc.predict_proba(X_test)
listing_ids = test_data['listing_id']

# Submission column -> column of the probability matrix.
# NOTE(review): presumably the classes are ordered alphabetically
# (high, low, medium) — confirm against xgbc.classes_.
proba_column_of = {'high': 0, 'medium': 2, 'low': 1}

submit_columns = {'listing_id': listing_ids}
for level, proba_column in proba_column_of.items():
    submit_columns[level] = Y_test[:, proba_column]

submit = pd.DataFrame(data=submit_columns)
submit.to_csv('submit.csv', index=False)