In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Read the zipped JSON dumps, then swap whatever index pandas derived from the
# files for a fresh positional one (the old index is discarded, not kept as a column).
train_df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip')
train_df = train_df.reset_index(drop=True)
test_df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip')
test_df = test_df.reset_index(drop=True)

In [None]:
# Build one text document per listing by joining its feature tags with spaces
# (assumes each 'features' entry is a list of strings, as ' '.join requires).
# The joined text is kept in separate Series instead of overwriting the raw
# column: overwriting made this cell unsafe to re-run (' '.join on an already
# joined string spaces out every character) and meant that any later len() on
# 'features' measured characters rather than the number of tags.
train_features_text = train_df['features'].apply(' '.join)
test_features_text = test_df['features'].apply(' '.join)

# The vocabulary (capped at 200 terms, English stop words removed) is learned
# from the training listings only and then reused to encode the test listings.
vect = TfidfVectorizer(stop_words='english', max_features=200)
train_sparse = vect.fit_transform(train_features_text)
test_sparse = vect.transform(test_features_text)

In [None]:
# Stack train and test so every engineered feature is computed the same way for
# both. ignore_index gives the combined frame a unique 0..n-1 index instead of
# two overlapping ranges; later cells slice it positionally with iloc.
data = pd.concat([train_df, test_df], ignore_index=True)

# Simple size features.
data['photos_num'] = data['photos'].apply(len)
data['desc_len'] = data['description'].str.split(' ').str.len()

# Calendar parts of the creation timestamp.
created = pd.to_datetime(data['created'], format='%Y-%m-%d %H:%M:%S')
data['created_month'] = created.dt.month
data['created_weekday'] = created.dt.weekday
data['created_hour'] = created.dt.hour

# How many listings (train + test) share the same location once longitude and
# latitude are rounded to 3 decimals. Mapping through value_counts() replaces a
# per-row lookup whose default, min(coords_freq.values()), was re-evaluated for
# every row even though every key is present by construction.
coords = data['longitude'].round(3).astype(str) + '_' + data['latitude'].round(3).astype(str)
data['spot_freq'] = coords.map(coords.value_counts())

data['logprice'] = np.log(data['price'])

# Fractional part of the bathroom count (e.g. 1.5 -> 0.5).
data['half_bathrooms'] = data['bathrooms'] - np.trunc(data['bathrooms'])

# Rows with bedrooms == 0 (or rooms == 0) used to divide by zero and produce
# inf; a zero denominator is masked to NaN so the ratio is missing instead.
data['price_per_bedroom'] = data['price'] / data['bedrooms'].replace(0, np.nan)

data['rooms'] = data['bathrooms'] + data['bedrooms']

data['price_per_room'] = data['price'] / data['rooms'].replace(0, np.nan)

# len() of whatever 'features' holds at this point: the number of entries while
# it is still a list, or the number of characters if an earlier cell has
# already joined it into a single string.
data['features_num'] = data['features'].apply(len)

# Drop the identifier and the raw columns that were only needed as inputs
# above. A single non-inplace drop at the end leaves the remaining columns in
# the same order as dropping them one by one.
data = data.drop(columns=['listing_id', 'photos', 'description', 'created', 'features'])

In [None]:
# Replace each string-valued identifier/address column with integer codes.
# The encoder is fitted per column on the full `data` frame.
categorical = ['building_id', 'manager_id', 'display_address', 'street_address']
for column in categorical:
    data[column] = LabelEncoder().fit_transform(data[column])

In [None]:
# `data` holds the train rows first and the test rows last, so both parts are
# recovered by position.
train_rows = data.iloc[:len(train_df)]
test_rows = data.iloc[-len(test_df):]

# Target: a one-column frame with 'interest_level' encoded as integers.
y_train = train_rows[['interest_level']].apply(LabelEncoder().fit_transform)

# Features: everything except the target column.
X_train = train_rows.drop(columns=['interest_level'])
X_test = test_rows.drop(columns=['interest_level'])

In [None]:
# Preview the first five training rows (head() defaults to 5).
X_train.head()

In [None]:
# Preview the first five test rows (head() defaults to 5).
X_test.head()

In [None]:
# Append the TF-IDF columns to the right of the tabular features and keep the
# result in CSR form, which supports the row indexing used for the CV folds.
# NOTE: X_train / X_test are rebound from themselves, so running this cell a
# second time without re-running the previous ones appends the TF-IDF block again.
X_train = sparse.hstack((X_train, train_sparse), format='csr')
X_test = sparse.hstack((X_test, test_sparse), format='csr')

In [None]:
def fit_catboost(X_train, Y_train, X_test, n_splits=5):
    """Train one CatBoost model per stratified CV fold and average their test predictions.

    A new classifier is created for every fold. Reusing a single instance (as
    this function used to) makes every entry of the model list point at the
    same object, so the "average" collapses to the last fold's model.

    Parameters
    ----------
    X_train : training feature matrix; must support ``X_train[row_indices, :]``
        indexing (e.g. a scipy CSR matrix or a numpy array).
    Y_train : training targets; must support ``.iloc[row_indices]``
        (a pandas Series or single-column DataFrame).
    X_test : feature matrix to predict on, with the same columns as ``X_train``.
    n_splits : number of stratified folds, and therefore of models averaged.
        Defaults to 5.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of each fold model's ``predict_proba(X_test)`` output.
    """
    params = {'n_estimators': 1000,
              'loss_function': 'MultiClass',
              'eval_metric': 'MultiClass',
              'learning_rate': 0.02,
              'max_depth': 6,
              'verbose': 100,
              'random_seed': 1
             }

    cv = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    classifiers = []
    for train_index, val_index in cv.split(X_train, Y_train):
        x_fit, x_val = X_train[train_index, :], X_train[val_index, :]
        y_fit, y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]

        # Fresh model per fold; the held-out part is passed as the eval set and
        # use_best_model=True asks CatBoost to keep the best iteration on it.
        model = CatBoostClassifier(**params)
        model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], use_best_model=True)
        classifiers.append(model)

    # Average over however many models were trained, with the number of classes
    # taken from the predictions themselves rather than hard-coded.
    fold_probas = [clf.predict_proba(X_test) for clf in classifiers]
    predictions = np.mean(fold_probas, axis=0)

    return predictions

In [None]:
# Fold-averaged class probabilities for the test listings.
preds = fit_catboost(X_train=X_train, Y_train=y_train, X_test=X_test)

In [None]:
# After label encoding we have: 0 - high, 1 - low, 2 - medium
# (LabelEncoder sorts the class names), while the submission columns are
# high, medium, low. Reorder into a NEW array instead of swapping columns of
# `preds` in place: the in-place swap undid itself whenever this cell was run
# twice, silently writing the medium/low probabilities under the wrong headers.
submission_probs = preds[:, [0, 2, 1]]

ids = test_df['listing_id']
out = pd.concat([pd.DataFrame(ids), pd.DataFrame(submission_probs, columns=['high', 'medium', 'low'])], axis=1)
out.to_csv("submission.csv", index=False)