In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from path import Path

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter

# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

# Загрузка данных

In [None]:
# Load the sample submission straight from its zip archive, indexed by listing id.
path_to_zip_file = Path('/kaggle/input/two-sigma-connect-rental-listing-inquiries/sample_submission.csv.zip')
sub_ex = pd.read_csv(path_to_zip_file, index_col='listing_id')
# Uncomment to display the sample submission:
## sub_ex

In [None]:
def get_data(path_to_zip_file):
    """Read a JSON listings file into a DataFrame keyed by ``listing_id``.

    Args:
        path_to_zip_file: Location of the JSON file, handed unchanged to
            ``pd.read_json`` (callers in this notebook pass ``.json.zip`` paths).

    Returns:
        The loaded frame with ``created`` parsed as datetimes and the
        ``listing_id`` column moved into the index.
    """
    listings = pd.read_json(path_to_zip_file, convert_dates=['created'])
    return listings.set_index('listing_id')

In [None]:
# Load the training listings and preview the first row.
raw_train_data = get_data(
    path_to_zip_file='/kaggle/input/two-sigma-connect-rental-listing-inquiries/train.json.zip'
)
raw_train_data.head(n=1)

In [None]:
# Load the test listings and preview the first row.
raw_test_data = get_data(
    path_to_zip_file='/kaggle/input/two-sigma-connect-rental-listing-inquiries/test.json.zip'
)
raw_test_data.head(n=1)

# Исследование данных

In [None]:
# Column names, dtypes and non-null counts of the training frame.
raw_train_data.info()

In [None]:
# Bar chart of how many training listings fall into each interest level.
raw_train_data['interest_level'].value_counts().plot(kind='bar')

In [None]:
# Per-interest-level averages of the numeric columns. numeric_only=True is needed
# because the frame also carries non-numeric columns, which mean() cannot
# aggregate (pandas >= 2.0 raises instead of silently dropping them).
raw_train_data.groupby('interest_level').mean(numeric_only=True).reset_index()

In [None]:
# Correlation heatmap of the numeric columns only. The frame also carries
# non-numeric columns, and DataFrame.corr() raises on them in pandas >= 2.0,
# so they are filtered out explicitly before correlating.
plt.figure(figsize=(10,8))
numeric_train_cols = raw_train_data.select_dtypes(include='number')
sns.heatmap(numeric_train_cols.corr(), annot=True)

In [None]:
# Number of distinct street addresses (a missing value, if any, counts as one).
raw_train_data['street_address'].nunique(dropna=False)

# Количество фотографий

In [None]:
class PhotoCounter(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a ``photos_num`` column.

    ``photos_num`` holds ``len()`` of each entry of the ``photos`` column.
    """

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the ``photos_num`` column."""
        print('Счет количества фотографий')
        photo_counts = X['photos'].apply(len)
        enriched = X.copy()  # work on a copy so the caller's frame stays untouched
        enriched['photos_num'] = photo_counts
        return enriched

# Разворачивание особенностей

In [None]:
# Listing features that get their own 0/1 indicator column.
# The list is hard-coded; the commented-out snippet below counts lower-cased
# features over the training data and keeps the 10 most common ones
# (presumably how this list was produced -- TODO confirm).
## cnt = Counter()
## for fs in raw_train_data.features:
##     feats = set(map(str.lower, fs))
##     for word in feats:
##         cnt[word] += 1
## num_most_common = 10
## MOST_COMMON_FEATS = [k for k, _ in cnt.most_common(num_most_common)]
MOST_COMMON_FEATS = [
    'elevator',
    'hardwood floors',
    'cats allowed',
    'dogs allowed',
    'doorman',
    'dishwasher',
    'laundry in building',
    'no fee',
    'fitness center',
    'laundry in unit']

In [None]:
class MostCommonFeatsAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that one-hot expands the ``features`` column.

    For every name in the module-level ``MOST_COMMON_FEATS`` a 0/1 column is
    appended, telling whether the listing's ``features`` contain that name
    (compared after lower-casing the listing's entries).
    """

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame: a copy of ``X`` plus one indicator column per feature."""
        print('Разворачивание особенностей')
        base = X.copy()  # work on a copy so the caller's frame stays untouched
        # Lower-case each listing's features once, then test membership per name.
        lowered_per_row = [set(map(str.lower, row_feats)) for row_feats in base.features]
        flags = {
            name: [1 if name in row_set else 0 for row_set in lowered_per_row]
            for name in MOST_COMMON_FEATS
        }
        indicator_frame = pd.DataFrame(flags, index=base.index)
        return pd.concat([base, indicator_frame], axis=1)

# День недели

In [None]:
class WeekdayAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a ``created_weekday`` column.

    The value is ``weekday()`` of each entry of the ``created`` column.
    """

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the ``created_weekday`` column."""
        print('Добавление дня недели')
        with_weekday = X.copy()  # work on a copy so the caller's frame stays untouched
        weekdays = with_weekday['created'].apply(lambda stamp: stamp.weekday())
        with_weekday['created_weekday'] = weekdays
        return with_weekday

# Отбор колонок

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen set of columns.

    Args:
        columns: Column names to keep. When ``None`` (the default) the
            module-level ``PASS_COLS`` list is looked up at transform time,
            so ``ColumnSelector()`` behaves as before.
    """

    def __init__(self, columns=None):
        # Stored unmodified under the same name, as scikit-learn's
        # get_params()/clone() expect.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the selected columns, in that order."""
        # Resolve the default lazily: PASS_COLS is defined in a later cell.
        selected = PASS_COLS if self.columns is None else list(self.columns)
        print(f'Выбраны колонки: {selected}')
        return X[selected]

# Построение модели

In [None]:
# Columns passed to the model, in order; the cell displays each column's
# position next to its name.
PASS_COLS = ['bathrooms', 'bedrooms', 'price', 'photos_num', *MOST_COMMON_FEATS, 'created_weekday']
list(enumerate(PASS_COLS))

In [None]:
# The weekday is the only categorical input. Look its position up by name rather
# than hard-coding it, so the index stays correct if PASS_COLS is reordered or
# extended (with the current PASS_COLS this evaluates to 14).
cat_features = [PASS_COLS.index('created_weekday')]
cbc = CatBoostClassifier(verbose=False, cat_features=cat_features)

In [None]:
# Feature engineering steps followed by the CatBoost classifier.
pipeline = Pipeline(steps=[
    ('photo_counter', PhotoCounter()),          # adds photos_num
    ('mc_feats_adder', MostCommonFeatsAdder()),  # adds one 0/1 column per common feature
    ('weekdays_adding', WeekdayAdder()),         # adds created_weekday
    ('selector', ColumnSelector()),              # keeps the model's input columns
    ('cbc', cbc),
])

In [None]:
# Separate the target column from the training features.
y_train = raw_train_data['interest_level']
x_train = raw_train_data.drop('interest_level', axis=1)

In [None]:
# Wider grid kept for reference (commented out in favour of the reduced one below):
#   cbc__learning_rate: [0.03, 0.1], cbc__depth: [4, 6], cbc__l2_leaf_reg: [1, 3]
grid = dict(
    cbc__learning_rate=[0.1],
    cbc__depth=[6],
    cbc__l2_leaf_reg=[1, 3],
)
clf = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, verbose=1)
# `model` holds whatever fit() returns; `clf` remains available as well.
model = clf.fit(x_train, y_train)

In [None]:
# Parameter combination from `grid` selected by the search.
model.best_params_

In [None]:
# Cross-validated score of the selected combination. No `scoring` was passed to
# GridSearchCV, so this is the pipeline's own default score.
model.best_score_

In [None]:
# The test frame is used as-is (this is an alias, not a copy).
x_test = raw_test_data

In [None]:
def submission_predict(clf, x, submission_col_order=['high', 'medium', 'low']):
    """Predict labels and class probabilities laid out in submission column order.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``. When
            it also exposes ``classes_``, that attribute is used to find the
            probability column of each label.
        x: Feature frame; its index becomes the index of the returned table.
        submission_col_order: Class labels in the order the output columns
            should appear. The default list is never mutated.

    Returns:
        Tuple ``(probas, preds)``: ``probas`` is a DataFrame with one
        probability column per label in ``submission_col_order``; ``preds`` is
        the flattened array of predicted labels.

    Raises:
        ValueError: If a requested label cannot be matched to a probability column.
    """
    wanted = list(submission_col_order)  # private copy of the (shared) default list
    preds = np.asarray(clf.predict(x)).flatten()
    raw_probas = np.asarray(clf.predict_proba(x))

    classes = getattr(clf, 'classes_', None)
    if classes is not None:
        # scikit-learn convention: predict_proba columns follow classes_ order.
        # This works even when some label is never the top prediction.
        position = {label: idx for idx, label in enumerate(classes)}
        unknown = [label for label in wanted if label not in position]
        if unknown:
            raise ValueError(
                f'labels {unknown} are not among the classifier classes {list(classes)}')
        order = [position[label] for label in wanted]
    else:
        # Fallback heuristic: take the first sample predicted as each label and
        # use the column holding its highest probability.
        top_cols = raw_probas.argmax(axis=1)
        order = []
        for label in wanted:
            hits = np.flatnonzero(preds == label)
            if hits.size == 0:
                raise ValueError(
                    f'cannot locate the probability column for {label!r}: '
                    'it is never predicted and the classifier has no classes_')
            order.append(top_cols[hits[0]])

    probas = pd.DataFrame(raw_probas[:, order], columns=wanted, index=x.index)
    return probas, preds

# Score the test listings: `probas` is the per-class probability table,
# `preds` the predicted labels.
probas, preds = submission_predict(model, x_test)

In [None]:
# Display the probability table that will be written out as the submission.
probas

In [None]:
# Write the probability table (index included) to the working directory.
probas.to_csv('submission.csv')