In [None]:
pip install --upgrade seaborn

In [None]:
import seaborn as sns

In [None]:
sns.__version__

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import osmnx as ox
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.image as mpimg
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore");

import os
print(os.listdir("../input"))


import zipfile
with zipfile.ZipFile('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip', 'r') as zip_obj:
   # Extract all the contents of zip file in current directory
   zip_obj.extractall('/kaggle/working/')

with zipfile.ZipFile('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip', 'r') as zip_obj:
   # Extract all the contents of zip file in current directory
   zip_obj.extractall('/kaggle/working/')
    
    
print('After zip extraction:')
print(os.listdir("/kaggle/working/"))

In [None]:
data_root = '/kaggle/working'
print(os.listdir(data_root))

In [None]:
train_df = pd.read_json("../working/train.json")
train_df.head(3)

Рассмотрим данные более подробно

In [None]:
data_df = train_df.copy()

Выясним, какие категории объявлений существуют

In [None]:
data_df['interest_level'].value_counts()/data_df.shape[0]

Отметим, что категории распределены неравномерно: преобладает категория с низким интересом, категория с высоким интересом занимает лишь 7%.

In [None]:
data_df.info()

**Exploratory data analysis**

**Гипотезы**
1. Interest_level может зависеть от price
2. Price зависит от bathrooms и bedrooms
3. Существует зависимость кол-ва photos и interest_level
4. Существует зависимость display_address и price
5. Существует зависимость features и price
6. Существует зависимость created и price

Вычислим среднюю стоимость в каждой категории

In [None]:
data_df.groupby('interest_level')['price'].mean()

In [None]:
plt.figure(figsize=(15, 7))

sns.boxplot(y='price', data=data_df)

plt.title('Price', fontsize=20)
#plt.ylabel('Age', fontsize=14)
#plt.xlabel('Education', fontsize=14)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

Полученный график позволяет сделать вывод о том, что в данном наборе присутствуют выбросы, которые могут негативно повлиять на анализ данных. Исключим выбросы признака 'price'.

In [None]:
q = data_df['price'].quantile(0.99)
data_df = data_df[data_df['price'] < q]

In [None]:
data_df['price'].describe()

In [None]:
# Kernel density estimate of the listing price, one curve per interest level.
sns.displot(
    data=data_df,
    x=data_df['price'] / 1000,  # rescaled, so the x axis reads in thousands
    hue='interest_level',
    kind="kde",
    # Pass common_norm=False to normalise each interest level independently.
)

plt.title('Price', fontsize=20)
plt.xlabel('Price (thousands)', fontsize=14)  # the plotted values are price / 1000
plt.ylabel('Density', fontsize=14)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

In [None]:
data_df.bathrooms.unique()

In [None]:
data_df.building_id.value_counts()

In [None]:
data_df.loc[data_df.bathrooms == 2.5].head(4)

Т.к. признак 'interest_level' является категориальным, проведем его кодирование с помощью средней стоимости квартир внутри группы

In [None]:
def code_mean(data, cat_feature, real_feature):
    """Mean-encode a categorical column.

    Every row receives the mean of ``real_feature`` taken over all rows of
    ``data`` that share the row's ``cat_feature`` value.

    Parameters:
        data: DataFrame holding both columns.
        cat_feature: name of the categorical (grouping) column.
        real_feature: name of the numeric column to average.

    Returns:
        A Series aligned with ``data`` containing the per-group means.
    """
    group_means = data.groupby(cat_feature)[real_feature].mean()
    encoded = data[cat_feature].map(group_means)
    return encoded

In [None]:
data_df['interest_level_mean'] = code_mean(data_df, 'interest_level', 'price')

In [None]:
data_df['interest_level_mean'].value_counts()

Отметим, что средняя стоимость объявлений категории 'low' - 4.176, 'medium' - 3.158, 'high' - 2.700.

Рассмотрим более подробно признак 'price'

In [None]:
data_df['price'].describe()

Построим график зависимости стоимости недвижимости от количества спален (bedrooms)

In [None]:
data_df.plot(kind="scatter", x = "bedrooms", y = "price")

Полученный график позволяет сделать вывод о том, что в данном наборе присутствуют выбросы, которые могут негативно повлиять на анализ данных. Исключим выбросы признака 'price'.

In [None]:
data_df['price'].describe()

In [None]:
# Histogram of price with a KDE overlay.
# sns.distplot is deprecated; histplot is its replacement. stat="density" keeps
# the bars on the density scale that distplot used when kde=True.
plt.figure(figsize=(8,6))
sns.histplot(data_df.price.values, bins=50, kde=True, stat="density")
plt.xlabel('price', fontsize=12)
plt.show()

Рассмотрим атрибут 'created'.

In [None]:
data_df['created'].sort_values(ascending=False)

In [None]:
import datetime

# Encode the creation timestamp as the number of whole days since 2016-01-01.
origin = datetime.datetime(2016,1,1)
created_ts = pd.to_datetime(data_df['created'], format='%Y-%m-%d %H:%M:%S')
data_df['created_code'] = (created_ts - origin).dt.days
data_df['created_code'].describe()


Добавим численные атрибуты: 'num_photos', 'num_features', 'num_description_words'

In [None]:
data_df['num_photos'] = data_df['photos'].apply(len)
data_df['num_features'] = data_df['features'].apply(len)
data_df['num_description_words'] = data_df['description'].apply(lambda x: len(x.split(' ')))

In [None]:
data_df = data_df.drop(['photos', 'created', 'features', 'description', 'listing_id', 'street_address'], axis=1)

In [None]:
data_df.info()

In [None]:
data_df.hist(bins=50, figsize=(20,15))

In [None]:
# Correlate the numeric columns only: the frame still contains string columns
# (e.g. interest_level, manager_id), which DataFrame.corr refuses to handle on
# pandas >= 2.0. Older pandas silently dropped them, so the result is unchanged there.
corr_matrix = data_df.select_dtypes(include='number').corr()


In [None]:
corr_matrix["interest_level_mean"].sort_values(ascending=False)

In [None]:
lat_target = data_df.latitude # массив координат объектов
lng_target = data_df.longitude
#fig, ax = ox.plot_graph(G, figsize=(10,20), close=False, show=False)
#ax.scatter(lng_target, lat_target) # объектов


In [None]:
data_df.loc[data_df['interest_level'] == 'low', ['interest_level_coded']] = 0
data_df.loc[data_df['interest_level'] == 'medium', ['interest_level_coded']] = 1
data_df.loc[data_df['interest_level'] == 'high', ['interest_level_coded']] = 2

In [None]:
data_df['interest_level_coded'].head(10)

In [None]:
#G = ox.graph_from_place('New York, USA', network_type='walk')

#N = data_df['interest_level_coded'].count()

In [None]:
import seaborn as sns

#fig, ax = ox.plot_graph(G, figsize=(10,20), close=False, show=False, bgcolor='grey',)
#c = data_df['interest_level_coded']
             
#scatter = ax.scatter(lng_target, lat_target, c = c)# объектов

# produce a legend with the unique colors from the scatter
#legend1 = ax.legend(*scatter.legend_elements() ,loc="upper right", title="Interests")


In [None]:
from wordcloud import WordCloud

plt.figure(figsize = (12, 12))
text = ' '.join(train_df['description'].values)
wordcloud = WordCloud(max_font_size=None, background_color='white', width=1200, height=1000).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words in Apartment Description', fontsize=14)
plt.axis("off")

In [None]:
list_of_features = list(train_df['features'].values)
plt.figure(figsize = (10, 10))
text = ' '.join(['_'.join(i.split(' ')) for j in list_of_features for i in j])
wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False, width=1200, height=1000).generate(text)
plt.imshow(wordcloud)
plt.title('Top Features', fontsize=14)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of display addresses.
# Spaces inside an address are replaced with underscores so that each address
# stays a single token. The replacement is applied to the same frame the text is
# built from (data_df) and is kept local, so train_df is not modified.
plt.figure(figsize = (12, 12))
addresses = data_df['display_address'].apply(lambda x: x.replace(' ', '_'))
text = ' '.join(addresses.values)
wordcloud = WordCloud(max_font_size=None, background_color='white', width=1200, height=1000).generate(text)
plt.imshow(wordcloud)
plt.title('Display Addresses', fontsize=14)
plt.axis("off")
plt.show()

Рассмотрим категориальные атрибуты

In [None]:
temp = pd.DataFrame(data_df.dtypes)
temp.columns = ["DataType"]

In [None]:
temp

In [None]:
categorical_columns = temp.index[temp["DataType"] == 'object'].values

Исследуем количество уникальных значений категориальных атрибутов

In [None]:
for column in categorical_columns:
    print(column+ " column has :", str(len(data_df[column].unique()))+" distinct values")

**Подготовка данных**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
test_df = pd.read_json("../working/test.json")
test_df.info()

In [None]:
train_df.loc[train_df['interest_level'] == 'low', ['interest_level_coded']] = 0
train_df.loc[train_df['interest_level'] == 'medium', ['interest_level_coded']] = 1
train_df.loc[train_df['interest_level'] == 'high', ['interest_level_coded']] = 2

In [None]:
X_train = train_df.drop(columns=['interest_level', 'interest_level_coded'], axis=1).copy()
y_train = train_df['interest_level_coded'].copy()
#y_train = pd.factorize(y_train)[0]
X_train.shape, y_train.shape

Удалим строки с выбросами по атрибуту 'price'

In [None]:
X_test = test_df.copy()

In [None]:
X_test.shape

In [None]:
X_test.head()

In [None]:
cat_attrs = ['building_id', 'display_address', 'manager_id']

In [None]:
import itertools
from collections import Counter
# Flatten the per-listing feature lists into one list of tags, then keep the
# 25 most frequent tags (most common first).
a = X_train['features'].tolist()
feature_list = [tag for tags in a for tag in tags]
top_25_features = [tag for tag, _count in Counter(feature_list).most_common(25)]
top_25_features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class CustomObjectAttrs(BaseEstimator, TransformerMixin):
    """Replace the 'building_id', 'manager_id' and 'display_address' columns
    with integer codes produced by ``pd.factorize``.

    The transformer is stateless: ``fit`` learns nothing and the codes are
    computed from whatever frame is passed to ``transform``, so the same raw
    value may receive different codes in different frames.
    """

    # Columns that are converted to integer codes.
    _COLUMNS = ('building_id', 'manager_id', 'display_address')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the three columns factorized.

        The original code factorized the literal list ``['display_address']``
        instead of the column itself; every column is now encoded from its
        own values. The input frame is left untouched.
        """
        X = X.copy()
        for column in self._COLUMNS:
            X[column] = pd.factorize(X[column])[0]
        return X

In [None]:
class CustomNumAttrs(BaseEstimator, TransformerMixin):
    """Turn the text/list/id columns into numeric attributes.

    * 'photos'      -> number of photos (``len`` of the list)
    * 'description' -> number of space-separated tokens
    * 'building_id', 'manager_id', 'display_address' -> replaced by
      '<name>_cod', the number of rows in the transformed frame that share
      the same value.

    NOTE(review): the counts are computed from the frame passed to
    ``transform`` (``fit`` learns nothing), so train and test frames are each
    encoded with their own frequencies — confirm this is intended.
    """

    # Columns that are replaced by their per-frame occurrence counts.
    _COUNT_ENCODED = ('building_id', 'manager_id', 'display_address')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a new frame with the numeric attributes; ``X`` is not modified.

        Working on a copy keeps the call repeatable: the original overwrote
        'photos'/'description' in the caller's frame, so a second call on the
        same frame failed when ``len`` was applied to the integers.
        """
        X = X.copy()
        X['photos'] = X['photos'].apply(len)
        X['description'] = X['description'].apply(lambda x: len(x.split(' ')))
        for column in self._COUNT_ENCODED:
            X[column + '_cod'] = X[column].map(X.groupby(column).size())
        # Output column order: description, photos, building_id_cod,
        # manager_id_cod, display_address_cod.
        return X.drop(list(self._COUNT_ENCODED), axis=1)

In [None]:
import datetime

class CustomDateAttrs(BaseEstimator, TransformerMixin):
    """Split the 'created' timestamp column into year/month/day/hour columns."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a new frame in which 'created' is replaced by
        'created_year', 'created_month', 'created_day' and 'created_hour'.

        The input frame is copied first so the caller's 'created' column is
        neither converted nor extended in place.
        """
        X = X.copy()
        created = pd.to_datetime(X["created"])
        X["created_year"] = created.dt.year
        X["created_month"] = created.dt.month
        X["created_day"] = created.dt.day
        X["created_hour"] = created.dt.hour
        return X.drop(['created'], axis=1)

In [None]:
# Class labels learned by each fitted CustomMultiLabelBinarizer, appended in
# fitting order; later cells read it to name the encoded columns.
encoded_features = []

class CustomMultiLabelBinarizer(BaseEstimator, TransformerMixin):
    """One-hot encode the list-valued 'features' column.

    Each listing's feature list is first reduced to the tags found in the
    module-level ``top_25_features``; an empty list becomes ['no_feature']
    and a list without any top tag becomes ['other']. The reduced lists are
    then encoded with a ``MultiLabelBinarizer``.
    """

    def __init__(self):
        # Fitted MultiLabelBinarizer, or None before fitting.
        self.mlb_enc = None

    def fit(self, X, y=None):
        """Fit the binarizer on the reduced feature lists of ``X``."""
        self._fit_encoder(self._reduce(X))
        return self

    def transform(self, X):
        """Return ``X`` with 'features' replaced by one indicator column per class.

        Errors are no longer caught here: the previous handler called
        ``.format(e)`` on a two-placeholder string, which raised IndexError
        and hid the real cause, and returning an un-encoded frame would only
        fail later with a wrong column count.
        """
        labels = self._reduce(X)
        if self.mlb_enc is None:
            # Backward compatibility: the first transform on an unfitted
            # instance fits the encoder, as the original implementation did.
            self._fit_encoder(labels)
        X_enc = pd.DataFrame(self.mlb_enc.transform(labels),
                             columns=self.mlb_enc.classes_,
                             index=X.index)
        # Remaining input columns first, indicator columns after them.
        return pd.concat([X.drop('features', axis=1), X_enc], axis=1)

    def _reduce(self, X):
        """Map every raw feature list to its reduced label list (input untouched)."""
        return X['features'].apply(self.get_features)

    def _fit_encoder(self, labels):
        """Fit a fresh binarizer on ``labels`` and record its classes."""
        self.mlb_enc = MultiLabelBinarizer()
        self.mlb_enc.fit(labels)
        encoded_features.append(self.mlb_enc.classes_)

    @staticmethod
    def get_features(x):
        """Reduce one raw feature list to the labels that get encoded."""
        if len(x)==0:
            return ['no_feature', ]

        features = [feature for feature in x if feature in top_25_features]
        if len(features)==0:
            features.append('other')
        return features

In [None]:
# Column-wise preprocessing. Each entry is (name, transformer, input columns);
# the 'drop' entry contributes no output columns.
# NOTE(review): CustomNumAttrs computes its count encodings inside transform()
# (its fit() learns nothing), so the test frame is encoded with its own counts.
pre_process = ColumnTransformer([('drop_cols', 'drop', ['street_address', 'listing_id']),
                                 ('num_imputer', SimpleImputer(strategy='median'), ['bathrooms', 'bedrooms', 'price', 'latitude', 'longitude']),
                                 ('custom_date_attr', CustomDateAttrs(), ['created', ]),
                                 ('custom_num_attrs', CustomNumAttrs(), ['description', 'photos', 'building_id', 'manager_id', 'display_address']),
                                 ('list_encoder', CustomMultiLabelBinarizer(), ['features', ])
                                 ])

# Fit on the training frame, then apply the fitted transformers to the test frame.
X_train_transformed = pre_process.fit_transform(X_train)
X_test_transformed = pre_process.transform(X_test)

X_train_transformed.shape, X_test_transformed.shape

In [None]:
# Column labels for the transformed matrix. They must follow the order in which
# the ColumnTransformer emits its blocks: imputed numerics, the date parts from
# CustomDateAttrs, the CustomNumAttrs outputs (description, photos, then the
# three count encodings) and finally the binarized features. The previous list
# placed the count encodings before the date parts, so those columns were
# mislabelled.
feature_columns = (
    ['bathrooms', 'bedrooms', 'price', 'latitude', 'longitude']
    + ['created_year', 'created_month', 'created_day', 'created_hour']
    + ['description', 'photos']
    + ['building_id_num', 'manager_id_num', 'display_address_num']
    # Use the most recent fit so re-running the pipeline cell cannot leave
    # stale class names here.
    + list(encoded_features[-1])
)
print(len(feature_columns), feature_columns)

In [None]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_columns)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_columns)

In [None]:
X_train_transformed.head(5)

Разделим набор Train на наборы Train и Val в соотношении 90:10

In [None]:
from sklearn.model_selection import train_test_split

X_train_transformed, X_val_transformed, Y_train, Y_val = train_test_split(X_train_transformed, y_train, test_size=0.1, random_state=2018)

print('Shape of x_train:', X_train_transformed.shape)
print('Shape of x_val:', X_val_transformed.shape)
print('Shape of y_train:', Y_train.shape)
print('Shape of y_val:', Y_val.shape)

In [None]:
Y_train

In [None]:
Y_val

Стандартизация и нормализация данных

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train_transformed)

X_train_scaled = scaler.transform(X_train_transformed)
X_val_scaled = scaler.transform(X_val_transformed)
X_test_scaled = scaler.transform(X_test_transformed )

**Построим нейронную сеть**

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras import initializers
from keras.layers import BatchNormalization

In [None]:
def act(x):
    """Activation used by the hidden layers: ELU with alpha fixed at 0.3."""
    activated = keras.activations.elu(x, alpha=0.3)
    return activated

In [None]:
# Kernel initializer shared by the hidden Dense layers of the model below.
# NOTE(review): despite the "he" in the name, scale=1.0 with mode="fan_avg" and a
# truncated normal is the Glorot/Xavier-normal configuration per the Keras docs;
# He-normal would be scale=2.0 with mode="fan_in" — confirm which one is intended.
he_inint = keras.initializers.VarianceScaling(
    scale=1.0, mode="fan_avg", distribution="truncated_normal", seed=None
)

In [None]:
# Fully connected classifier: six ELU hidden layers (256-256-128-128-64-64) with
# batch normalisation after the first and dropout after the others, followed by
# a 3-way softmax over the interest levels.
model = keras.Sequential(
    [
        # `shape` must be a tuple; a bare int is not accepted by every Keras version.
        keras.Input(shape=(X_train_scaled.shape[1],)),
        layers.Dense(256, kernel_initializer = he_inint, activation=act),
        layers.BatchNormalization(),
        layers.Dense(256, kernel_initializer = he_inint, activation=act),
        layers.Dropout(0.5),
        layers.Dense(128, kernel_initializer = he_inint, activation=act),
        layers.Dropout(0.5),
        layers.Dense(128, kernel_initializer = he_inint, activation=act),
        layers.Dropout(0.5),
        layers.Dense(64, kernel_initializer = he_inint, activation=act),
        layers.Dropout(0.5),
        layers.Dense(64, kernel_initializer = he_inint, activation=act),
        layers.Dropout(0.5),
        layers.Dense(3, activation="softmax"),
    ]
)

model.summary()

In [None]:
# Plain SGD optimizer.
# NOTE(review): nesterov=True is combined with momentum=0.0 — Nesterov momentum
# presumably has no effect without a non-zero momentum; confirm the intended setting.
opt = keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=True, name="SGD")

# The targets are integer class codes (0/1/2), hence the sparse categorical loss.
model.compile(optimizer = opt, loss = "sparse_categorical_crossentropy", metrics=["accuracy"])

# Train for 60 epochs, evaluating on the held-out validation split after each one.
model.fit(X_train_scaled, Y_train, validation_data = (X_val_scaled, Y_val), epochs=60, batch_size=64)

In [None]:
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['val_loss'])
plt.title("Model's Training & Validation loss across apochs")
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
plt.plot(model.history.history['accuracy'])
plt.plot(model.history.history['val_accuracy'])
plt.title("Model's Training & Validation accuracy across apochs")
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
#prediction=model.predict(X_val_scaled)

In [None]:
prediction=model.predict(X_test_scaled)

In [None]:
import xgboost as xgb

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict class probabilities for ``test_X``.

    Parameters:
        train_X, train_y: training features and integer class labels.
        test_X: features to predict on.
        test_y: optional labels for ``test_X``; when given, ``test_X`` is also
            used as the evaluation set and training stops early after 20
            rounds without improvement in mlogloss.
        feature_names: accepted for interface compatibility; currently unused.
        seed_val: random seed passed to XGBoost.
        num_rounds: maximum number of boosting rounds.

    Returns:
        (pred_test_y, model): the predictions for ``test_X`` and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.03,
        'max_depth': 6,
        # 'silent' was removed from XGBoost; verbosity=0 is its replacement.
        'verbosity': 0,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        # Keyword arguments: newer XGBoost releases warn on positional `evals`.
        model = xgb.train(param, xgtrain, num_boost_round=num_rounds,
                          evals=watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(param, xgtrain, num_boost_round=num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [None]:
# Build the submission file.
# The targets were encoded as low=0, medium=1, high=2, so the softmax output
# columns come in that order. Label them accordingly (the previous labels
# high/medium/low swapped the 'low' and 'high' probabilities), then arrange the
# columns as listing_id, high, medium, low, which is the layout written before.
out_df = pd.DataFrame(prediction, columns=["low", "medium", "high"])
out_df["listing_id"] = X_test.listing_id.values
out_df = out_df[["listing_id", "high", "medium", "low"]]
out_df.to_csv("cz.csv", index=False)

In [None]:
out_df.head()