# Cian flats EDA

In [14]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

import babel.numbers
import json

import seaborn as sns

# USE THIS STYLE
# plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')
# 
# OR THIS STYLE
import aquarel

import warnings

warnings.filterwarnings("ignore")

theme = aquarel.load_theme("arctic_light")
theme.set_font(family="serif")
theme.apply()

# Сделаем автоподгрузку всех изменений при перепрогонке ячейки
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
custom_pallete = {
    "red": "#BF616A",
    "orange": "#D08770",
    "yellow": "#EBCB8B",
    "green": "#A3BE8C",
    "purple": "#B48EAD",
    "light_green": "#8FBCBB",
    "light_gray_blue": "#88C0D0",
    "light_blue": "#81A1C1",
    "blue": "#5E81AC",
}

## Load data

In [16]:
df_cian = pd.read_csv("./data/cian_houses_training.csv")
df_cian.head()

Unnamed: 0,price,geo_lat,geo_lng,metro,floor,floor_count,square,living_square,kitchen_square,year,...,house_type,heating,breakdown,accomodation_type,author,room_count,floor_ratio,room_ratio,living_ratio,kitchen_ratio
0,18374400,55.80253,37.620945,Марьина Роща,34,49,38.28,11.8,10.23,2027,...,Монолитно-кирпичный,unknown,unknown,Новостройка,КОРТРОС,1,0.693878,38.28,0.308255,0.267241
1,8170000,55.552637,37.337172,Аэропорт Внуково,2,5,31.0,14.61,10.0,2021,...,unknown,unknown,Нет,Вторичка,unknown,1,0.4,31.0,0.47129,0.322581
2,20206500,55.810466,37.624247,Алексеевская,34,37,28.5,21.4,7.62,2024,...,Монолитно-кирпичный,unknown,unknown,Новостройка,КОРТРОС,1,0.918919,28.5,0.750877,0.267368
3,17119620,55.706597,37.632285,Тульская,6,20,38.82,10.6,18.6,2024,...,Монолитный,unknown,unknown,Новостройка,unknown,1,0.3,38.82,0.273055,0.479134
4,7550000,55.551456,37.339499,Филатов луг,2,5,30.0,15.0,9.0,2021,...,unknown,unknown,Нет,Вторичка,unknown,1,0.4,30.0,0.5,0.3


In [17]:
df_cian.shape

(6524, 23)

In [18]:
df_cian.isna().drop_duplicates()

Unnamed: 0,price,geo_lat,geo_lng,metro,floor,floor_count,square,living_square,kitchen_square,year,...,house_type,heating,breakdown,accomodation_type,author,room_count,floor_ratio,room_ratio,living_ratio,kitchen_ratio
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Все необходимые поля заполнены. (Важно, т.к. решается задача регрессии)

In [19]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

import xgboost as xgb
import lightgbm as lgb

In [20]:
y = df_cian["price"]
X = df_cian.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [21]:
numerical_features = [f for f in X_train.columns if X_train.dtypes[f] != 'object']
categorical_features = [f for f in X_train.columns if X_train.dtypes[f] == 'object']

numerical_features, categorical_features

(['geo_lat',
  'geo_lng',
  'floor',
  'floor_count',
  'square',
  'living_square',
  'kitchen_square',
  'year',
  'ceiling_height',
  'room_count',
  'floor_ratio',
  'room_ratio',
  'living_ratio',
  'kitchen_ratio'],
 ['metro',
  'finish_type',
  'view',
  'house_type',
  'heating',
  'breakdown',
  'accomodation_type',
  'author'])

In [22]:
preprocessor = make_column_transformer(
    (StandardScaler(), numerical_features),
    (OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_features)
)

In [23]:
clf = make_pipeline(preprocessor, LinearRegression())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

(36250624.227694176, 0.773252909753818)

In [24]:
# xgb_reg = xgb.XGBRegressor()
clf = make_pipeline(preprocessor, xgb.XGBRegressor())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

(19671138.918004677, 0.8902243375778198)

In [25]:
clf = make_pipeline(preprocessor, xgb.XGBRFRegressor())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

(28299643.260882895, 0.8137798309326172)

In [26]:
clf = make_pipeline(preprocessor, lgb.LGBMRegressor())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2854
[LightGBM] [Info] Number of data points in the train set: 4893, number of used features: 133
[LightGBM] [Info] Start training from score 95262929.838545


(19950427.959127583, 0.8878094559620686)