# Где дешевле жить? Предсказание цен в Airbnb - учимся генерировать признаки и интерпретировать результаты модели

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Загружаем датасет с помощью Pandas
data = pd.read_csv('AB_NYC_2019.csv')

# Добавляем в датасет для каждого объекта расстояние до центра Манхеттена
man_lat = (data[data['neighbourhood_group'] == 'Manhattan']['latitude'].max() - data[data['neighbourhood_group'] == 'Manhattan']['latitude'].min()) / 2
man_lon = (data[data['neighbourhood_group'] == 'Manhattan']['longitude'].max() - data[data['neighbourhood_group'] == 'Manhattan']['longitude'].min()) / 2
data['man_dist'] = abs(data['latitude'] - man_lat) + abs(data['longitude'] - man_lon)

# Кодируем числами категориальные признаки room_type и neighbourhood_group
le = LabelEncoder()
data['room_type'] = le.fit_transform(data['room_type'])
data['neighbourhood'] = le.fit_transform(data['neighbourhood'])
data['neighbourhood_group'] = le.fit_transform(data['neighbourhood_group'])

# Заполняем NoN в reviews_per_month
data['reviews_per_month'] = data['reviews_per_month'].fillna(-999)

# Отрезаем лишние колонки
cols_to_drop = ['id', 'name', 'host_id', 'host_name', 'last_review', 'latitude', 'longitude']
data = data.drop(columns=cols_to_drop)

# Устраняем аномалии
list_neighbourhood = list(data['neighbourhood'].unique())
list_room_type = list(data['room_type'].unique())

for i in list_neighbourhood:
    for j in list_room_type:
        count = data[(data['neighbourhood'] == i) & (data['room_type'] == j)].shape[0]
        if count != 0:
            mean = int(data[(data['neighbourhood'] == i) & (data['room_type'] == j)]['price'].mean())
            std = data[(data['neighbourhood'] == i) & (data['room_type'] == j)]['price'].std()
            std = int(np.nan_to_num(std, nan=0))
            # data.loc[(data['neighbourhood'] == i) & (data['room_type'] == j) & (data['price'] > mean + 2 * std), 'price'] = mean
            mask = (data['neighbourhood'] == i) & (data['room_type'] == j) & (data['price'] > mean + 1 * std)
            data = data.drop(data[mask].index)


In [74]:
# str

In [2]:
data.isna().sum()

neighbourhood_group               0
neighbourhood                     0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
man_dist                          0
dtype: int64

In [3]:
data.shape

(45718, 10)

In [75]:
#

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['price'], axis=1),
    data['price'],
    test_size=0.30,
    random_state=42,
)

X_test_pg1 = X_test[y_test < 100] # pg1
y_test_pg1 = y_test[y_test < 100] # pg1
X_test_pg2 = X_test[(y_test < 500) & (y_test >= 100)] # pg2
y_test_pg2 = y_test[(y_test < 500) & (y_test >= 100)] # pg2
X_test_pg3 = X_test[(y_test < 1000) & (y_test >= 500)] # pg3
y_test_pg3 = y_test[(y_test < 1000) & (y_test >= 500)] # pg3
X_test_pg4 = X_test[y_test >= 1000] # pg4
y_test_pg4 = y_test[y_test >= 1000] # pg4
# X_test_pg5 = X_test[y_test >= 5000] # pg5
# y_test_pg5 = y_test[y_test >= 5000] # pg5

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test_pg1 = scaler.transform(X_test_pg1) # pg1
X_test_pg2 = scaler.transform(X_test_pg2) # pg2
X_test_pg3 = scaler.transform(X_test_pg3) # pg3
X_test_pg4 = scaler.transform(X_test_pg4) # pg4
# X_test_pg5 = scaler.transform(X_test_pg5) # pg5

# mod = LinearRegression()
# mod = KNeighborsRegressor()
mod = RandomForestRegressor(random_state=42)
# mod = GradientBoostingRegressor()
# mod = CatBoostRegressor(random_state=42)
# mod = LGBMRegressor()

param_grid = {
    'n_estimators': [100, 150],   # число деревьев
    'max_depth': [20],          # глубина
    'min_samples_split': [4],   # минимальное число объектов для разбиения
    'min_samples_leaf': [7, 10]      # минимальное число объектов в листе
}

model = GridSearchCV(
    estimator=mod,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=4,         # 4-кратная кросс-валидация
    n_jobs=-1,    # параллельно на всех ядрах
    verbose=2
)

model.fit(X_train, y_train)
y_test_predictions = model.predict(X_test)

print("Best params:", model.best_params_)
print("Best RMSE:", -model.best_score_)
print('-----')

rmse = np.sqrt(mean_squared_error(y_test, y_test_predictions))
print("RMSE:", rmse)

y_test_pg1_predictions = model.predict(X_test_pg1)  # pg1
rmse_pg1 = np.sqrt(mean_squared_error(y_test_pg1, y_test_pg1_predictions))  # pg1
print("RMSE pg1:", rmse_pg1) # pg1

y_test_pg2_predictions = model.predict(X_test_pg2)  # pg2
rmse_pg2 = np.sqrt(mean_squared_error(y_test_pg2, y_test_pg2_predictions))  # pg2
print("RMSE pg2:", rmse_pg2) # pg2

y_test_pg3_predictions = model.predict(X_test_pg3)  # pg3
rmse_pg3 = np.sqrt(mean_squared_error(y_test_pg3, y_test_pg3_predictions))  # pg3
print("RMSE pg3:", rmse_pg3) # pg3

y_test_pg4_predictions = model.predict(X_test_pg4)  # pg4
rmse_pg4 = np.sqrt(mean_squared_error(y_test_pg4, y_test_pg4_predictions))  # pg4
print("RMSE pg4:", rmse_pg4) # pg4

# y_test_pg5_predictions = model.predict(X_test_pg5)  # pg5
# rmse_pg5 = np.sqrt(mean_squared_error(y_test_pg5, y_test_pg5_predictions))  # pg5
# print("RMSE pg5:", rmse_pg5) # pg5

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best params: {'max_depth': 20, 'min_samples_leaf': 7, 'min_samples_split': 4, 'n_estimators': 150}
Best RMSE: 56.798244757732945
-----
RMSE: 57.861928588576305
RMSE pg1: 31.488833290783756
RMSE pg2: 63.84148522132888
RMSE pg3: 348.33498519367174
RMSE pg4: 896.7552402456299
