In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import xgboost as xgb
import csv
import os
import zipfile
import warnings

from itertools import zip_longest #Make an iterator that aggregates elements from each of the iterables
from sklearn import svm, model_selection, preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor 

%matplotlib inline

In [None]:
# Load the competition tables. Every CSV ships inside its own zip archive, so
# the member file is opened explicitly by name. Both the archive and the member
# stream are managed by `with`, which releases the file handles as soon as the
# frame has been parsed (previously all four archives stayed open).
with zipfile.ZipFile('/kaggle/input/sberbank-russian-housing-market/train.csv.zip') as train_zip, \
        train_zip.open('train.csv') as train_csv:
    train_data = pd.read_csv(train_csv)

with zipfile.ZipFile('/kaggle/input/sberbank-russian-housing-market/test.csv.zip') as test_zip, \
        test_zip.open('test.csv') as test_csv:
    test_data = pd.read_csv(test_csv)

with zipfile.ZipFile('/kaggle/input/sberbank-russian-housing-market/sample_submission.csv.zip') as sample_submit_zip, \
        sample_submit_zip.open('sample_submission.csv') as sample_submit_csv:
    sample_submit_data = pd.read_csv(sample_submit_csv)

with zipfile.ZipFile('/kaggle/input/sberbank-russian-housing-market/macro.csv.zip') as macro_zip, \
        macro_zip.open('macro.csv') as macro_csv:
    macro_data = pd.read_csv(macro_csv)


In [None]:
# Dimensions of the training frame as (rows, columns).
print(train_data.shape)

In [None]:
# Dimensions of the test frame as (rows, columns).
print(test_data.shape)

In [None]:
# Summary statistics of the training frame, as produced by DataFrame.describe().
train_data.describe()

**Анализ целевой переменной**

In [None]:
# Distribution of the target column: 100-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train_data['price_doc'].values, bins=100, kde=True, ax=ax)
# `ax` is the only (and therefore current) axes, so the labels are set on it directly.
ax.set_title('Целевая переменная, price_doc', fontsize=18)
ax.set_xlabel('price_doc', fontsize=12)
plt.show()

In [None]:
# Find the features least correlated with the price (the target variable).
# numeric_only=True: the frame may still contain non-numeric columns at this
# point (they are only encoded further down), and pandas >= 2.0 raises on them
# instead of silently skipping them.
corrs_find = train_data.corr(method='pearson', numeric_only=True)
# Remove the target's self-correlation by label, not by assuming it is last.
corrs_find_prices = corrs_find["price_doc"].drop("price_doc")
# Order by |r| descending using labels (no positional indexing on a
# label-indexed Series); undefined (NaN) correlations sort last, i.e. they
# count as the weakest. Show the 100 weakest features.
corrs_find_prices.loc[corrs_find_prices.abs().sort_values(ascending=False).index].tail(100)

In [None]:
# Select the 100 features least correlated (by |r|) with the target for removal.
# Ranking is done with label-based sort_values instead of feeding argsort
# positions back into a label-indexed Series (deprecated in pandas 2.x and
# unreliable when a correlation is NaN).
weakest_first_last = corrs_find_prices.abs().sort_values(ascending=False).tail(100)
# 'id' is not a feature and is read again later as the submission key
# (test_data['id']), so it must never end up in the drop list.
delete_data = [column for column in weakest_first_last.index if column != 'id']
print(delete_data[0], delete_data[-1])

In [None]:
print(train_data.shape, '\n', test_data.shape)

# Remove all selected columns in one call per frame instead of one in-place
# drop per column. errors='ignore' keeps the cell re-runnable: on a second
# execution the columns are already gone and the call is simply a no-op.
train_data = train_data.drop(columns=delete_data, errors='ignore')
test_data = test_data.drop(columns=delete_data, errors='ignore')

print(train_data.shape, '\n', test_data.shape)

In [None]:
# Check the dataset for missing values: collect the columns that contain at
# least one null and show their null counts, largest first.
null_counts = train_data.isnull().sum()
missing_values = null_counts[null_counts > 0].index.tolist()
null_counts[missing_values].sort_values(ascending=False)

In [None]:
# Replace missing values in the listed columns with the column mean.
missing_values_mean = ['hospital_beds_raion',
                      'state',
                      'num_room',
                      'life_sq',
                      'build_count_monolith',
                      'build_count_brick',
                      'cafe_avg_price_1500',
                      'cafe_sum_1500_min_price_avg',
                      'cafe_sum_1500_max_price_avg',
                      'cafe_avg_price_2000',
                      'cafe_sum_2000_max_price_avg',
                      'floor',
                      'metro_min_walk',
                      'metro_km_walk',
                      'railroad_station_walk_km',
                      'railroad_station_walk_min']

# Means come from the training frame only and are computed once, so the test
# frame is imputed with the same statistics without leaking test information.
train_means = train_data[missing_values_mean].mean()

# Assign the filled block back to the frame. The previous
# `frame[col].fillna(..., inplace=True)` form is chained assignment: it warns in
# pandas 2.x and does not modify the frame at all under Copy-on-Write.
train_data[missing_values_mean] = train_data[missing_values_mean].fillna(train_means)
test_data[missing_values_mean] = test_data[missing_values_mean].fillna(train_means)

In [None]:
# Split the column names into numeric and categorical (object dtype) groups
# and print how many columns fall into each.
column_types = train_data.dtypes
is_object = column_types == "object"
num = column_types[~is_object].index
cat = column_types[is_object].index
print(len(num))
print(len(cat))

In [None]:
# Check the dataset for NaN: boolean mask of missing cells, then the total
# number of missing cells across the whole training frame.
isMissing = pd.isna(train_data)
isMissing.sum().sum()

In [None]:
# Keep the test ids aside, then remove the non-feature columns from both frames.
id_data = test_data['id']
train_data = train_data.drop(columns=['id', 'timestamp'])
test_data = test_data.drop(columns=['id', 'timestamp'])

In [None]:
# Recompute the numeric / categorical (object dtype) column groups on the
# current frame and print the size of each group.
object_mask = (train_data.dtypes == "object").to_numpy()
num = train_data.columns[~object_mask]
cat = train_data.columns[object_mask]
print(len(num))
print(len(cat))

In [None]:
# Encode the categorical variables with a one-hot encoder. The encoder is
# fitted on the training frame only; handle_unknown='ignore' lets the test
# frame contain categories that were not seen during fitting.
encoder = OneHotEncoder(handle_unknown='ignore')

train_onehot = encoder.fit_transform(train_data[cat])
encoder_train_data = pd.DataFrame(train_onehot.toarray())

test_onehot = encoder.transform(test_data[cat])
encoder_test_data = pd.DataFrame(test_onehot.toarray())

In [None]:
# Restore column names and row indices on the encoded frames: the columns get
# the encoder's generated feature names, the rows get the index of the frame
# they were encoded from.
encoded_names = encoder.get_feature_names_out(cat)
for encoded_frame, source_frame in ((encoder_train_data, train_data),
                                    (encoder_test_data, test_data)):
    encoded_frame.columns = encoded_names
    encoded_frame.index = source_frame.index

In [None]:
# Remove the categorical columns; what remains are the non-object columns.
num_train_data = train_data.drop(columns=cat)
num_test_data = test_data.drop(columns=cat)

In [None]:
# Put the non-categorical columns and the one-hot columns side by side
# (column-wise concatenation) and report the resulting shapes.
train_data_concat = pd.concat((num_train_data, encoder_train_data), axis='columns')
test_data_concat = pd.concat((num_test_data, encoder_test_data), axis='columns')

print(train_data_concat.shape)
print(test_data_concat.shape)

In [None]:
# Per-column medians of the assembled training frame, largest first.
train_data_concat.median().sort_values(ascending=False)

In [None]:
# Separate the target column from the feature matrix.
y = train_data_concat['price_doc']
X = train_data_concat.drop(columns=['price_doc'])

print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The competition test features are used as they are.
X_test = test_data_concat

In [None]:
# Total number of missing cells in the assembled training frame.
isMissing2 = pd.isna(train_data_concat)
isMissing2.sum().sum()

**ДЕРЕВО РЕШЕНИЙ**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_log_error, r2_score

# Create the model and set its hyperparameters: a regression tree limited to
# depth 5, with a fixed seed for repeatable fits.
clf = DecisionTreeRegressor(random_state=42, max_depth=5)

clf.fit(X_train, y_train)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously swapped. Each number is labeled so the two printed values can be
# told apart.
tree_predict = clf.predict(X_train)
print('train MSLE:', mean_squared_log_error(y_train, tree_predict))

# `test_preds` holds predictions for the validation split (X_val), not for the
# competition test set.
test_preds = clf.predict(X_val)
print('validation MSLE:', mean_squared_log_error(y_val, test_preds))

**Определение наиболее важных характеристик**

In [None]:
# Train a gradient-boosted model on the training split and plot which features
# it relies on most.

# The features were one-hot encoded earlier, so no text column should be left.
# This replaces a leftover per-column encoding snippet that referenced an
# undefined loop variable and raised NameError on a fresh kernel.
object_columns = X_train.select_dtypes(include='object').columns
assert len(object_columns) == 0, f"non-numeric features remain: {list(object_columns)}"

xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # 'silent' was replaced by 'verbosity' (0 = silent, 1 = warnings).
    'verbosity': 0
}
# feature_names is passed as a plain list of strings rather than a numpy array.
dtrain = xgb.DMatrix(X_train, y_train, feature_names=list(X_train.columns))
# This run overrides the verbosity so that warnings are shown (formerly silent=0).
model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=100)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Retrain a shallower model while monitoring RMSE on the training and
# validation splits, with early stopping driven by the last watchlist entry.
xgb_params = {
    'eta': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'min_child_weight':1,
    # 'silent' was replaced by 'verbosity' (0 = silent).
    'verbosity': 0,
    'seed':0
}

xgtrain = xgb.DMatrix(X_train, y_train, feature_names=list(X_train.columns))
# `xgtest` wraps the validation split, so its feature names must come from
# X_val itself (they were previously taken from the unrelated X_test frame).
xgtest = xgb.DMatrix(X_val, y_val, feature_names=list(X_val.columns))
watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
num_rounds = 100 # Increase the number of rounds while running in local
model = xgb.train(
    xgb_params,
    xgtrain,
    num_rounds,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=5,
)

In [None]:
# Feature-importance chart (at most 50 features) for whatever `model` currently holds.
fig = plt.figure(figsize=(12, 18))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Predict prices for the competition test set. The previous version predicted
# on the training matrix (xgtrain), so `output` contained fitted values for
# training rows instead of test predictions.
# Columns are selected in training order so the matrix lines up with the
# features the model was trained on (raises KeyError if one is missing).
xg_submit = xgb.DMatrix(X_test[X_train.columns])
y_predict = model.predict(xg_submit)

# Pair each prediction with the test id saved before the id column was dropped.
output = pd.DataFrame({'id': id_data.values, 'price_doc': y_predict})
output.head()