## Required packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import xgboost as xgb
import csv
import os

from itertools import zip_longest
from sklearn import svm, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

%matplotlib inline

In [None]:
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
!unzip /kaggle/input/sberbank-russian-housing-market/train.csv.zip
!unzip /kaggle/input/sberbank-russian-housing-market/test.csv.zip
!unzip /kaggle/input/sberbank-russian-housing-market/macro.csv.zip
!unzip /kaggle/input/sberbank-russian-housing-market/sample_submission.csv.zip

In [None]:
# List the contents of /kaggle/working.
!ls /kaggle/working

In [None]:
# Load the train and test tables from /kaggle/working.
train = pd.read_csv("/kaggle/working/train.csv")
test = pd.read_csv("/kaggle/working/test.csv")

In [None]:
# Lift the row display limit (a global pandas option), then show the first five
# records transposed, i.e. one displayed row per column of `train`.
pd.set_option('display.max_rows', None)
train.head(5).transpose()

In [None]:
# (rows, columns) of the training table.
train.shape

## Let's investigate data types and missing values

In [None]:
# Number of columns per dtype, displayed as a one-column frame.
train.dtypes.value_counts().to_frame()

In [None]:
# Columns with at least one missing value, in their original order.
missingValueColumns = [col for col in train.columns if train[col].isnull().any()]
# Bar chart of the non-missing counts for those columns only.
msno.bar(
    train[missingValueColumns],
    figsize=(20, 8),
    color=(0.5, 0.5, 1),
    fontsize=12,
    labels=True,
)

### We will use the neighborhood of each flat to estimate its price per square meter (price_doc_per_sq)

### So, we need anchors. One of them can be the heart of Moscow (the Kremlin).

### There are also a few nuclear reactors in Moscow. One of them is at the Kurchatov Institute.

In [None]:
# Histogram of `kremlin_km` over the training rows (100 bins).
plt.hist(train['kremlin_km'], bins=100)

In [None]:
# Histogram of `nuclear_reactor_km` over the training rows (100 bins).
plt.hist(train['nuclear_reactor_km'], bins=100)

In [None]:
# Average sale price over the whole training set; used later as the fallback
# estimate for rows that have no neighbours.
mean_price = train['price_doc'].mean()

In [None]:
# Histogram of the target `price_doc` on its raw scale (100 bins).
plt.hist(train['price_doc'], bins=100)

## price_doc distribution looks like lognormal

In [None]:
# Same histogram after a natural-log transform of `price_doc`.
plt.hist(np.log(train['price_doc']), bins=100)

In [None]:
# Prepare dataset

In [None]:
# Target is the sale price. The feature frame keeps every column except the
# timestamp, so `id` and `price_doc` are still present in X_train at this stage.
y_train = train['price_doc']
X_train = train.drop(columns=['timestamp'])
feature_names = X_train.columns.tolist()

In [None]:
# Replace every object-typed column with integer labels (one encoder per column).
for c in X_train.columns:
    if X_train[c].dtype != 'object':
        continue
    lbl = preprocessing.LabelEncoder()
    X_train[c] = lbl.fit_transform(list(X_train[c].values))

# Fill the remaining NaNs with the per-column mean and rebuild the frame,
# because the imputer returns a bare array.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train = pd.DataFrame(imp.transform(X_train), columns=feature_names)

In [None]:
# Rows with full_sq == 0 would give an infinite price per square unit, so remove
# them from both the features and the target. The labels are computed once and
# the drops are not in-place: `y_train` is a column taken from `train`, and
# mutating it in place can warn or silently touch cached data.
zero_area_index = X_train.index[X_train['full_sq'] == 0]
y_train = y_train.drop(zero_area_index)
X_train = X_train.drop(zero_area_index)

# Price per unit of area; the division aligns on index labels.
X_train['price_doc_per_sq'] = y_train / X_train['full_sq']
# Placeholder values; the neighbour-search loop overwrites them row by row.
X_train['average_price_doc_per_sq_near'] = X_train['price_doc_per_sq']

In [None]:
# Shapes of the feature frame and the target after the zero-area rows were dropped.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Number of infinite values in `price_doc_per_sq`.
np.sum(np.isinf(X_train['price_doc_per_sq']))

In [None]:
def get_candidates(row, X_train, eps, training=False):
    """Return a boolean mask over ``X_train`` selecting the neighbours of ``row``.

    A row of ``X_train`` is a neighbour when it has the same ``sub_area`` as
    ``row`` and both its ``kremlin_km`` and ``nuclear_reactor_km`` lie strictly
    within ``eps`` of the corresponding values of ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``kremlin_km``, ``nuclear_reactor_km`` and ``sub_area``,
        plus ``id`` when ``training`` is true.
    X_train : pandas.DataFrame
        Frame to search; must contain the same columns.
    eps : float
        Half-width of the open interval around each anchor distance.
    training : bool, default False
        When true, the row whose ``id`` equals ``row['id']`` is excluded, so a
        training row is never its own neighbour.

    Returns
    -------
    pandas.Series
        Boolean mask indexed like ``X_train``.
    """
    mask = X_train['sub_area'] == row['sub_area']
    for anchor in ('kremlin_km', 'nuclear_reactor_km'):
        distance = X_train[anchor]
        within = (row[anchor] - eps < distance) & (distance < row[anchor] + eps)
        mask = mask & within

    if training:
        mask = mask & (X_train['id'] != row['id'])
    return mask

In [None]:
# For every training row, bisect the search radius (in km) between 0 and 2 so that
# roughly 10 neighbours fall inside it, then estimate the row's price as
# (mean neighbour price per square unit) * (the row's own area).
# Rows that end up with no neighbours fall back to the global mean price.
neighbors = []
for i, row in X_train.iterrows():
    l_eps = 0  # km
    r_eps = 2  # km

    while r_eps - l_eps > 0.01:  # TODO tune
        m_eps = (l_eps + r_eps) / 2
        candidates = get_candidates(row, X_train, m_eps, training=True)
        # Too few neighbours -> widen the radius, otherwise shrink it.
        if candidates.sum() <= 10:
            l_eps = m_eps
        else:
            r_eps = m_eps

    # `candidates` is the mask from the last radius that was tried.
    n_candidates = candidates.sum()
    if n_candidates != 0:
        estimate = X_train.loc[candidates, 'price_doc_per_sq'].mean() * X_train.loc[i, 'full_sq']
    else:
        estimate = mean_price
    # Single .loc assignment instead of chained indexing, which may write to a
    # temporary copy and triggers SettingWithCopyWarning.
    X_train.loc[i, 'average_price_doc_per_sq_near'] = estimate
    neighbors.append(n_candidates)
    if i % 1000 == 0:
        print("progress: {}/{}".format(i, X_train.shape[0]))

In [None]:
# Summary of how many neighbours each training row ended up with:
# minimum, maximum and median, then the number of rows with none at all.
arr = np.array(neighbors)
for stat in (arr.min(), arr.max(), np.median(arr)):
    print(stat)
(arr == 0).sum()

In [None]:
# Number of infinite values in the neighbourhood-based estimate.
np.sum(np.isinf(X_train['average_price_doc_per_sq_near']))

In [None]:
# Histogram of the neighbourhood-based estimate (default binning).
plt.hist(X_train['average_price_doc_per_sq_near'])

## Here we try to obtain the important features from the `svm.SVR` coefficients

In [None]:
# svm_model = svm.SVR(kernel='linear', epsilon=0.1, max_iter=10000).fit(X_train, y_train)

In [None]:
# imp = list(zip(abs(svm_model.coef_[0]), feature_names))
# imp.sort(key=lambda x: x[0], reverse=True)
# imp, names = list(zip(*imp))
# imp = imp[:15]
# names = names[:15]

In [None]:
# names

In [None]:
# plt.barh(range(len(imp)), imp, align='center')
# plt.yticks(range(len(names)), names)
# plt.show()

## It seems that an SVM with a linear kernel and such a large number of features does not approximate prices well.

## Plan B: use xgboost.

In [None]:

# Hyper-parameters shared by every xgboost run in this notebook.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # 'reg:linear' is the deprecated alias of 'reg:squarederror'.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmsle',
    # 'silent' is no longer an xgboost parameter; 'verbosity': 0 keeps it quiet.
    'verbosity': 0
}

# Train on every column except the target, the price-per-square value derived
# from the target, and the row id.
dtrain = xgb.DMatrix(X_train.drop(['price_doc', 'price_doc_per_sq', 'id'], axis=1), y_train)

# Cross-validate with early stopping and plot the train vs. held-out RMSLE per round.
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False)
cv_output[['train-rmsle-mean', 'test-rmsle-mean']].plot()

In [None]:
# Train on the full feature matrix for as many rounds as `cv_output` has rows.
# `dict(xgb_params, silent=0)` is a copy of the shared parameters with `silent` set to 0.
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [None]:
# Per-feature scores from the trained booster, as a frame sorted from the
# highest score to the lowest.
featureImportance = model.get_fscore()
features = pd.DataFrame({
    'features': list(featureImportance.keys()),
    'importance': list(featureImportance.values()),
}).sort_values(by=['importance'], ascending=False)

# Bar chart of the 30 highest-scoring features.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
plt.xticks(rotation=60)
sns.barplot(data=features.head(30), x="features", y="importance", ax=ax, orient="v")

## Now we can discover features that xgboost considers to be important.

In [None]:
# Names of the 30 highest-scoring features.
features.head(30)['features']

## Dealing with multicollinearity

### Unfortunately, most of these features are highly correlated:

In [None]:
# Correlation matrix of the 30 highest-scoring features plus the target.
topFeatures = features["features"].tolist()[:30]
topFeatures.append("price_doc")
corrMatt = X_train[topFeatures].corr()

# Explicit boolean mask that hides the strict upper triangle (True = hidden).
# Building the mask from the correlation values themselves would leave any
# cell whose correlation is exactly 0 visible.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(30, 30)
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, center=0, ax=ax)

## Here we carefully exclude features that are highly correlated with others.
## There is some domain expertise behind the scenes. For example, cafes are often located in office premises, so we don't need to keep both features.
## Also, the distance to the metro in kilometers and in minutes carries the same information.
## After several iterations we exclude the following features.

In [None]:
# Features excluded because they are highly correlated with features that are kept.
# Each name appears exactly once; 'cafe_count_5000_price_1500' used to be listed twice.
bad_features = ['cafe_count_5000_price_high', 'cafe_count_2000', 'cafe_count_3000_price_2500', 'cafe_count_3000', 'cafe_count_2000_price_2500',
               'swim_pool_km', 'ttk_km',
               'cafe_count_5000_price_2500', 'sport_count_3000',
               'university_km', 'theater_km',
               'kindergarten_km',
               'railroad_km',
               'metro_min_avto',
               'cafe_count_5000', 'cafe_count_3000_price_1500', 'cafe_count_5000_price_1500', 'cafe_count_3000_price_1000', 'cafe_count_5000_price_4000',
               'nuclear_reactor_km', 'radiation_km',
               'zd_vokzaly_avto_km', 'metro_km_avto',
               'detention_facility_km', 'ice_rink_km',
               'cafe_count_1000', 'cafe_count_2000_price_1000', 'cafe_sum_5000_min_price_avg', 'cafe_count_5000_price_1000',
               'industrial_km', 'school_km', 'big_road1_km', 'park_km', 'trc_count_3000', 'exhibition_km',
               'office_count_3000', 'office_count_5000']


## It might be a little confusing to remove universities, schools and kindergartens from the feature space, but they are too highly correlated with public healthcare, which is more important.

In [None]:
# x_train = train.drop(["id", "timestamp", "price_doc"], axis=1)
# x_train = x_train.drop(labels=bad_features, axis=1)


# for c in x_train.columns:
#     if x_train[c].dtype == 'object':
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(x_train[c].values)) 
#         x_train[c] = lbl.transform(list(x_train[c].values))
        
# dtrain = xgb.DMatrix(x_train, y_train)
# cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=100, early_stopping_rounds=20,
#     verbose_eval=50, show_stdv=False)
# cv_output[['train-rmsle-mean', 'test-rmsle-mean']].plot()
# num_boost_rounds = len(cv_output)
# model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)

In [None]:
# featureImportance = model.get_fscore()
# features = pd.DataFrame()
# features['features'] = featureImportance.keys()
# features['importance'] = featureImportance.values()
# features.sort_values(by=['importance'],ascending=False,inplace=True)
# topFeatures = features["features"].tolist()[:30]
# new_features = [x for x in topFeatures if x not in bad_features]
# corrMatt = train[new_features].corr()
# mask = np.array(corrMatt)
# mask[np.tril_indices_from(mask)] = False
# fig,ax= plt.subplots()
# fig.set_size_inches(30,30)
# sns.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True, center=0)

## After one more round of selection we chose the following features:

In [None]:
# final_features = ['full_sq',
#  'life_sq',
#  'kitch_sq',
#  'num_room',
#  'build_year',
#  'max_floor',
#  'floor',
#  'state',
#  'sadovoe_km',
#  'metro_min_walk',
#  'public_healthcare_km',
#  'office_sqm_5000',
#  'public_transport_station_km',
#  'church_count_5000',
#  'material',
#  'sub_area',
#  'ID_metro',
#  'indust_part']

In [None]:
# Hand-picked feature set used for the final model (order matters: it defines
# the column order of the matrices built from it).
final_features = [
    'full_sq', 'life_sq', 'build_year', 'num_room',
    'kitch_sq', 'floor', 'state', 'max_floor',
    # the two anchor distances that get_candidates searches on
    'kremlin_km', 'nuclear_reactor_km',
    'metro_min_walk', 'public_healthcare_km',
    'office_sqm_5000', 'public_transport_station_km',
]

In [None]:
# Engineered neighbourhood estimate, appended to `final_features` when the final training matrix is built.
optional_features = ['average_price_doc_per_sq_near']

In [None]:
# Number of hand-picked features.
len(final_features)

In [None]:
# Correlations between the selected features, the neighbourhood estimate and the target.
corrMatt = X_train[final_features + ['average_price_doc_per_sq_near', 'price_doc']].corr()

# Explicit boolean mask that hides the strict upper triangle (True = hidden).
# Building the mask from the correlation values themselves would leave any
# cell whose correlation is exactly 0 visible.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(30, 30)
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, center=0, ax=ax)

# And now we can fit xgboost in this feature space

In [None]:
# y_train = train["price_doc"]
# x_train = train[final_features]
# x_test = test[final_features]

# for c in x_train.columns:
#     if x_train[c].dtype == 'object':
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(x_train[c].values)) 
#         x_train[c] = lbl.transform(list(x_train[c].values))
        
# for c in x_test.columns:
#     if x_test[c].dtype == 'object':
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(x_test[c].values)) 
#         x_test[c] = lbl.transform(list(x_test[c].values))

# xgb_params = {
#     'eta': 0.05,
#     'max_depth': 5,
#     'subsample': 0.7,
#     'colsample_bytree': 0.7,
#     'objective': 'reg:linear',
#     'eval_metric': 'rmsle',
#     'silent': 1
# }

# dtrain = xgb.DMatrix(x_train, y_train)
# dtest = xgb.DMatrix(x_test)

# cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=100,
#     verbose_eval=50, show_stdv=False)
# cv_output[['train-rmsle-mean', 'test-rmsle-mean']].plot()

In [None]:
# Training matrix restricted to the selected features plus the neighbourhood estimate; the label is `y_train`.
knn_dtrain = xgb.DMatrix(X_train[final_features + optional_features], y_train)

In [None]:
# Cross-validate the reduced feature set and plot train vs. held-out RMSLE per round.
# NOTE(review): with early_stopping_rounds=900 out of num_boost_round=1000, early stopping
# needs 900 rounds without improvement before it can trigger, so it is close to disabled —
# confirm this is intended.
cv_output = xgb.cv(xgb_params, knn_dtrain, num_boost_round=1000, early_stopping_rounds=900,
    verbose_eval=50, show_stdv=False)
cv_output[['train-rmsle-mean', 'test-rmsle-mean']].plot()

In [None]:
# Fit the final model for a fixed 1000 rounds.
# NOTE(review): the round count is hard-coded rather than taken from `cv_output` — confirm.
knn_model = xgb.train(dict(xgb_params, silent=0), knn_dtrain, num_boost_round=1000)

In [None]:
# First rows of the columns the final model is trained on.
X_train[final_features + optional_features].head()

In [None]:
# average_price_doc_per_sq_near

In [None]:
# x_test['kremlin_km'] = test['kremlin_km']
# x_test['nuclear_reactor_km'] = test['nuclear_reactor_km']
# x_test['average_price_doc_per_sq_near'] = x_test['full_sq']

In [None]:
# Take an explicit copy of the selected test columns: adding a column to a
# bare `test[...]` selection triggers SettingWithCopyWarning and may not write
# to the frame that is used afterwards.
X_test = test[final_features].copy()
# Placeholder values; the test neighbour-search loop overwrites them row by row.
X_test['average_price_doc_per_sq_near'] = X_test['full_sq']

In [None]:
# Mean-impute the test features (means are computed on the test frame itself)
# and rebuild a DataFrame, since the imputer returns a bare array.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_test)
test_columns = final_features + ['average_price_doc_per_sq_near']
X_test = pd.DataFrame(imp.transform(X_test), columns=test_columns)

In [None]:
# Attach the raw `sub_area` column from `test`; get_candidates matches rows on it.
X_test['sub_area'] = test['sub_area']

In [None]:
# First rows of the test features, now including `sub_area`.
X_test.head()

In [None]:
# Remove the rows with full_sq == 0 from the raw `train` frame as well (in place).
train.drop(train[train['full_sq'] == 0].index, inplace=True)

In [None]:
# Overwrite X_train's `sub_area` with the raw values from `train` (assignment aligns
# on index labels), so that get_candidates compares it against the raw `sub_area`
# stored in X_test. Object columns of X_train were label-encoded earlier.
X_train['sub_area'] = train['sub_area']

In [None]:
# For every test row, bisect the search radius (in km) between 0 and 2 so that
# roughly 10 *training* neighbours fall inside it, then estimate the row's price as
# (mean neighbour price per square unit) * (the row's own area).
# Rows that end up with no neighbours fall back to the global mean price.
test_neighbors = []
for i, row in X_test.iterrows():
    l_eps = 0  # km
    r_eps = 2  # km

    while r_eps - l_eps > 0.01:  # TODO tune
        m_eps = (l_eps + r_eps) / 2
        candidates = get_candidates(row, X_train, m_eps, training=False)
        # Too few neighbours -> widen the radius, otherwise shrink it.
        if candidates.sum() <= 10:
            l_eps = m_eps
        else:
            r_eps = m_eps

    # `candidates` is the mask from the last radius that was tried.
    n_candidates = candidates.sum()
    if n_candidates != 0:
        estimate = X_train.loc[candidates, 'price_doc_per_sq'].mean() * X_test.loc[i, 'full_sq']
    else:
        estimate = mean_price
    # Single .loc assignment instead of chained indexing, which may write to a
    # temporary copy and triggers SettingWithCopyWarning.
    X_test.loc[i, 'average_price_doc_per_sq_near'] = estimate
    test_neighbors.append(n_candidates)
    if i % 1000 == 0:
        print(i)

In [None]:
# `sub_area` is not one of the columns the final model was trained on; remove it
# from X_test (in place) before building the prediction matrix.
X_test.drop(['sub_area'], axis=1, inplace=True)

In [None]:
# First rows of the test features after the neighbourhood estimate was filled in.
X_test.head()

In [None]:
# Wrap the test features in a DMatrix for prediction (no label).
dtest = xgb.DMatrix(X_test)

In [None]:
# Summary of how many training neighbours each test row got: minimum, maximum,
# median, and the number of rows with none at all.
test_neighbors_np = np.array(test_neighbors)
summary = (
    test_neighbors_np.min(),
    test_neighbors_np.max(),
    np.median(test_neighbors_np),
    np.sum(test_neighbors_np == 0),
)
for value in summary:
    print(value)

In [None]:
# Predictions for the test rows that ended up with no neighbours at all.
# They are computed directly from the model here because `ans` is only assigned
# in a later cell; using it at this point raises NameError on a fresh
# top-to-bottom run.
zero_neighbor_rows = np.flatnonzero(test_neighbors_np == 0)
knn_model.predict(dtest)[zero_neighbor_rows]

In [None]:
# candidates = get_candidates(row, X_train, 0)
# candidates.sum()

In [None]:
# plt.hist(x_test['kremlin_km'], bins=100)

In [None]:
# from scipy import stats
# stats.mode(x_test['average_price_doc_per_sq_near'])

In [None]:
# Retrain `model` on `dtrain` for as many rounds as the current `cv_output` has rows.
# NOTE(review): `dtrain` is the full-feature matrix while `cv_output` now holds the
# CV run on `knn_dtrain`, and the predictions below come from `knn_model`, not
# `model` — confirm this cell is still needed.
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)

In [None]:
# x_knn_train = X_train[final_features + ['average_price_doc_per_sq_near']]
# d_knn_train = xgb.DMatrix(x_knn_train, y_train)
# knn_model = xgb.train(dict(xgb_params, silent=0), d_knn_train, num_boost_round= num_boost_rounds)

In [None]:
# Predict prices for the test set with the reduced-feature model.
ans = knn_model.predict(dtest)

In [None]:
# Histogram of the predicted prices (100 bins).
plt.hist(ans, bins=100)

In [None]:
# True if at least one predicted price is negative.
any(ans < 0)

In [None]:
# True if at least one prediction is NaN.
np.any(np.isnan(ans))

In [None]:
# True if at least one prediction is infinite.
np.any(np.isinf(ans))

In [None]:
# for i, val in enumerate(ans):
#     if val < 0: 
#         print(i, val)

## Unexpectedly, there is one negative prediction on the test set. Let's just replace it with the sample mean.

In [None]:
# ans[6289] = train['price_doc'].mean()

In [None]:
# Two-column array of (id, predicted price). Stacking promotes both columns to
# float, so `id` is cast back to int64 once the frame is built.
subm = np.column_stack((test['id'].to_numpy(), ans))
df = pd.DataFrame(subm, columns=['id', 'price_doc']).astype({'id': 'int64'})

In [None]:
# First rows of the submission frame.
df.head()

In [None]:
# Write the submission file without the index column.
df.to_csv('/kaggle/working/output-ver3.csv', index=False)

# Final private score: RMSLE = 0.32382
# ~ 1682nd rank on the private leaderboard