In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from zipfile import ZipFile

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train/test CSV members out of their zip archives.
# The context managers close both the archive and the opened member once the
# frame has been parsed, instead of leaving the file handles open.
with ZipFile("../input/sberbank-russian-housing-market/train.csv.zip") as train_zip, \
        train_zip.open('train.csv') as train_csv:
    train_df = pd.read_csv(train_csv, parse_dates=['timestamp'])

with ZipFile("../input/sberbank-russian-housing-market/test.csv.zip") as test_zip, \
        test_zip.open('test.csv') as test_csv:
    test_df = pd.read_csv(test_csv, parse_dates=['timestamp'])

# EDA

In [None]:
train_df.head()

Признаков очень много, поэтому есть смысл избавиться от части из них. Например, можно убрать малозначимые признаки — те, у которых малая корреляция с таргетом price_doc.

In [None]:
# Pairwise correlations between the numeric columns only.
# The raw frame also holds non-numeric columns (e.g. the parsed timestamp);
# pandas >= 2.0 no longer drops them silently inside corr() and raises instead,
# so they are filtered out explicitly here.
corrMatrix = train_df.select_dtypes(include=['number', 'bool']).corr()
corrMatrix['price_doc'].to_frame('correlation with price_doc')

In [None]:
id_for_submission = test_df.id

In [None]:
def extract_features(df, threshold, target='price_doc'):
    """Select the columns whose absolute correlation with ``target`` is >= ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it must contain a numeric ``target`` column.
    threshold : float
        Minimal absolute correlation with ``target`` for a column to be kept.
    target : str, default 'price_doc'
        Name of the column the correlations are measured against.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        ``df`` restricted to the selected columns (ordered by decreasing
        absolute correlation, the target itself included) and the index of the
        selected column names.
    """
    # Only numeric columns can be correlated; pandas >= 2.0 raises on
    # string/datetime columns instead of silently skipping them.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    sorted_corr = numeric_df.corr().abs().sort_values(target, ascending=False)
    # Columns with an undefined (NaN) correlation fail the comparison and are dropped.
    indeces = sorted_corr[sorted_corr[target] >= threshold].index
    df_sorted = df.loc[:, indeces]

    return df_sorted, indeces

In [None]:
# Keep only the columns whose absolute correlation with price_doc is at least 0.2,
# then give the test frame the same columns (it has no price_doc target).
train_df, indeces = extract_features(train_df, 0.2)
test_feature_columns = indeces.drop('price_doc')
test_df = test_df.loc[:, test_feature_columns]
print(train_df.columns)
print(test_df.columns)

In [None]:
import matplotlib.pyplot as plt
_ =train_df.hist(bins=50, figsize=(20,15))

### Распределение цен

In [None]:
train_df[['price_doc']].plot.hist(bins = 50, title = 'price_doc')

Здесь видно, что очень дорогих квартир мало. Скорее всего они "живут" по каким-то своим правилам. Есть смысл удалить из данных строки, в которых price_doc > 25 000 000.

In [None]:
# Drop the rare, very expensive flats: keep rows priced at most 25,000,000.
price_cap = 0.25 * 1e8
train_df = train_df.loc[train_df['price_doc'] <= price_cap]

In [None]:
train_df[['price_doc']].plot.hist(bins = 50, title = 'price_doc')

### Распределение количества комнат

In [None]:
print('min number of rooms', train_df['num_room'].min())
print('max number of rooms', train_df['num_room'].max())
print('NaNs', train_df['num_room'].isna().sum())

In [None]:
train_df[['num_room']].plot.hist(bins = 50, title = 'num_room in train set', xticks = range(19))
test_df[['num_room']].plot.hist(bins = 50, title = 'num_room in test set', xticks = range(19))

Здесь видно, что в тренировочном датасете крайне мало записей про квартиры с 0 комнат (?), а также с количеством комнат > 4. Можно их убрать из датасета. При этом важно оставить пропуски для дальнейшей работы с ними.

In [None]:
# Keep flats with 1..4 rooms. Rows with an unknown room count (NaN) are kept
# as well, because they are imputed later in the notebook.
num_room_in_range = (train_df['num_room'] > 0) & (train_df['num_room'] <= 4)
num_room_missing = train_df['num_room'].isna()
train_df = train_df[num_room_in_range | num_room_missing]

### Распределение общей площади

In [None]:
# Distribution of the total area (full_sq) in both sets; the titles name the
# column that is actually plotted.
train_df[['full_sq']].plot.hist(bins = 50, title = 'full_sq in train set')
test_df[['full_sq']].plot.hist(bins = 50, title = 'full_sq in test set')

In [None]:
# Keep only flats whose full_sq is below 125. Comparisons with NaN evaluate to
# False, so rows with a missing full_sq are removed by this filter as well.
train_df = train_df[train_df['full_sq']<125]

Можно продолжить подчищать данные, но я пока остановлюсь на этом.

# Заполнение пропусков

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
print('number of NaNs in num_room: ',train_df.num_room.isna().sum())

Пропуски остались только для признака num_room в тренировочном датасете, и их достаточно много. Их можно заполнить с помощью какого-нибудь другого признака, наиболее коррелированного с num_room.

In [None]:
train_df.corr().abs().sort_values('num_room', ascending=False).num_room.to_frame('correlation with num_room').head(5)

Судя по результату, лучше всего выбрать признак full_sq для заполнения пропусков в num_room. 

In [None]:
plt.title('full_sq vs num_room')
plt.xlabel('full_sq')
plt.ylabel('num_room')
plt.scatter(train_df.full_sq, train_df.num_room)

In [None]:
# Box plot of full_sq for each room count 1..4; outliers are drawn as green diamonds.
green_diamond = dict(markerfacecolor='g', marker='D')
data = [train_df[train_df.num_room == rooms].full_sq.values for rooms in (1, 2, 3, 4)]

plt.figure(figsize=(20,10))
plt.title('full_sq vs num_room')
plt.xlabel('num_room')
plt.ylabel('full_sq')
plt.boxplot(data, flierprops=green_diamond)
plt.show()

In [None]:
def drop_outliers(data, k=1.5):
    """Return ``data`` without the values lying outside Tukey's fences.

    A value is kept when it lies inside ``[Q1 - k*IQR, Q3 + k*IQR]`` (bounds
    inclusive), where Q1/Q3 are the 25th/75th percentiles and IQR = Q3 - Q1.

    The fences are anchored on the quartiles, not on the mean: the mean is
    itself pulled by the outliers, so a mean-centred window can shift away
    from the bulk of the data and discard legitimate points.

    Parameters
    ----------
    data : pandas.Series
        Numeric series to filter.
    k : float, default 1.5
        Fence multiplier.

    Returns
    -------
    pandas.Series
        The retained values, with their original index labels.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    mask = (data >= (Q1 - IQR * k)) & (data <= (Q3 + IQR * k))
    return data[mask]

In [None]:
def get_clean_full_sq_df_with_num_room(df, cond):
    """Build a two-column frame (num_room, full_sq) for flats with ``cond`` rooms.

    The full_sq values of the rows where ``num_room == cond`` are passed through
    ``drop_outliers``; every remaining value is paired with the constant
    room count ``cond``.
    """
    full_sq_clean = drop_outliers(df.loc[df.num_room == cond, 'full_sq'])
    room_labels = [cond] * len(full_sq_clean)
    return pd.DataFrame({'num_room': room_labels, 'full_sq': full_sq_clean})

In [None]:
# Stack the outlier-free (num_room, full_sq) frames for 1..4 rooms into one frame
# with a fresh 0..n-1 index.
num_room_vs_full_sq_df = pd.concat(
    [get_clean_full_sq_df_with_num_room(train_df, rooms) for rooms in (1, 2, 3, 4)],
    ignore_index=True,
)

In [None]:
orig_train_df = train_df

In [None]:
# Box plot of the outlier-free data. The values are read straight from
# num_room_vs_full_sq_df rather than by rebinding the global train_df, so the
# real training frame cannot be lost if the notebook is interrupted or cells
# are run out of order.
green_diamond = dict(markerfacecolor='g', marker='D')
data = [num_room_vs_full_sq_df[num_room_vs_full_sq_df.num_room == rooms].full_sq.values
        for rooms in (1, 2, 3, 4)]
plt.figure(figsize=(20,10))
plt.xlabel('num_room')
plt.ylabel('full_sq')
plt.title('full_sq vs num_room')
plt.boxplot(data, flierprops=green_diamond)
plt.show()

In [None]:
train_df = orig_train_df

Идея: обучить модель классификации на датасете num_room_vs_full_sq_df, чтобы по площади определять количество комнат. Предварительно посмотрим на распределение общей площади квартир, для которых известно и неизвестно количество комнат.

In [None]:
num_room_known = train_df['num_room'].notna()
# number of rooms is known
train_df[num_room_known].full_sq.plot.hist(bins = 50, title = 'full_sq')
# number of rooms is NOT known (drawn on the same axes, with a legend)
train_df[~num_room_known].full_sq.plot.hist(bins = 50, title = 'full_sq', legend=True)

Распределения похожи, что очень хорошо. Тогда для заполнения пропусков можно сделать так: на непропущенных значениях num_room и соответствующих им full_sq обучить логистическую регрессию. В качестве сэмплов буду брать full_sq, в качестве таргета — num_room. С помощью построенной модели получить значения для пропусков.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Single feature (full_sq) as an (n, 1) matrix, room count as the class label.
X = num_room_vs_full_sq_df[['full_sq']].to_numpy()
y = num_room_vs_full_sq_df['num_room'].to_numpy()

# Standardize the feature; the fitted `scaler` is kept so the same transform
# can be applied to new full_sq values.
scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

# Hold out a third of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
parameters = {"C":[0.001, 0.1, 1, 2, 5, 10], "penalty":['l2']}
logreg = LogisticRegression(solver='lbfgs')
clf = GridSearchCV(logreg, parameters)
clf.fit(X_train, y_train)
print("tuned hyperparameters :(best parameters) ",clf.best_params_)
print("accuracy :",clf.best_score_)

In [None]:
from sklearn.linear_model import SGDClassifier
# Grid over the regularization strength and the penalty type.
parameters = {"alpha":[0.0001, 0.001, 0.1, 1, 2, 5, 10], "penalty":['l2','l1', 'elasticnet']}
# SGD shuffles the training data between epochs; a fixed seed makes the
# grid-search scores reproducible between runs.
sgdc = SGDClassifier(random_state=42)
clf = GridSearchCV(sgdc, parameters)
clf.fit(X_train, y_train)
print("tuned hyperparameters :(best parameters) ",clf.best_params_)
print("accuracy :",clf.best_score_)

Похоже, получить точность выше ~0.79 одним только подбором параметров не получится. Пока остановлюсь на этом.

In [None]:
clf = LogisticRegression(C=1, penalty='l2').fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Impute the missing num_room values with the classifier.
# train_df is the result of boolean filtering; take a copy so the column
# assignments below write to an independent frame.
train_df = train_df.copy()
missing_num_room = train_df['num_room'].isna()
train_df['log_reg_num_room'] = train_df.num_room
# Guard against an empty selection so the cell can be re-run after the gaps
# have already been filled.
if missing_num_room.any():
    X_to_predict = train_df.loc[missing_num_room, 'full_sq'].values.reshape(-1, 1)
    # The classifier was trained on standardized full_sq, so the same fitted
    # scaler must be applied before predicting; raw values are on a different
    # scale and would be misclassified.
    num_room_predicted = clf.predict(scaler.transform(X_to_predict))
    train_df.loc[missing_num_room, 'log_reg_num_room'] = num_room_predicted

In [None]:
plt.title('full_sq vs log_reg_num_room')
plt.xlabel('full_sq')
plt.ylabel('log_reg_num_room')
plt.scatter(train_df.full_sq, train_df.log_reg_num_room)

Видно, что пропуски хорошо заполнены, т.к. исходная зависимость площади и количества комнат качественно не изменилась.

In [None]:
# Replace num_room with the imputed values and remove the helper column:
# pop() takes the helper column out of the frame and returns it.
train_df['num_room'] = train_df.pop('log_reg_num_room')
train_df.info()

### Придумаем новый признак про спорт с помощью кластеризации.

In [None]:
sports_df = train_df[['sport_count_1500', #sport_count_1500 - The number of sport facilities in 1500 meters zone
                      'sport_count_2000',
                      'sport_count_3000',
                      'sport_count_5000' ]]
sports_array = sports_df.to_numpy()
sports_array

In [None]:
from sklearn.cluster import KMeans
# Columns used for the "sports_nearby" clustering, listed once.
sport_count_columns = ['sport_count_1500',
                       'sport_count_2000',
                       'sport_count_3000',
                       'sport_count_5000']
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(sports_array)
print(kmeans.cluster_centers_)
train_df['sports_nearby'] = kmeans.labels_
# The model was fitted on a plain ndarray, so predict on an ndarray as well
# (same column order); this avoids scikit-learn's feature-name mismatch warning.
test_df['sports_nearby'] = kmeans.predict(test_df[sport_count_columns].to_numpy())

Получается, выделилось два кластера, соответствующие тому, мало или много спортивных сооружений вокруг дома в радиусе от 1.5 км до 5 км.

In [None]:
# Distances to sports buildings, used as clustering features.
sports_km_columns = ['basketball_km', 'swim_pool_km', 'stadium_km']
sports_km = train_df[sports_km_columns].to_numpy()
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(sports_km)
train_df['sports_km'] = kmeans.labels_
# Predict on an ndarray, matching the input type used for fitting, to avoid
# scikit-learn's feature-name mismatch warning.
test_df['sports_km'] = kmeans.predict(test_df[sports_km_columns].to_numpy())

In [None]:
kmeans.cluster_centers_

Получается, выделилось три кластера, соответствующие тому, насколько далеко спортивные сооружения от квартиры: близко (класс 0), далеко (класс 1) и средне далеко (класс 2).

In [None]:
sorted(train_df.columns)

Здесь целых 8 фичей, посвящённых количеству кафе в радиусе 5 км и средней стоимости чека в них. Можно попробовать избавиться от какого-то количества фичей с помощью PCA.

Подготовим данные для осуществления PCA.

In [None]:
from sklearn.preprocessing import StandardScaler
# Cafe-count columns that will be compressed with PCA.
features = ['cafe_count_5000',
 'cafe_count_5000_na_price',
 'cafe_count_5000_price_1000',
 'cafe_count_5000_price_1500',
 'cafe_count_5000_price_2500',
 'cafe_count_5000_price_4000',
 'cafe_count_5000_price_500',
 'cafe_count_5000_price_high']
X_train = train_df.loc[:, features].values
X_test = test_df.loc[:, features].values
# Fit the scaler on the training data only and reuse it for the test data, so
# both sets are expressed on the same scale. Fitting a second scaler on the
# test set would standardize it with different means and variances.
cafe_scaler = StandardScaler()
X_train = cafe_scaler.fit_transform(X_train)
X_test = cafe_scaler.transform(X_test)
X_train

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
# Correlation matrix of the standardized training cafe features, shown with
# two decimals. Styler.set_precision was removed in pandas 2.0; an explicit
# format string works on both old and new versions.
corr = pd.DataFrame(X_train).corr()
corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")

Судя по матрице корреляций, есть смысл выполнять PCA (см. https://www.originlab.com/doc/Origin-Help/PrincipleComp-Analysis)

In [None]:
# Correlation matrix of the standardized test cafe features, shown with two
# decimals. Styler.set_precision was removed in pandas 2.0; an explicit format
# string works on both old and new versions.
corr = pd.DataFrame(X_test).corr()
corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")

Выбор количества компонент предоставляю компьютеру (см. https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/)

In [None]:
# Keep as many principal components as needed to explain 95% of the variance.
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)
reduced_train = pca.fit_transform(X_train)
# Project the test data onto the components learned from the training data.
# Re-fitting on the test set would produce different axes (and possibly a
# different number of components), making the train and test features
# incomparable.
reduced_test = pca.transform(X_test)

In [None]:
pca.explained_variance_ratio_

In [None]:
print(reduced_train.shape)
print(reduced_test.shape)

In [None]:
train_df_PCA = pd.DataFrame(data = reduced_train,
                             columns = ['cafe'])
test_df_PCA = pd.DataFrame(data = reduced_test,
                             columns = ['cafe'])
train_df_PCA

In [None]:
train_df.reset_index(drop=True,inplace=True)
train_df = pd.concat([train_df, train_df_PCA], axis = 1)
train_df

In [None]:
test_df.reset_index(drop=True,inplace=True)
test_df = pd.concat([test_df, test_df_PCA], axis = 1)
test_df

In [None]:
# The raw cafe-count columns are dropped from both frames in place.
cafe_columns = [
    'cafe_count_5000',
    'cafe_count_5000_na_price',
    'cafe_count_5000_price_1000',
    'cafe_count_5000_price_1500',
    'cafe_count_5000_price_2500',
    'cafe_count_5000_price_4000',
    'cafe_count_5000_price_500',
    'cafe_count_5000_price_high',
]
for frame in (train_df, test_df):
    frame.drop(cafe_columns, axis=1, inplace=True)
sorted(train_df.columns)

### Нормализуем данные и подготовим train и test sets.

In [None]:
X = train_df.drop('price_doc', axis=1).to_numpy()
X = StandardScaler().fit_transform(X)
X

In [None]:
y = train_df.price_doc.values
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Выбор алгоритмов для предсказания цен.

Датасет относительно небольшой (<100K сэмплов), поэтому с опорой на https://scikit-learn.org/stable/_static/ml_map.png нет смысла применять SGDRegressor. С учетом того, что предсказываем цену, то ожидаем неотрицательные значения. Линейные алгоритмы могут выдавать отрицательные значения. Поэтому для них нужно преобразовать таргет, например логарифмом. Или, если без преобразований, то есть смысл использовать RandomForest.

In [None]:
def print_metrics(y_test, y_pred):
    """Print MSE, RMSE, MAE and R2 of ``y_pred`` against ``y_test``; returns None."""
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    mse = mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE => {mse}")
    print(f"RMSE => {rmse}")
    print(f"MAE => {mae}")
    print(f"R2 => {r2}")

## Линейные модели

In [None]:
y_train_log = np.log(y_train)

In [None]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X_train, y_train_log)
y_pred_log = regr.predict(X_test)
y_pred = np.exp(y_pred_log)
print_metrics(y_test, y_pred)

Проверка на наличие отрицательных предсказаний.

In [None]:
y_pred[y_pred<0]

### Гребневая регрессия (т.е. линейная регр. с l2 регуляризацией)

In [None]:
# linear_model has to be imported before its first use in this cell; without
# it the cell raises NameError on a fresh kernel.
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
# Grid over the L2 regularization strength, evaluated with 5-fold CV on the
# log-transformed target.
parameters = {'alpha' : [0.001, 0.01, 0.1, 0.5, 1, 5, 10]} 
reg = linear_model.Ridge()
grids = GridSearchCV(reg, parameters,cv = 5)
grids.fit(X_train, y_train_log)

In [None]:
grids.best_estimator_

In [None]:
from sklearn import linear_model
reg = linear_model.Ridge(alpha=0.1)
reg.fit(X_train, y_train_log)
y_pred_log = reg.predict(X_test)
y_pred = np.exp(y_pred_log)
print_metrics(y_test, y_pred)

Регуляризация не улучшила результаты.

### Лассо (лин. регр. с l1 регуляризацией)

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'alpha' : [0.001, 0.01, 0.1, 0.5, 1, 5, 10]} 
reg = linear_model.Lasso(max_iter = 10000)
grids = GridSearchCV(reg, parameters,cv = 3)
grids.fit(X_train, y_train_log)

In [None]:
grids.best_estimator_

In [None]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.001, max_iter = 10000)
reg.fit(X_train, y_train_log)
y_pred_log = reg.predict(X_test)
y_pred = np.exp(y_pred_log)
print_metrics(y_test, y_pred)

Результат ухудшился.

## Случайный лес

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest on the untransformed target. A fixed seed makes the
# reported metrics reproducible between runs.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print_metrics(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Search over the tree depth 10..20 with 2-fold CV. The forest is seeded so
# that the depth selected by the search does not change from run to run.
parameters = {'max_depth' : list(range(10,21))} 
modelregr = RandomForestRegressor(random_state=42)
grids = GridSearchCV(modelregr, parameters,cv = 2, verbose = 2, n_jobs=-1)
grids.fit(X_train, y_train)

In [None]:
grids.best_estimator_

In [None]:
# Final random forest; `regr` is the model later used for the submission.
# Seeded so that both the metrics and the submitted predictions are reproducible.
regr = RandomForestRegressor(max_depth=13, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print_metrics(y_test, y_pred)

In [None]:
# The model was trained on features standardized with the TRAINING statistics,
# so the test features must be transformed with those same statistics. A scaler
# fitted on the test set would shift and scale the inputs differently from
# what the model saw during training.
train_features = train_df.drop('price_doc', axis=1)
feature_scaler = StandardScaler().fit(train_features.to_numpy())
# Select the test columns in the training column order (raises KeyError if a
# column is missing) so every feature lines up with the one the model expects.
test_df_normalized = feature_scaler.transform(test_df[train_features.columns].to_numpy())

In [None]:
predicted_prices = regr.predict(test_df_normalized)
predicted_prices

In [None]:
my_submission = pd.DataFrame({'id': id_for_submission, 'price_doc': predicted_prices})
my_submission.to_csv('submission.csv', index=False)

# Эксперименты с другими моделями

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
# Evaluate KNN regression for K = 1..39 and record the hold-out R2 of each K.
k_values = range(1, 40)
r_squared = []
for i in k_values:
    knnregr = KNeighborsRegressor(n_neighbors=i)
    knnregr.fit(X_train,y_train)
    pred_i = knnregr.predict(X_test)
    r_squared.append(r2_score(y_test, pred_i))

plt.figure(figsize=(10,6))
plt.plot(k_values,r_squared,color='blue', linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('r2 vs. K Value')
plt.xlabel('K')
plt.ylabel('r2')
# r_squared is 0-indexed while K starts at 1: map the position of the best
# score back to its K instead of printing the raw list index.
best_r2 = max(r_squared)
best_k = k_values[r_squared.index(best_r2)]
print("Max r2:-",best_r2,"at K =",best_k)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=14)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
print_metrics(y_test, y_pred)