In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sberbank-realty-data-2/data_dictionary.txt
/kaggle/input/sberbank-realty-data-2/sample_submission.csv/sample_submission.csv
/kaggle/input/sberbank-realty-data-2/macro.csv/macro.csv
/kaggle/input/sberbank-realty-data-2/train.csv/train.csv
/kaggle/input/sberbank-realty-data-2/test.csv/test.csv


# Problem Description

Housing costs demand a significant investment from both consumers and developers. And when it comes to planning a budget—whether personal or corporate—the last thing anyone needs is uncertainty about one of their biggets expenses. Sberbank, Russia’s oldest and largest bank, helps their customers by making predictions about realty prices so renters, developers, and lenders are more confident when they sign a lease or purchase a building.

Although the housing market is relatively stable in Russia, the country’s volatile economy makes forecasting prices as a function of apartment characteristics a unique challenge. Complex interactions between housing features such as number of bedrooms and location are enough to make pricing predictions complicated. Adding an unstable economy to the mix means Sberbank and their customers need more than simple regression models in their arsenal.

In this competition, Sberbank is challenging Kagglers to develop algorithms which use a broad spectrum of features to predict realty prices. Competitors will rely on a rich dataset that includes housing data and macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to their customers in an uncertain economy.

## Setting up

In [2]:
!pip install opendatasets --upgrade --quiet
!pip install scikit-learn --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install matplotlib --upgrade --quiet
!pip install seaborn --upgrade --quiet
!pip install numpy --upgrade --quiet

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.1.0 requires markupsafe~=2.0.1, but you have markupsafe 2.1.1 which is incompatible.[0m[31m
[0m

In [3]:
import opendatasets as od
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import zipfile

## Getting Dataset

In [4]:
for zf in ['macro.csv.zip','train.csv.zip','test.csv.zip', 'sample_submission.csv.zip']:
    input_file=open('.../input/sberbank-realty-data-2/'+zf,'rb')
    output_file=open(zf,'wb')
    output_file.write(input_file.read())
    output_file.close(); input_file.close()
    zipf=zipfile.ZipFile(zf,'r')
    zipf.extractall(''); zipf.close()
macro=pd.read_csv('macro.csv')
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

FileNotFoundError: [Errno 2] No such file or directory: '.../input/sberbank-realty-data-2/macro.csv.zip'

In [None]:
train.info()

In [None]:
macro.info()

In [None]:
test.head()

## Data Preparation

In [None]:
# Creating a list of input columns
input_col=list(train.columns)[1:-1]

# Creating a list of target column
target_col='price_doc'

inputs=train[input_col].copy()

target=train[target_col]

# Dividing data into numeric and categorical values
numeric_cols=inputs.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols=inputs.select_dtypes(include=['object']).columns.tolist()



## Data Cleaning

In [None]:
# For train inputs
missing_counts = inputs[numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts[missing_counts>0]

In [None]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='mean').fit(train[numeric_cols])
inputs[numeric_cols]=imputer.transform(inputs[numeric_cols])

inputs

In [None]:
# Drop build_year and timestamp
inputs.drop('build_year', inplace=True, axis=1)
inputs.drop('timestamp', inplace=True, axis=1)

numeric_cols.remove('build_year')
categorical_cols.remove('timestamp')

missing_counts = inputs[numeric_cols].isna().sum().sort_values(ascending=False)

## Scale numerical values

In [None]:
inputs[numeric_cols].describe().loc[['min', 'max']]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Create the scaler
scaler = MinMaxScaler()
# Fit the scaler to the numeric columns
scaler.fit(train[numeric_cols])
# Transform and replace the numeric columns
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])

In [None]:
inputs[numeric_cols].describe().loc[['min', 'max']]

## Encode categorical values

In [None]:
inputs[categorical_cols].nunique().sort_values(ascending=False)

In [None]:
from sklearn.preprocessing import OneHotEncoder
#Creating the encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
#Fitting the encoder to the categorical colums
encoder.fit(train[categorical_cols])
#Generating column names for each category
encoded_cols = list(encoder.get_feature_names(categorical_cols))
# Transforming and adding new one-hot category columns
inputs[encoded_cols] = encoder.transform(train[categorical_cols])

In [None]:
inputs

# Train and validate models

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs[numeric_cols + encoded_cols], 
                                                                        target, 
                                                                        test_size=0.25, 
                                                                        random_state=31)

In [None]:
train_inputs.head()

In [None]:
val_inputs.head()

In [None]:
train_targets.head()

In [None]:
val_targets.head()

## Linear regression

In [None]:
from sklearn.linear_model import Ridge
# Create the model
model = model = Ridge()
# Fit the model using inputs and targets
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)

## Model evaluation

In [None]:
from sklearn.metrics import mean_squared_error

X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]

train_preds = model.predict(X_train)
train_preds

In [None]:
print('The RMSE loss for the training set is ₽ {}.'.format(mean_squared_error(train_targets, train_preds, squared=False)))

In [None]:
val_preds = model.predict(X_val)
val_preds

In [None]:
print('The RMSE loss for the valication set is ₽ {}.'.format(mean_squared_error(val_targets, val_preds, squared=False)))

## Importance of features

In [None]:
weights=model.coef_

weights_df = pd.DataFrame({
    'feature': train_inputs.columns,
    'weight': weights
}).sort_values('weight', ascending=False)

plt.title('Feature Importance')
sns.barplot(data=weights_df.head(10), x='weight', y='feature')

## Test predictions

In [None]:
test_input_cols = list(test.columns)[1:]

test_inputs_df = test[test_input_cols].copy()
test_numeric_cols = test_inputs_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
test_categorical_cols =  test_inputs_df.select_dtypes(include=['object']).columns.tolist()

missing_counts = test_inputs_df[test_numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts

In [None]:
test_imputer = SimpleImputer(strategy = 'mean').fit(test[test_numeric_cols])
test_inputs_df[test_numeric_cols] = test_imputer.transform(test_inputs_df[test_numeric_cols])
test_inputs_df.drop('build_year', inplace=True, axis=1)
test_inputs_df.drop('timestamp', inplace=True, axis=1)
test_numeric_cols.remove('build_year')
test_categorical_cols.remove('timestamp')

scaler.fit(test[test_numeric_cols])
test_inputs_df[test_numeric_cols] = scaler.transform(test_inputs_df[test_numeric_cols])
test_inputs_df[test_numeric_cols].describe().loc[['min', 'max']]

In [None]:
encoder.fit(test[test_categorical_cols])
test_encoded_cols = list(encoder.get_feature_names(test_categorical_cols))
test_inputs_df[test_encoded_cols] = encoder.transform(test[test_categorical_cols])

In [None]:
X_test = test_inputs_df[test_numeric_cols + test_encoded_cols]

In [None]:
test_preds = model.predict(X_test)

In [None]:
print('The test predictions are ₽ {}.'.format(test_preds))

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(random_state=41)
tree.fit(train_inputs, train_targets)
tree_train_preds = tree.predict(train_inputs)

tree_train_rmse = mean_squared_error(train_targets,tree_train_preds, squared=False )
tree_val_preds = tree.predict(val_inputs)
tree_val_rmse = mean_squared_error(val_targets,tree_val_preds, squared=False )
print('Train RMSE: {}, Validation RMSE: {}'.format(tree_train_rmse, tree_val_rmse))

## Visualize decision tree

In [None]:
from sklearn.tree import plot_tree, export_text

plt.figure(figsize=(30,15))

# Visualize the tree graphically using plot_tree
plot_tree(tree,max_depth=3,feature_names=train_inputs.columns, filled=False, rounded=True);

In [None]:
# Check feature importance
tree_importances = tree.feature_importances_
tree_importance_df = pd.DataFrame({
    'feature': train_inputs.columns,
    'importance': tree_importances
}).sort_values('importance', ascending=False)
tree_importance_df

In [None]:
plt.title('Decision Tree Feature Importance')
sns.barplot(data=tree_importance_df.head(10), x='importance', y='feature');

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix

# Create the model
rf1 = RandomForestRegressor(n_jobs=-1, random_state=21)
# Fit the model
rf1.fit(train_inputs,train_targets)
rf1.score(train_inputs,train_targets)

In [None]:
rf1_train_preds=rf1.predict(train_inputs)

rf1_train_rmse= mean_squared_error(train_targets,rf1_train_preds, squared=False )

rf1_val_preds=rf1.predict(val_inputs)

rf1_val_rmse= mean_squared_error(val_targets,rf1_val_preds, squared=False )

print('Train RMSE: {}, Validation RMSE: {}'.format(rf1_train_rmse, rf1_val_rmse))

## Tune hyperparameters

In [None]:
def test_params(**params):
    model = RandomForestRegressor(random_state=21, n_jobs=-1, **params).fit(train_inputs, train_targets)
    train_rmse = mean_squared_error(model.predict(train_inputs), train_targets, squared=False)
    val_rmse = mean_squared_error(model.predict(val_inputs), val_targets, squared=False)
    return train_rmse, val_rmse

In [None]:
def test_param_plot(param_name, param_values):
    train_errors, val_errors = [], [] 
    for value in param_values:
        params = {param_name: value}
        train_rmse, val_rmse = test_params(**params)
        train_errors.append(train_rmse)
        val_errors.append(val_rmse)
    plt.figure(figsize=(10,6))
    plt.title('Overfitting curve: ' + param_name)
    plt.plot(param_values, train_errors, 'b-o')
    plt.plot(param_values, val_errors, 'r-o')
    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])

In [None]:
test_param_plot('n_estimators', [10,20,30,40,50,60,70])

In [None]:
test_param_plot('max_depth', [ 10, 15, 20, 25, 30])

## Predictions

In [None]:
rf2 = RandomForestRegressor(n_jobs=-1, max_depth = 15 , n_estimators = 30, random_state=21)

In [None]:
# Fit the model
rf2.fit(train_inputs,train_targets)

In [None]:
rf2_train_preds = rf2.predict(train_inputs)
rf2_train_rmse =  mean_squared_error(train_targets,rf2_train_preds, squared=False )

In [None]:
rf2_val_preds = rf2.predict(val_inputs)
rf2_val_rmse= mean_squared_error(val_targets,rf2_val_preds, squared=False )

In [None]:
print('Train RMSE: {}, Validation RMSE: {}'.format(rf2_train_rmse, rf2_val_rmse))

## Test prediction

In [None]:
rf2_test_preds = rf2.predict(X_test)
print('The test predictions are ₽ {}.'.format(test_preds))