# Sberbank Russian Housing Market

Use the "Run" button to execute the code.

In [None]:
!pip install opendatasets --upgrade --quiet
!pip install scikit-learn --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install matplotlib --upgrade --quiet
!pip install seaborn --upgrade --quiet
!pip install numpy --upgrade --quiet

In [None]:
import opendatasets as od
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import zipfile

# Downloading Dataset and EDA

In [None]:
# Copy each competition archive from the input directory into the working
# directory, unpack it there, then load the extracted CSV files.
for zf in ['macro.csv.zip','train.csv.zip','test.csv.zip', 'sample_submission.csv.zip']:
    # Context managers close the handles even if a read, write or extraction
    # fails part-way through (the manual close() calls leaked on error).
    with open('../input/sberbank-russian-housing-market/'+zf,'rb') as input_file, \
            open(zf,'wb') as output_file:
        output_file.write(input_file.read())
    with zipfile.ZipFile(zf,'r') as zipf:
        zipf.extractall('')
macro=pd.read_csv('macro.csv')
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Print a summary of the macro frame (columns, dtypes, non-null counts).
macro.info()

In [None]:
# Print a summary of the test frame (columns, dtypes, non-null counts).
test.info()

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show the first rows of the test frame.
test.head()

# Preparing the Dataset

The train and test data are divided on the basis of timestamps. First, the input and target columns need to be determined.

In [None]:
# Input columns: every training column except the first and the last one
input_col = train.columns[1:-1].tolist()

In [None]:
# Name of the target column (a single column name, not a list)
target_col='price_doc'

In [None]:
# Work on a copy of the input columns so later edits do not modify `train`.
inputs=train[input_col].copy()

In [None]:
# Target values taken from the training frame.
target=train[target_col]

Dividing the data into Numeric and Categorical types

In [None]:
# Names of the input columns stored as int64 or float64.
numeric_cols=inputs.select_dtypes(include=['int64','float64']).columns.tolist()

In [None]:
# Names of the input columns stored with object dtype.
categorical_cols=inputs.select_dtypes(include=['object']).columns.tolist()

# Data Cleaning

In [None]:
# For the training inputs: count missing values per numeric column,
# most-missing first, and display only the columns that have any.
missing_counts = inputs[numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts[missing_counts>0]

In [None]:
from sklearn.impute import SimpleImputer
# Learn each numeric column's mean from the training data ...
imputer = SimpleImputer(strategy='mean')
imputer.fit(train[numeric_cols])
# ... and use those means to fill the missing numeric values in `inputs`.
inputs[numeric_cols] = imputer.transform(inputs[numeric_cols])

In [None]:
# Display the inputs frame after imputation.
inputs

The values in build_year are unrealistically high, so the column needs to be dropped. The timestamp column is dropped as well.

In [None]:
# Remove both unwanted columns from `inputs` in a single in-place call.
inputs.drop(columns=['build_year', 'timestamp'], inplace=True)

In [None]:
# Keep the column-name lists in sync with the columns dropped from `inputs`.
numeric_cols.remove('build_year')
categorical_cols.remove('timestamp')

In [None]:
# Re-count missing numeric values after imputation and display any columns
# that still contain NaNs (an empty result means the imputation covered all).
missing_counts = inputs[numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts[missing_counts>0]


# Scaling of Numerical values

In [None]:
# Show the minimum and maximum of every numeric column before scaling.
inputs[numeric_cols].describe().loc[['min', 'max']]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Build the scaler and learn each numeric column's range from the training data
scaler = MinMaxScaler().fit(train[numeric_cols])
# Replace the numeric columns of `inputs` with their rescaled values
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])


In [None]:
# Show the minimum and maximum of every numeric column after scaling.
inputs[numeric_cols].describe().loc[['min', 'max']]

# Encoding Categorical Columns

In [None]:
# Number of distinct values in each categorical column, largest first.
inputs[categorical_cols].nunique().sort_values(ascending=False)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Create the encoder. `sparse_output` replaces the old `sparse` keyword, which
# no longer exists in current scikit-learn (the notebook upgrades scikit-learn
# in its first cell, so the old spelling raises a TypeError).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit the encoder to the categorical columns
encoder.fit(train[categorical_cols])
# Generate a column name for each category; get_feature_names_out() is the
# replacement for the removed get_feature_names().
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
# Transform and add the new one-hot category columns
inputs[encoded_cols] = encoder.transform(train[categorical_cols])


In [None]:
# Display the inputs frame after the one-hot columns were added.
inputs

# Training and validation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Randomly hold out 25% of the rows for validation; the fixed seed makes the
# split reproducible.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs[numeric_cols + encoded_cols],
    target,
    test_size=0.25,
    random_state=31,
)

In [None]:
# Show the first rows of the training inputs.
train_inputs.head()

In [None]:
# Show the first rows of the validation inputs.
val_inputs.head()

In [None]:
# Show the first training targets.
train_targets.head()

In [None]:
# Show the first validation targets.
val_targets.head()

# Linear Regression Model

In [None]:
from sklearn.linear_model import Ridge
# Create the model (the duplicated `model = model = ...` assignment was a typo)
model = Ridge()
# Fit the model using inputs and targets
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)

# Evaluating the model

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Select the model's feature columns from the training and validation inputs.
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]

In [None]:
# Predict on the training features and display the predictions.
train_preds = model.predict(X_train)
train_preds

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
print('The RMSE loss for the training set is ₽ {}.'.format(np.sqrt(mean_squared_error(train_targets, train_preds))))

In [None]:
# Predict on the validation features and display the predictions.
val_preds = model.predict(X_val)
val_preds

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
# The message previously misspelled "validation".
print('The RMSE loss for the validation set is ₽ {}.'.format(np.sqrt(mean_squared_error(val_targets, val_preds))))

# Feature Importance

In [None]:
# Coefficients learned by the fitted linear model.
weights=model.coef_

In [None]:
# Pair every input column with its learned weight, largest weight first.
weights_df = pd.DataFrame({'feature': train_inputs.columns, 'weight': weights})
weights_df = weights_df.sort_values('weight', ascending=False)

In [None]:
# Bar chart of the first ten rows of weights_df (weight on x, feature on y).
plt.title('Feature Importance')
sns.barplot(data=weights_df.head(10), x='weight', y='feature')

# Test Predictions

In [None]:
# Every test column except the first one is a candidate model input.
test_input_cols = test.columns[1:].tolist()

# Copy the inputs, then split their column names by dtype.
test_inputs_df = test[test_input_cols].copy()
test_numeric_cols = (
    test_inputs_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
)
test_categorical_cols = (
    test_inputs_df.select_dtypes(include=['object']).columns.tolist()
)

In [None]:
# Count missing values per numeric test column, most-missing first.
missing_counts = test_inputs_df[test_numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts

In [None]:
# Reuse the imputer that was fitted on the training data so test rows are
# filled with the same column means used for the rows the model was trained
# on. Fitting a second imputer on the test set made train and test
# preprocessing inconsistent.
test_imputer = imputer
test_inputs_df[test_numeric_cols] = test_imputer.transform(test_inputs_df[test_numeric_cols])
# Drop the same columns that were dropped from the training inputs and keep
# the column-name lists in sync.
test_inputs_df.drop('build_year', inplace=True, axis=1)
test_inputs_df.drop('timestamp', inplace=True, axis=1)
test_numeric_cols.remove('build_year')
test_categorical_cols.remove('timestamp')

In [None]:
# Scale with the ranges learned from the training data. Re-fitting the scaler
# on the test set would put test features on a different scale from the one
# the model was trained on (and would overwrite the training-fitted scaler).
test_inputs_df[test_numeric_cols] = scaler.transform(test_inputs_df[test_numeric_cols])
# Values outside [0, 1] are possible here: they mark test values beyond the
# range seen in training.
test_inputs_df[test_numeric_cols].describe().loc[['min', 'max']]

In [None]:
# Encode with the encoder fitted on the training data so the one-hot columns
# (and their order) match the columns the models were trained on. Re-fitting
# on the test set could produce a different set of category columns.
# Categories not seen in training encode as all zeros (handle_unknown='ignore').
test_encoded_cols = list(encoded_cols)
test_inputs_df[test_encoded_cols] = encoder.transform(test[test_categorical_cols])

In [None]:
# Feature matrix for the test set: numeric columns followed by one-hot columns.
X_test = test_inputs_df[test_numeric_cols + test_encoded_cols]

In [None]:
# Predict test-set prices with the fitted linear model.
test_preds = model.predict(X_test)

In [None]:
# Print the array of test predictions from the linear model.
print('The test predictions are ₽ {}.'.format(test_preds))

In [None]:
# Load the sample submission file to use as a template.
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Overwrite the template's price column with the linear model's predictions.
submission_df['price_doc'] = test_preds

In [None]:
# Write the submission without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree regressor with a fixed random seed.
tree = DecisionTreeRegressor(random_state=41)

In [None]:
# Fit the tree on the training split.
tree.fit(train_inputs, train_targets)

In [None]:
# Tree predictions on the training split.
tree_train_preds = tree.predict(train_inputs)

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
tree_train_rmse = np.sqrt(mean_squared_error(train_targets, tree_train_preds))

In [None]:
# Tree predictions on the validation split.
tree_val_preds = tree.predict(val_inputs)

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
tree_val_rmse = np.sqrt(mean_squared_error(val_targets, tree_val_preds))

In [None]:
# Report the decision tree's training and validation RMSE side by side.
print('Train RMSE: {}, Validation RMSE: {}'.format(tree_train_rmse, tree_val_rmse))

# Visualizing the decision Tree

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
# Draw the top three levels of the fitted tree on a wide canvas.
# The trailing semicolon suppresses the cell's text output.
plt.figure(figsize=(30, 15))
plot_tree(
    tree,
    max_depth=3,
    feature_names=list(train_inputs.columns),
    filled=False,
    rounded=True,
);

In [None]:
# Render the fitted tree as text using export_text
tree_text = export_text(tree)

In [None]:
# Display the first 1000 characters of the text rendering
print(tree_text[:1000])

In [None]:
# Pair every input column with the importance the tree assigned to it,
# most important first, and display the table.
tree_importances = tree.feature_importances_
tree_importance_df = pd.DataFrame(
    {'feature': train_inputs.columns, 'importance': tree_importances}
)
tree_importance_df = tree_importance_df.sort_values('importance', ascending=False)
tree_importance_df

In [None]:
# Bar chart of the first ten rows of tree_importance_df; the trailing
# semicolon suppresses the cell's text output.
plt.title('Decision Tree Feature Importance')
sns.barplot(data=tree_importance_df.head(10), x='importance', y='feature');

# Random Forest Decision Tree

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Create the model (all CPU cores, fixed random seed)
rf1 = RandomForestRegressor(n_jobs=-1, random_state=21)
# Fit the model
rf1.fit(train_inputs,train_targets)
# Cell output: the model's score on the training split
rf1.score(train_inputs,train_targets)

In [None]:
# Random forest predictions on the training split.
rf1_train_preds=rf1.predict(train_inputs)

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
rf1_train_rmse = np.sqrt(mean_squared_error(train_targets, rf1_train_preds))

In [None]:
# Random forest predictions on the validation split.
rf1_val_preds=rf1.predict(val_inputs)

In [None]:
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
rf1_val_rmse = np.sqrt(mean_squared_error(val_targets, rf1_val_preds))

In [None]:
# Report the first random forest's training and validation RMSE.
print('Train RMSE: {}, Validation RMSE: {}'.format(rf1_train_rmse, rf1_val_rmse))

# Hyperparameter Tuning

In [None]:
def test_params(**params):
    """Train a random forest with the given hyperparameters and score it.

    Args:
        **params: Keyword arguments forwarded to ``RandomForestRegressor``
            on top of the fixed ``random_state=21`` and ``n_jobs=-1``.

    Returns:
        tuple: ``(train_rmse, val_rmse)``, the RMSE on the training split
        (``train_inputs`` / ``train_targets``) and on the validation split
        (``val_inputs`` / ``val_targets``).
    """
    forest = RandomForestRegressor(random_state=21, n_jobs=-1, **params)
    forest.fit(train_inputs, train_targets)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error no longer exists in current scikit-learn.
    # Arguments are passed as (y_true, y_pred), matching the rest of the file.
    train_rmse = np.sqrt(mean_squared_error(train_targets, forest.predict(train_inputs)))
    val_rmse = np.sqrt(mean_squared_error(val_targets, forest.predict(val_inputs)))
    return train_rmse, val_rmse

In [None]:
def test_param_plot(param_name, param_values):
    """Plot training and validation RMSE across values of one hyperparameter.

    Args:
        param_name: Name of the hyperparameter, passed to ``test_params`` as
            a keyword.
        param_values: Values to try, in order; also used as the x-axis.
    """
    # One (train_rmse, val_rmse) pair per candidate value, in input order.
    scores = [test_params(**{param_name: candidate}) for candidate in param_values]
    train_errors = [train_rmse for train_rmse, _ in scores]
    val_errors = [val_rmse for _, val_rmse in scores]

    plt.figure(figsize=(10, 6))
    plt.title('Overfitting curve: ' + param_name)
    # Blue = training error, red = validation error.
    for errors, line_style in ((train_errors, 'b-o'), (val_errors, 'r-o')):
        plt.plot(param_values, errors, line_style)
    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])

In [None]:
# Plot train/validation RMSE for forests of 10 to 70 trees.
test_param_plot('n_estimators', [10,20,30,40,50,60,70])

In [None]:
# Plot train/validation RMSE for maximum tree depths of 10 to 30.
test_param_plot('max_depth', [ 10, 15, 20, 25, 30])

# Making Predictions

In [None]:
# Random forest with a fixed max_depth and n_estimators, and a fixed seed.
rf2 = RandomForestRegressor(n_jobs=-1, max_depth = 15 , n_estimators = 30, random_state=21)

In [None]:
# Fit the model on the training split
rf2.fit(train_inputs,train_targets)

In [None]:
# Predictions and RMSE of the second forest on the training split.
rf2_train_preds = rf2.predict(train_inputs)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
rf2_train_rmse = np.sqrt(mean_squared_error(train_targets, rf2_train_preds))

In [None]:
# Predictions and RMSE of the second forest on the validation split.
rf2_val_preds = rf2.predict(val_inputs)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error no longer exists in current scikit-learn.
rf2_val_rmse = np.sqrt(mean_squared_error(val_targets, rf2_val_preds))

In [None]:
# Report the second random forest's training and validation RMSE.
print('Train RMSE: {}, Validation RMSE: {}'.format(rf2_train_rmse, rf2_val_rmse))

# Test Prediction

In [None]:
# Predict test-set prices with the second random forest.
rf2_test_preds = rf2.predict(X_test)

In [None]:
# Print the random forest's test predictions (this cell previously printed
# `test_preds`, the linear model's predictions).
print('The test predictions are ₽ {}.'.format(rf2_test_preds))

In [None]:
# Build and write the random-forest submission. Previously `submission_df2`
# was loaded but never used, and the linear model's `test_preds` were written
# to submission2.csv through `submission_df`.
submission_df2 = pd.read_csv('sample_submission.csv')
submission_df2['price_doc'] = rf2_test_preds
submission_df2.to_csv('submission2.csv', index=False)

# References

Dataset :https://www.kaggle.com/c/sberbank-russian-housing-market/overview

Notebook reference: https://jovian.ai/aakashns/sklearn-decision-trees-random-forests
                    https://jovian.ai/shlok-ramteke24/python-random-forests-assignment