In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os 

!pip install ZipFile       
from zipfile import ZipFile 
os.listdir('../input/sberbank-russian-housing-market')

In [None]:
def _read_zipped_csv(zip_path, member, **read_csv_kwargs):
    """Load one CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_path : str
        Path to the ``.zip`` archive.
    member : str
        Name of the CSV file stored inside the archive.
    **read_csv_kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``member``.
    """
    # Both the archive and the member handle are closed on exit, including
    # when parsing raises; read_csv has consumed the data by then.
    with ZipFile(zip_path) as archive, archive.open(member) as handle:
        return pd.read_csv(handle, **read_csv_kwargs)


train_df = _read_zipped_csv("../input/sberbank-russian-housing-market/train.csv.zip", 'train.csv', parse_dates=['timestamp'])
test_df = _read_zipped_csv("../input/sberbank-russian-housing-market/test.csv.zip", 'test.csv', parse_dates=['timestamp'])
macro_df = _read_zipped_csv("../input/sberbank-russian-housing-market/macro.csv.zip", 'macro.csv', parse_dates=['timestamp'])

In [None]:
# Work on an independent copy; train_df itself is left as loaded.
df_train = train_df.copy(deep=True)

In [None]:
# Add log(1 + price_doc) as a new column next to the raw response.
df_train['price_doc_log'] = df_train['price_doc'].pipe(np.log1p)

In [None]:
# Peek at the first five rows of the working frame.
df_train.head(n=5)

In [None]:
# Only a cell's last expression is displayed, so the type has to be printed
# explicitly; the shape stays last and is shown as the cell output.
print(type(df_train))
df_train.shape

In [None]:
# Print pandas' concise summary of the working frame.
df_train.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df_train.describe().transpose()

In [None]:
# Boolean mask over columns: True where the column holds at least one null.
has_nulls = df_train.isna().any()
columns_with_missing_values = df_train.columns[has_nulls]

# Null count per affected column.
df_train.loc[:, columns_with_missing_values].isna().sum()

In [None]:
# Null count for every affected column, computed in one vectorised pass
missing_counts = df_train[columns_with_missing_values].isnull().sum()

# To hold variable names
labels = missing_counts.index.tolist()

# To hold the count of missing values for each variable
valuecount = missing_counts.tolist()

# To hold the percentage (0-100) of missing values for each variable;
# df_train.shape[0] will give the total row count
percentcount = (missing_counts / df_train.shape[0] * 100).tolist()


In [None]:
# One bar position per variable that has missing values.
ind = np.arange(len(labels))

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 36))

# Each panel is a horizontal bar chart: (axes, bar lengths, colour, x-axis label).
panels = [
    (ax1, valuecount, 'green', "Count of missing values"),
    (ax2, percentcount, 'pink', "Percentage of missing values"),
]
for axis, values, colour, xlabel in panels:
    rects = axis.barh(ind, np.array(values), color=colour)
    axis.set_yticks(ind)
    axis.set_yticklabels(labels, rotation='horizontal')
    axis.set_xlabel(xlabel)

# Both panels carry the same title.
ax1.set_title("Variables with missing values")
ax2.set_title("Variables with missing values")

In [None]:

# Colormap derived from seaborn's cubehelix palette (reversed, light=1).
cmap = sns.cubehelix_palette(as_cmap=True, reverse=True, light=1)

# Tall canvas, then draw the boolean null mask of the whole frame as a heatmap.
plt.figure(figsize=(18, 36))
null_mask = df_train.isnull()
sns.heatmap(null_mask, cmap=cmap)

The response variable (price_doc) represents the housing prices we're trying to predict. 
Let's start by drawing some graphs to visualize the distribution:

In [None]:
# Distribution of the raw response (price_doc): 50-bin histogram with a KDE overlay.
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(fig=f)
raw_prices = df_train['price_doc']
sns.histplot(raw_prices, kde=True, bins=50, ax=ax)
plt.show()

In [None]:
# Sorted price_doc values plotted against their position in the sorted order.
sorted_prices = np.sort(df_train['price_doc'].values)
plt.scatter(range(len(sorted_prices)), sorted_prices, color="red")
plt.xlabel('Index', fontsize=20)
plt.ylabel('Price', fontsize=20)
plt.show()

As expected, the distribution of housing prices is quite skewed. Let's log-transform the prices and draw the plots again.

In [None]:
# Distribution of the log-transformed response (price_doc_log): 50 bins plus KDE.
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(fig=f)
log_prices = df_train['price_doc_log']
sns.histplot(log_prices, kde=True, bins=50, ax=ax)
plt.show()

In [None]:
# Sorted price_doc_log values plotted against their position in the sorted order.
sorted_log_prices = np.sort(df_train['price_doc_log'].values)
plt.scatter(range(len(sorted_log_prices)), sorted_log_prices, color="red")
plt.xlabel('Index', fontsize=20)
# The plotted values are log1p(price_doc), so the axis must not read plain 'Price'.
plt.ylabel('Log Price', fontsize=20)
plt.show()

In [None]:
# Median sale price per calendar month. Every timestamp is truncated to the
# first day of its month so that all sales within one month share a group key
# (grouping on the raw timestamp would aggregate per day instead).
month_start = df_train['timestamp'].dt.to_period('M').dt.to_timestamp()
train_monthgrp = df_train.groupby(month_start)['price_doc'].median().reset_index()

plt.figure(figsize=(24, 16))
sns.lineplot(x="timestamp", y="price_doc", data=train_monthgrp)
plt.ylabel('Median Price', fontsize=18)
plt.xlabel('Year Month', fontsize=18)
plt.xticks(rotation='vertical')
plt.show()

And let's plot the response variable against some of the predictors.

In [None]:
# Price vs. Sq-Meter, keeping only the rows whose full_sq is below 1000.
below_1000_sq = df_train.loc[df_train['full_sq'] < 1000]
sns.regplot(data=below_1000_sq, x='full_sq', y='price_doc', color="red", fit_reg=False)
plt.show()

In [None]:
# Price vs. number of rooms: points only, no fitted regression line.
sns.regplot(
    data=df_train,
    x='num_room',
    y='price_doc',
    color="red",
    fit_reg=False,
)
plt.show()

Here is the count plot of the floor variable.

In [None]:
# Number of rows for each distinct value of `floor`.
plt.figure(figsize=(12, 8))
floor_ax = sns.countplot(data=df_train, x='floor')
floor_ax.set_ylabel('Count', fontsize=12)
floor_ax.set_xlabel('floor number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

The distribution is right-skewed. There are some drops in between (5 to 6, 9 to 10, 12 to 13, 17 to 18). Now let us see how the price changes with respect to floors.


In [None]:
# Median price_doc for each floor value. The built-in .median() replaces
# aggregate(np.median), which recent pandas versions flag as deprecated.
temp_df = df_train.groupby('floor')['price_doc'].median().reset_index()

plt.figure(figsize=(12, 10))
sns.pointplot(x='floor', y='price_doc', data=temp_df)
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Floor number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

First, there is a sudden increase at the 26th floor. After that, the sharpest increase in house price is observed at floor 33.

Let's now plot the total number of floors.

In [None]:
# Number of rows for each distinct value of `max_floor`.
plt.figure(figsize=(12, 8))
max_floor_ax = sns.countplot(data=df_train, x="max_floor")
max_floor_ax.set_ylabel('Count', fontsize=12)
max_floor_ax.set_xlabel('Max floor number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Median price_doc for each max_floor value. The built-in .median() replaces
# aggregate(np.median), which recent pandas versions flag as deprecated.
temp_df = df_train.groupby('max_floor')['price_doc'].median().reset_index()

plt.figure(figsize=(12, 8))
sns.pointplot(x="max_floor", y="price_doc", data=temp_df)
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Max Floor number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()