In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the data and
# plotting libraries used in the cells below.
warnings.filterwarnings(action = 'ignore')

from scipy.stats import norm

# Read the train and test CSV files into DataFrames (train first, then test).
train_csv_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_csv_path = '../input/house-prices-advanced-regression-techniques/test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [3]:
def check_columns(df1, df2):
    """Print the column labels that are present in only one of two frames.

    Parameters
    ----------
    df1, df2
        Objects exposing an iterable ``columns`` attribute (e.g. DataFrames).

    Returns
    -------
    None
        Two lines are printed: the labels found only in ``df1``, then the
        labels found only in ``df2``. Each is shown as a Python set.
    """
    only_in_df1 = set(df1.columns).difference(df2.columns)
    only_in_df2 = set(df2.columns).difference(df1.columns)
    print('Diff in columns df1: ', only_in_df1)
    print('Diff in columns df2: ', only_in_df2)

In [4]:
# Print which columns exist in only one of the train / test frames.
check_columns(df_train, df_test)

In [5]:
# Summary statistics of the SalePrice column; as the cell's last expression it is displayed.
df_train['SalePrice'].describe()

In [6]:
sns.set_style('white')
sns.set_color_codes(palette = 'pastel')

f, ax = plt.subplots(figsize = (18,9))

# Fit the normal parameters first so they can label the fitted curve itself.
# The label is a raw string: '\m' and '\s' are not valid Python escapes.
mu, sigma = norm.fit(df_train['SalePrice'])
fit_label = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)

# Histogram + KDE of the prices with the fitted normal pdf overlaid.
# Passing the label through fit_kws binds the legend entry to the fit line;
# plt.legend([label]) would attach it to whichever artist was drawn first.
sns.distplot(a = df_train['SalePrice'], fit = norm, color = 'blue', ax = ax,
             fit_kws = {'label': fit_label})

ax.legend(loc = 'best')
# The histogram is density-normalised when a fit/KDE is drawn, so the y axis
# shows a density rather than a frequency count.
ax.set(xlabel = 'Price distribution',
       ylabel = 'Density',
       title = 'Normal distribution of sale price')

sns.despine(left = True, trim = True)

plt.show()

In [7]:
# Distribution of the natural log of SalePrice, drawn on an explicit axes.
f, ax = plt.subplots(figsize = (18, 9))
sns.distplot(np.log(df_train['SalePrice']), color = 'blue', ax = ax)
# Label the x axis explicitly: the plotted values are logs, not raw prices.
ax.set(xlabel = 'log(SalePrice)',
       ylabel = 'Density',
       title = 'Log distribution of prices to avoid outliers')
plt.show()

In [44]:
f, ax = plt.subplots(figsize = (18, 6))

# Percentage of missing entries per column (rounded to 2 decimals), keeping
# only columns at or above 0.05 %, sorted ascending for the bar chart.
missing = round(df_train.isnull().mean() * 100, 2)
missing = missing[missing >= 0.05].sort_values()

missing.plot.bar(color = 'tomato', ax = ax)

# Reference line at 50 %; axhline spans the whole axes width regardless of
# how many bars are drawn.
ax.axhline(50, color = 'red', linestyle = '-')
ax.text(0.5, 55, 'Over 50% of missing data')

# The plotted values are percentages, not counts.
ax.set(xlabel = 'Missing values each variable',
       ylabel = 'Percentage of missing values',
       title = 'Missing values')

plt.show()

In [51]:
f, ax = plt.subplots(figsize = (30, 30))

# Correlate numeric (and boolean) columns only. Calling .corr() on a frame
# that still holds object columns raises on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
numeric_columns = df_train.select_dtypes(include = ['number', 'bool'])
corr_matrix = numeric_columns.corr(method = 'pearson')

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr_matrix, dtype = bool))
cmap = sns.diverging_palette(250, 0, as_cmap = True)

sns.heatmap(corr_matrix, mask = mask, cmap = cmap, vmax = 1, center = 0, annot = True,
            square = True, linewidths = .5, cbar_kws = {'shrink': .5}, ax = ax)

plt.show()

In [10]:
def plot_correlation(dataframe, target_column, feature_column):
    """Draw box, violin and strip plots of a target against a feature, side by side.

    Parameters
    ----------
    dataframe
        Data passed to each seaborn plotter via ``data=``.
    target_column
        Column name plotted on the y axis of every panel.
    feature_column
        Column name plotted on the x axis of every panel.

    Returns
    -------
    None
        A single 1x3 figure is created and shown.
    """
    f, ax = plt.subplots(1, 3, figsize = (18, 6))
    plotters = (sns.boxplot, sns.violinplot, sns.stripplot)
    # One plotter per panel, left to right, all with the same arguments.
    for draw, panel in zip(plotters, ax):
        draw(data = dataframe, x = feature_column, y = target_column, ax = panel, palette = 'pastel')

    plt.show()

In [11]:
# Box / violin / strip plots of SalePrice for each OverallQual value.
plot_correlation(df_train, 'SalePrice', 'OverallQual')

In [13]:
# Box / violin / strip plots of SalePrice for each GarageCars value.
plot_correlation(df_train, 'SalePrice', 'GarageCars')

In [17]:
# Bar chart of the median SalePrice for each YrSold value.
f, ax = plt.subplots(figsize = (18, 6))

sns.barplot(data = df_train, x = 'YrSold', y = 'SalePrice',
            estimator = np.median, palette = 'pastel', ax = ax)
ax.set(
    xlabel = 'Year Sold',
    ylabel = 'Median of Sale Price',
    title = 'Median of Sale Price vs Year Sold',
)

plt.show()

In [30]:
# Scatter plot of SalePrice against YearBuilt on an explicit axes.
f, ax = plt.subplots(figsize = (18, 9))

sns.scatterplot(x = 'YearBuilt', y = 'SalePrice', data = df_train, color = 'blue', ax = ax)
ax.set(xlabel = 'Year of construction',
       ylabel = 'Sale price',
       title = 'Sale price vs year of construction')

plt.show()

In [37]:
# Rows of the correlation matrix whose correlation with SalePrice exceeds 0.50.
# NOTE(review): only positive correlations pass this filter; a strongly
# negative feature would be dropped. Confirm that is intended.
df_relevant = corr_matrix[corr_matrix['SalePrice'] > 0.50]
df_relevant_list = df_relevant.index.values

# Keep only the selected columns that each frame actually has. The explicit
# .copy() makes the subsets independent frames, so later assignments to them
# do not trigger chained-assignment warnings against df_train / df_test.
df_train2 = df_train[df_train.columns.intersection(df_relevant_list)].copy()
df_test2 = df_test[df_test.columns.intersection(df_relevant_list)].copy()

# Preview the first rows instead of dumping the whole frame.
df_test2.head()

In [47]:
def _missing_share_table(frame, share_column):
    """Build a per-column table of missing-value share plus a 'Useful' flag.

    The share is the fraction (0-1) of missing entries in each column of
    ``frame``, stored under ``share_column``. 'Useful' is 'No' when that
    fraction is at least 0.50 and 'Yes' otherwise.
    """
    table = pd.DataFrame(frame.isna().mean(), columns = [share_column])
    table['Useful'] = np.where(table[share_column] >= 0.50, 'No', 'Yes')
    return table


# Same summary for the reduced training frame and for the full training frame.
df_nan = _missing_share_table(df_train2, '% of missing data')
df_total_na = _missing_share_table(df_train, '% missing data')

df_total_na