* A histogram is a plot that lets you discover and show the underlying frequency distribution of a set of continuous data. It is a useful tool for exploratory data analysis because it reveals the shape of the underlying probability distribution, outliers, skewness, and so on.

*

* To construct a histogram from a continuous variable we first split the data into intervals called **bins**.

In [None]:
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats


In [None]:
# Read the house-prices training CSV from the assets folder into a DataFrame.
house_price = pd.read_csv(filepath_or_buffer='assets/house_prices_train.csv')

In [None]:
# Pairwise correlations between the columns, with the 'Id' key dropped first.
# Numeric/bool columns are selected explicitly: on pandas >= 2.0,
# DataFrame.corr() no longer skips non-numeric columns on its own and raises
# if the frame contains any text columns.
house_price.drop('Id', axis = 1).select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
MSSubClass,1.0,-0.386347,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.022936,-0.069836,-0.065649,...,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407,-0.084284
LotFrontage,-0.386347,1.0,0.426095,0.251646,-0.059213,0.123349,0.088866,0.193458,0.233633,0.0499,...,0.088521,0.151972,0.0107,0.070029,0.041383,0.206167,0.003368,0.0112,0.00745,0.351799
LotArea,-0.139781,0.426095,1.0,0.105806,-0.005636,0.014228,0.013788,0.10416,0.214103,0.11117,...,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261,0.263843
OverallQual,0.032628,0.251646,0.105806,1.0,-0.091932,0.572323,0.550684,0.411876,0.239666,-0.059119,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
OverallCond,-0.059316,-0.059213,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.128101,-0.046231,0.040229,...,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,0.04395,-0.077856
YearBuilt,0.02785,0.123349,0.014228,0.572323,-0.375983,1.0,0.592855,0.315707,0.249503,-0.049107,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897
YearRemodAdd,0.040581,0.088866,0.013788,0.550684,0.073741,0.592855,1.0,0.179618,0.128451,-0.067759,...,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743,0.507101
MasVnrArea,0.022936,0.193458,0.10416,0.411876,-0.128101,0.315707,0.179618,1.0,0.264736,-0.072319,...,0.159718,0.125703,-0.110204,0.018796,0.061466,0.011723,-0.029815,-0.005965,-0.008201,0.477493
BsmtFinSF1,-0.069836,0.233633,0.214103,0.239666,-0.046231,0.249503,0.128451,0.264736,1.0,-0.050117,...,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.014359,0.38642
BsmtFinSF2,-0.065649,0.0499,0.11117,-0.059119,0.040229,-0.049107,-0.067759,-0.072319,-0.050117,1.0,...,0.067898,0.003093,0.036543,-0.029993,0.088871,0.041709,0.00494,-0.015211,0.031706,-0.011378


In [None]:
# Show the target next to the overall-quality rating, one row per house.
house_price[['SalePrice', 'OverallQual']]

Unnamed: 0,SalePrice,OverallQual
0,208500,7
1,181500,6
2,223500,7
3,140000,7
4,250000,8
...,...,...
1455,175000,6
1456,210000,6
1457,266500,7
1458,142125,5


In [None]:
# Show the target next to the total basement area, one row per house.
house_price.loc[:, ['SalePrice', 'TotalBsmtSF']]

Unnamed: 0,SalePrice,TotalBsmtSF
0,208500,856
1,181500,1262
2,223500,920
3,140000,756
4,250000,1145
...,...,...
1455,175000,953
1456,210000,1542
1457,266500,1152
1458,142125,1078


In [None]:
def plotting_charts(df, feature):
    """Draw a histogram, a Q-Q plot and a box plot of one column in one figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to plot.
    feature : str
        Label of the column in ``df`` to visualise.

    Returns
    -------
    None
        The figure is created through pyplot and left for the notebook
        backend to render.

    Notes
    -----
    Calls ``plt.style.use('fivethirtyeight')``, which changes the global
    matplotlib style for every figure drawn afterwards.
    """
    # `style` is never imported on its own in this notebook; reach it through
    # pyplot, which is imported as `plt`.
    plt.style.use('fivethirtyeight')

    # Select the column once and reuse it for all three panels.
    values = df.loc[:, feature]

    # A 12x8 figure carved into a 3x3 grid.
    fig = plt.figure(figsize = (12,8))
    grid = gridspec.GridSpec(ncols = 3, nrows = 3, figure = fig)

    # Top row, first two columns: histogram with a density curve.
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram of housing price')
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True, stat='density') when upgrading.
    sns.distplot(values, norm_hist=True, ax = ax1)

    # Second row, first two columns: Q-Q plot against the normal distribution.
    ax2 = fig.add_subplot(grid[1, :2])
    stats.probplot(values, plot = ax2)
    # probplot writes its own 'Probability Plot' title onto the axes, so the
    # custom title has to be applied after the call, not before it.
    ax2.set_title('Quantile-Quantile Plot')

    # Third column, spanning every row: vertical box plot.
    ax3 = fig.add_subplot(grid[:, 2])
    ax3.set_title('Box Plot')
    # Pass the values as `y=` explicitly; the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(y = values, orient = 'v', ax = ax3)
    
    
    
    

In [None]:
# Plot the raw sale prices straight from the loaded frame. The `y` this cell
# used to reference was not defined when it ran, which raised a NameError.
plotting_charts(house_price, 'SalePrice')

NameError: name 'y' is not defined

In [None]:
# Sale prices span a large range, so a log-transformation should mitigate the
# skewness of the variable.
# np.log1p computes log(1 + x); the original call was misspelled `np.loglp`.
# Deriving `y` from the loaded frame (rather than from a previous `y`) keeps
# this cell idempotent: re-running it never log-transforms the data twice.
y = np.log1p(house_price['SalePrice'])
plotting_charts(pd.DataFrame(y), 'SalePrice')