# EDA of Boston Housing Price Prediction Dataset

### What is EDA?

Exploratory Data Analysis (EDA) is a set of techniques used to understand a data set before modelling it. Its goals are to:

- maximize insight into a data set
- uncover underlying structure
- extract important variables
- detect outliers and anomalies
- test underlying assumptions
- determine optimal factor settings

Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

# Data Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pickle as pkl

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('boston_housing.pkl', 'rb') as pkl_file:
    boston = pkl.load(pkl_file)

In [None]:
type(boston)

In [None]:
boston.keys()

In [None]:
print(boston.DESCR)

In [None]:
boston['feature_names']

In [None]:
data = boston.data

In [None]:
data.shape

In [None]:
# Wrap the raw feature array in a DataFrame whose columns are labelled with
# the feature names, then display it.
data = pd.DataFrame(data = data, columns=boston.feature_names)
data

In [None]:
# Append the target values as a 'Price' column so that features and target
# live in one frame for the plots below.
data['Price'] = boston.target
data.head()

# Understand Your Data and Plot Style Setting 

In [None]:
data.describe()

In [None]:
# Keep the summary table in a variable so single rows of it (here the
# per-column mean) can be reused by the bar charts further down.
data_desc = data.describe()
data_desc.loc['mean']

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
data_desc.loc['mean'].plot.bar()

# Plot Styling 

In [None]:
print(plt.style.available)

In [None]:
len(plt.style.available)

In [None]:
# Render the same bar chart once per available matplotlib style and save each
# rendering as plots/<style>.png for side-by-side comparison.
import os

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)

for style in plt.style.available:
    # style.context applies the style only inside the `with` block, so the
    # settings of one style do not carry over into the next iteration or into
    # later cells.
    with plt.style.context(style):
        # A fresh figure per style keeps each saved image limited to its own
        # chart instead of stacking bars on previously used axes.
        fig, ax = plt.subplots()
        data_desc.loc['mean'].plot.bar(ax=ax)
        ax.set_title(style)
        fig.savefig('plots/' + style + ".png")
        # Close the figure once saved so open figures do not pile up;
        # the images are meant to be viewed from the plots/ directory.
        plt.close(fig)

In [None]:
plt.style.use('ggplot')
data_desc.loc['mean'].plot.bar()

# Pair Plot 

In [None]:
import seaborn as sns

In [None]:
data.head()

In [None]:
sns.pairplot(data)

In [None]:
# Pair plot of the columns at positions 0-6, with KDE curves on the diagonal.
first_block = list(range(0, 7))
sns.pairplot(data.iloc[:, first_block], diag_kind='kde')

In [None]:
# Pair plot of column 0 together with the columns at positions 7-13,
# with KDE curves on the diagonal.
second_block = [0, *range(7, 14)]
sns.pairplot(data.iloc[:, second_block], diag_kind='kde')

# Distribution Plot

In [None]:
# Distribution of every column, one subplot per column.
cols = 7
col = data.columns
# Ceiling division: as many rows as needed to give every column a panel
# (at least one row so the grid can always be created).
rows = max(1, -(-len(col) // cols))

# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 2 * rows), squeeze=False)

panels = ax.ravel()
for panel, name in zip(panels, col):
    # NOTE(review): distplot is flagged as deprecated by newer seaborn
    # releases; histplot(..., kde=True) is the suggested replacement once the
    # minimum supported seaborn version allows it.
    sns.distplot(data[name], ax=panel)

# Hide grid cells that have no column to display.
for panel in panels[len(col):]:
    panel.set_visible(False)

plt.tight_layout()

# Scatter Plot
## Plotting `Price` against the remaining columns

In [None]:
# Scatter plot of Price against every other column, one subplot per column.
cols = 7
# Price is left out: plotting it against itself only yields a diagonal line.
features = data.columns.drop('Price')
# Ceiling division: as many rows as needed to give every feature a panel
# (at least one row so the grid can always be created).
rows = max(1, -(-len(features) // cols))

# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
fig, ax = plt.subplots(rows, cols, figsize=(16, 2 * rows), squeeze=False)

panels = ax.ravel()
for panel, name in zip(panels, features):
    sns.scatterplot(x='Price', y=name, data=data, ax=panel)

# Hide grid cells that have no feature to display.
for panel in panels[len(features):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

# Heatmap 

In [None]:
corrmat = data.corr()
corrmat

In [None]:
corrmat.shape

In [None]:
import matplotlib
matplotlib.__version__

In [None]:
# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize = (7, 5))
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.heatmap(corrmat, annot = True, annot_kws = {'size': 7}, ax = ax)

# Matplotlib 3.1.1 cut the first and last heatmap rows in half; widening the
# y-limits by half a cell restores them. Other releases draw the rows in
# full, and applying the shift there would only add blank margins.
if matplotlib.__version__ == '3.1.1':
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

# Correlated Feature Selection

In [None]:
corrmat.index

In [None]:
def getCorrelatedFeature(corrdata, threshold):
    """Select the correlation values whose magnitude exceeds a threshold.

    Parameters
    ----------
    corrdata : pandas.Series
        Correlation values indexed by feature name, e.g. one column of a
        correlation matrix.
    threshold : float
        Cut-off applied to the absolute value. The comparison is strict, so
        a value whose magnitude equals ``threshold`` is not selected.

    Returns
    -------
    pandas.DataFrame
        One column named ``'corr value'`` holding the selected values,
        indexed by feature name and kept in their original order. The frame
        is empty when no value passes the threshold.
    """
    # A boolean mask replaces the label-by-label loop: it is evaluated in one
    # pass, works when index labels repeat, and drops NaN entries because
    # NaN compares False against the threshold.
    selected = corrdata[corrdata.abs() > threshold]
    return selected.to_frame(name='corr value')

In [None]:
# Features whose absolute correlation with Price is above 0.5.
# Price itself is part of the result because its self-correlation is 1.
threshold = 0.5
corr_df = getCorrelatedFeature(corrmat['Price'], threshold)

In [None]:
corr_df

# Heatmap and Pair Plot of Correlated Data 

In [None]:
# Keep only the columns that passed the correlation threshold
# (the index of corr_df holds their names).
correlated_data = data[corr_df.index]
correlated_data.head()

In [None]:
sns.pairplot(correlated_data)
plt.tight_layout()

In [None]:
# Annotated heatmap of the correlations among the selected columns only.
fig, ax = plt.subplots(figsize = (4, 4))
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.heatmap(correlated_data.corr(), annot = True, annot_kws = {'size': 12}, ax = ax)

# Matplotlib 3.1.1 cut the first and last heatmap rows in half; widening the
# y-limits by half a cell restores them. Other releases draw the rows in
# full, and applying the shift there would only add blank margins.
if matplotlib.__version__ == '3.1.1':
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

# Box and Rel Plot 


    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

In [None]:
sns.boxplot(y = 'Price', x = 'CHAS', data = data)

In [None]:
sns.relplot(x = 'RM', y = 'Price', data = data, hue = 'CHAS')

In [None]:
sns.relplot(x = 'RM', y = 'Price', data = data, style = 'CHAS')

In [None]:
sns.relplot(x = 'RM', y = 'Price', data = data, size = 'CHAS')

In [None]:
sns.relplot(x = 'RM', y = 'Price', data = data, col = 'CHAS')

# Joint Plot 

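When dealing with a set of data, often the first thing you’ll want to do is get a sense for how the variables are distributed. A joint plot shows the relationship between two variables together with the distribution of each one on its own.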

In [None]:
sns.jointplot(x = data['RM'], y = data['Price'])

In [None]:
sns.jointplot(x = data['RM'], y = data['Price'], kind = 'hex')

In [None]:
sns.jointplot(x = data['RM'], y = data['Price'], kind = 'kde')

In [None]:
# KDE joint plot of RM against Price with the raw observations overlaid as
# red '+' markers.
# x and y are passed by keyword, like the other jointplot calls in this
# notebook.
# NOTE(review): newer seaborn releases are reported to reject positional
# x/y here; confirm against the installed version.
g = sns.jointplot(x = data['RM'], y = data['Price'], kind = 'kde', color = 'm')
g.plot_joint(plt.scatter, c = 'r', s = 40, linewidth = 1, marker = '+')
# Make the first artist on the joint axes translucent.
g.ax_joint.collections[0].set_alpha(0.3)

In [None]:
# Shaded two-dimensional KDE of RM against Price, drawn with a reversed
# cubehelix colour map on a square figure.
# NOTE(review): the positional second argument, `shade` and `n_levels` look
# tied to older seaborn releases; confirm against the installed version
# before upgrading.
palette = sns.cubehelix_palette(as_cmap = True, dark = 0, light = 1, reverse = True)
fig, ax = plt.subplots(figsize = (6, 6))
sns.kdeplot(data['RM'], data['Price'], cmap = palette, n_levels = 60, shade = True, ax = ax)

# Linear Regression and Relationship

- regplot()
- lmplot()

In [None]:
data.head()

In [None]:
sns.regplot(x = 'RM', y = 'Price', data = data, robust=True)

In [None]:
sns.lmplot(x = 'RM', y = 'Price', data = data)

In [None]:
sns.lmplot(x = 'RM', y = 'Price', data = data, hue = 'CHAS')

In [None]:
sns.lmplot(x = 'RM', y = 'Price', data = data, col = 'CHAS')

In [None]:
sns.lmplot(x = 'RM', y = 'Price', data = data, col = 'CHAS', robust=True)

In [None]:
sns.lmplot(x = 'RM', y = 'Price', data = data, col = 'CHAS', order = 2)

In [None]:
sns.lmplot(x = 'CHAS', y = 'Price', data = data, x_estimator=np.mean)