# Boston Housing Visualization

## Setup

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Let imports also search the parent directory
# (presumably where the helpers package lives — confirm)
sys.path.append("..")

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair rather than the raw string: as text,
# "0.9" >= "0.20" is True, so an outdated release would pass the check.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Figures are stored under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory tree up front; an already existing folder is fine
os.makedirs(name=IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to a file inside IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# for pretty printing
def printDf(sprkDF):
    """Render a Spark DataFrame as an HTML table for notebook output.

    The DataFrame is converted with ``toPandas()`` first, so the complete
    result is materialised as a pandas DataFrame; use it for small results.

    Args:
        sprkDF: Object exposing ``toPandas()`` (a Spark DataFrame).

    Returns:
        An ``IPython.display.HTML`` object wrapping the table markup.
    """
    # Local import: IPython is only needed when the helper is actually called.
    from IPython.display import HTML

    return HTML(sprkDF.toPandas().to_html())

# Silence the harmless "internal gelsd" warning (see SciPy issue #5998);
# the message argument is a regex matched against the start of the warning text
import warnings
warnings.filterwarnings("ignore", message="^internal gelsd")

# Spark libs
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string

Select the Input File

In [None]:
# Pass the relative CSV path through the project's path helper.
# NOTE(review): presumably this yields a location string Spark can load —
# confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

SparkSession creation

In [None]:
# Reuse the active SparkSession if there is one, otherwise start a new one
spark = SparkSession.builder \
    .appName("BostonHousingRegression") \
    .getOrCreate()

Create a DataFrame using an inferred schema

In [None]:
# Read the semicolon-delimited CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ";")
      .csv(inputFile))
# printSchema() writes the schema itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.printSchema()

## Plot the data

In [None]:
# Show Spark's summary() result for df as an HTML table via printDf
printDf(df.summary())

In [None]:
### Box Plots

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
df.toPandas().boxplot(sym='r+', figsize=(20,15))
save_fig("attribute_box_plots")
plt.show()

### Histograms

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
df.toPandas().hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

### Density Plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
df.select("CRIM", "ZN", "INDUS").toPandas().plot.kde(figsize=(20,15))
save_fig("attribute_density_plots")
plt.show()

### Scatter Plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
ax = df.select("CRIM", "ZN", "INDUS").filter(df.CAT == 0).toPandas().plot.scatter(x='CRIM', y='INDUS', color='DarkBlue', label='CAT 0')
df.select("CRIM", "ZN", "INDUS").filter(df.CAT != 0).toPandas().plot.scatter(x='CRIM', y='INDUS', color='DarkGreen', label='CAT 1', ax=ax)
save_fig("scatter_plots")
plt.show()

### Scatter matrix

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
scatter_matrix(df.select("CRIM", "ZN", "INDUS","NOX","RM", "AGE", "DIS").toPandas(), alpha=0.2, figsize=(30, 30), diagonal='kde')
save_fig("scatter_matrix")
plt.show()

### Bubble Charts

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
dfPandas= df.select("CRIM", "ZN", "INDUS","MEDV").toPandas()
dfPandas.plot.scatter(x='CRIM', y='INDUS', s=dfPandas['MEDV'] * 1, color='DarkBlue')
save_fig("bubble_plots")
plt.show()

### Parallel coordinates

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
parallel_coordinates(df.select("CRIM", "ZN", "INDUS","CAT").toPandas(), 'CAT')
save_fig("paralle_coordinates_plots")
plt.show()

In [None]:
# Shut down the SparkSession once the notebook's plots have been produced
spark.stop()