In [None]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Select seaborn's 'whitegrid' style for the plots drawn below.
sns.set_style('whitegrid')

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset through scikit-learn.
dataset = fetch_california_housing()

# Pass the feature names as a flat list. Wrapping them in another list
# (``[dataset.feature_names]``) makes pandas treat the argument as a list of
# level arrays and build a one-level MultiIndex for the columns, which makes
# plain string column lookups awkward.
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df.head()

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape

In [None]:
# Append the regression target as a 'price' column, scaled by 100,000.
# NOTE(review): scikit-learn documents this target in units of $100,000, so the
# scaling presumably converts it to dollars — confirm against the dataset docs.
df['price'] = 100_000 * dataset.target
df.head()

In [None]:
# Total number of missing values in the whole frame
# (per-column NaN counts, then summed across columns).
df.isna().sum().sum()

In [None]:
# Inspect the column index to see how the labels are stored.
df.columns

In [None]:
# Keep only level 0 of the column index so every column can be addressed by a
# plain string label. On an index that is already flat this leaves the labels unchanged.
df.columns = df.columns.get_level_values(0)

In [None]:
# Re-inspect the column index after the level-0 assignment above.
df.columns

In [None]:
def helf_masked_corr_heatmap(dataframe, title=None, file=None):
    """Draw a lower-triangle heatmap of the pairwise column correlations.

    The correlation matrix comes from ``dataframe.corr()``. The upper triangle
    and the diagonal are masked out, so each pair of columns appears once and
    the self-correlations are omitted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are correlated with each other.
    title : str, optional
        Figure title; no title is set when falsy.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    # Compute the correlation matrix once; both the mask and the plot use it.
    corr = dataframe.corr()

    plt.figure(figsize=(9,9))
    # NOTE(review): sns.set() appears to re-apply seaborn's default theme
    # globally, which would override an earlier sns.set_style() call — confirm
    # that this is intended.
    sns.set(font_scale=1)

    # True marks a cell to hide: everything on or above the main diagonal.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    with sns.axes_style('white'):
        sns.heatmap(corr, mask=mask, annot=True, cmap='coolwarm')

    if title:
        plt.title(f'\n{title}\n', fontsize=18)
    plt.xlabel('')
    plt.ylabel('')
    if file:
        plt.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Correlation overview for every column, including the target.
helf_masked_corr_heatmap(df, title='Calif. Housing Price Data - Variable Correlations')

In [None]:
def corr_to_target(dataframe, target, title=None, file=None):
    """Draw a one-column heatmap of each variable's correlation with ``target``.

    Correlations come from ``dataframe.corr()``. The target's own row is
    removed and the remaining rows are sorted from highest to lowest
    correlation.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the target column and the other variables.
    target : str
        Label of the column the other variables are compared against.
    title : str, optional
        Figure title; no title is set when falsy.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    # Remove the target's self-correlation by label rather than by slicing off
    # the first sorted row: a tie at 1.0 (or a NaN self-correlation, which
    # sorts last) would otherwise drop the wrong variable.
    target_corr = (
        dataframe.corr()[[target]]
        .drop(index=target)
        .sort_values(target, ascending=False)
    )

    plt.figure(figsize=(4,6))
    # NOTE(review): sns.set() appears to re-apply seaborn's default theme
    # globally — confirm that this is intended.
    sns.set(font_scale=1)

    sns.heatmap(target_corr, annot=True, cmap='coolwarm')
    if title:
        plt.title(f'\n{title}\n', fontsize=18)
    plt.xlabel('')
    plt.ylabel('')
    if file:
        plt.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Rank the other variables by their correlation with the price column.
corr_to_target(df, target='price', title='Calif Housing Price Data - Corr to Price')

In [None]:
def gen_scatterplots(dataframe, target_column, list_of_columns, cols=1, file=None):
    """Scatter-plot each listed column against the target on a grid of panels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the listed columns and the target column.
    target_column : str
        Column plotted on the y-axis of every panel.
    list_of_columns : sequence of str
        Columns plotted on the x-axis, one panel each.
    cols : int, default 1
        Number of panels per row; the row count is derived from it.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    rows = math.ceil(len(list_of_columns) / cols)
    figwidth = 5 * cols
    figheight = 4 * rows

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .ravel().
    fig, axes = plt.subplots(nrows=rows, ncols=cols,
                             figsize=(figwidth, figheight), squeeze=False)
    panels = axes.ravel()

    color_choices = ['blue', 'grey', 'goldenrod', 'red', 'black', 'darkorange', 'green']

    # NOTE(review): tight_layout() below likely recomputes this spacing —
    # confirm whether this call has any visible effect.
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    for i, column in enumerate(list_of_columns):
        panel = panels[i]
        panel.scatter(dataframe[column], dataframe[target_column],
                      color=color_choices[i % len(color_choices)], alpha=0.1)
        panel.set_ylabel(f'{target_column}', fontsize=14)
        panel.set_xlabel(f'{column}', fontsize=14)

    # Hide grid slots left over when the column count does not fill the grid.
    for spare in panels[len(list_of_columns):]:
        spare.set_visible(False)

    fig.suptitle('\nEach Feature vs. Target Scatter Plots', size=24)
    fig.tight_layout()
    # Reserve room at the top of the figure for the suptitle.
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        fig.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Every column other than the target is treated as a feature.
feature_cols = [name for name in df.columns if name != 'price']
gen_scatterplots(df, target_column='price', list_of_columns=feature_cols, cols=3)

In [None]:
def gen_histogram(dataframe, cols=1, file=None):
    """Draw one histogram per column of ``dataframe`` on a grid of panels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are each plotted as a histogram.
    cols : int, default 1
        Number of panels per row; the row count is derived from it.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    column_names = list(dataframe.columns)
    rows = math.ceil(len(column_names) / cols)
    figwidth = 5 * cols
    figheight = 4 * rows

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .ravel().
    fig, axes = plt.subplots(nrows=rows, ncols=cols,
                             figsize=(figwidth, figheight), squeeze=False)
    panels = axes.ravel()

    color_choices = ['blue', 'grey', 'goldenrod', 'red', 'black', 'darkorange', 'green']

    for i, column in enumerate(column_names):
        panel = panels[i]
        panel.hist(dataframe[column], color=color_choices[i % len(color_choices)], alpha=1)
        panel.set_title(f'{column}', fontsize=18)
        panel.set_ylabel('Observations', fontsize=14)
        panel.set_xlabel('', fontsize=14)

    # Hide grid slots left over when the column count does not fill the grid.
    for spare in panels[len(column_names):]:
        spare.set_visible(False)

    fig.suptitle('\nHistograms for All Variables in Dataframe', size=24)
    fig.tight_layout()
    # Reserve room at the top of the figure for the suptitle.
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        fig.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Distribution of every column, three panels per row.
gen_histogram(df, cols=3)

In [None]:
def gen_boxplots(dataframe, cols=1, file=None):
    """Draw one boxplot per column of ``dataframe`` on a grid of panels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are each plotted as a boxplot.
    cols : int, default 1
        Number of panels per row; the row count is derived from it.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    column_names = list(dataframe.columns)
    rows = math.ceil(len(column_names) / cols)
    figwidth = 5 * cols
    figheight = 4 * rows

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .ravel().
    fig, axes = plt.subplots(nrows=rows, ncols=cols,
                             figsize=(figwidth, figheight), squeeze=False)
    panels = axes.ravel()

    # NOTE(review): tight_layout() below likely recomputes this spacing —
    # confirm whether this call has any visible effect.
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    for i, column in enumerate(column_names):
        panel = panels[i]
        panel.boxplot(dataframe[column])
        panel.set_title(f'{column}', fontsize=18)
        panel.set_ylabel('', fontsize=14)
        panel.set_xlabel('', fontsize=14)
        # Each panel holds a single box, so its x tick label is suppressed.
        panel.tick_params(labelbottom=False)

    # Hide grid slots left over when the column count does not fill the grid.
    for spare in panels[len(column_names):]:
        spare.set_visible(False)

    fig.suptitle('\nBoxplots for All Variables in Dataframe', size=24)
    fig.tight_layout()
    # Reserve room at the top of the figure for the suptitle.
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        fig.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Boxplot of every column, three panels per row.
gen_boxplots(df, cols=3)

In [None]:
def gen_linecharts(dataframe, cols=1, file=None):
    """Draw one line chart per column of ``dataframe`` on a grid of panels.

    Each column is plotted against the frame's index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are each plotted as a line.
    cols : int, default 1
        Number of panels per row; the row count is derived from it.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    list_of_columns = list(dataframe.columns)
    rows = math.ceil(len(list_of_columns) / cols)
    figwidth = 5 * cols
    figheight = 4 * rows

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .ravel().
    fig, axes = plt.subplots(nrows=rows, ncols=cols,
                             figsize=(figwidth, figheight), squeeze=False)
    panels = axes.ravel()

    color_choices = ['blue', 'grey', 'goldenrod', 'red', 'black', 'darkorange', 'green']

    # NOTE(review): tight_layout() below likely recomputes this spacing —
    # confirm whether this call has any visible effect.
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    for i, column in enumerate(list_of_columns):
        panel = panels[i]
        panel.plot(dataframe[column], color=color_choices[i % len(color_choices)])
        panel.set_title(f'{column}', fontsize=18)
        panel.set_ylabel(f'{column}', fontsize=14)

    # Hide grid slots left over when the column count does not fill the grid.
    for spare in panels[len(list_of_columns):]:
        spare.set_visible(False)

    fig.suptitle('\nLine Graphs for All Variables in Dataframe', size=24)
    fig.tight_layout()
    # Reserve room at the top of the figure for the suptitle.
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        fig.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Line chart of every column against the row index, three panels per row.
gen_linecharts(df, cols=3)

In [None]:
def gen_linecharts_rolling(dataframe, roll_num, cols=1, file=None):
    """Draw a rolling-mean line chart for every column of ``dataframe``.

    Each column is smoothed with ``dataframe.rolling(roll_num).mean()`` and
    plotted against the frame's index, one panel per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are smoothed and plotted. It is not modified.
    roll_num : int
        Window passed to ``DataFrame.rolling``.
    cols : int, default 1
        Number of panels per row; the row count is derived from it.
    file : str or path-like, optional
        If given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    list_of_columns = list(dataframe.columns)
    rows = math.ceil(len(list_of_columns) / cols)
    figwidth = 5 * cols
    figheight = 4 * rows

    # Use a separate name for the smoothed data instead of rebinding the parameter.
    smoothed = dataframe.rolling(roll_num).mean()

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .ravel().
    fig, axes = plt.subplots(nrows=rows, ncols=cols,
                             figsize=(figwidth, figheight), squeeze=False)
    panels = axes.ravel()  # flatten the grid so panels can be indexed linearly

    color_choices = ['blue', 'grey', 'goldenrod', 'r', 'black', 'darkorange', 'g']

    # NOTE(review): tight_layout() below likely recomputes this spacing —
    # confirm whether this call has any visible effect.
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    for i, column in enumerate(list_of_columns):
        panel = panels[i]
        panel.plot(smoothed[column], color=color_choices[i % len(color_choices)])
        panel.set_title(f'{column}', fontsize=18)
        panel.set_ylabel(f'{column}', fontsize=14)
        # NOTE(review): the x-axis is the frame's index; 'Time' is only
        # accurate if that index is chronological — confirm for the data used.
        panel.set_xlabel('Time', fontsize=14)

    # Hide grid slots left over when the column count does not fill the grid.
    for spare in panels[len(list_of_columns):]:
        spare.set_visible(False)

    fig.suptitle('\nRolling Avg. Line Graphs (all vars)', size=24)
    fig.tight_layout()
    # Reserve room at the top of the figure for the suptitle.
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        fig.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
# Rolling mean over a 150-row window for every column, three panels per row.
gen_linecharts_rolling(df, roll_num=150, cols=3)