# Include-3-Shared-Viz-Functions

## Set up the plotting libraries, modules, and styles

In [None]:
# Import the pyplot module from the matplotlib library
from matplotlib import pyplot as plt
# Use Jupyter magics to plot inline without needing to call plt.show()
# From the documentation (https://stackoverflow.com/questions/43027980/)
# "With backend = 'inline', the output of plotting commands is displayed inline within frontends 
#   like the Jupyter notebook, directly below the code cell that produced it. 
#   The resulting plots will then also be stored in the notebook document."
%matplotlib inline

In [1]:
# Import the Seaborn library (by Michael Waskom)
import seaborn as sns
# Set the visual styles
sns.set(context = 'notebook', 
        style = 'darkgrid',
        palette = 'deep', 
        font = 'sans-serif', 
        font_scale = 1.3, 
        color_codes = True, 
        rc = None
       )

In [None]:
# List the matplotlib styles available
#print(plt.style.available)

In [None]:
#### Set the matplotlib style here ####
# NOTE(review): matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
#   confirm that 'seaborn-darkgrid' is still listed in plt.style.available for the
#   installed matplotlib version (see the commented-out listing cell above).
style = 'seaborn-darkgrid'
# Apply the chosen style to every matplotlib figure created from here on
plt.style.use(style)

In [None]:
# Test out the style settings
#print("Here's what the {} style looks like...".format(style))
#fig, axes = plt.subplots(1, 4, figsize=(12, 4))
#axes[0].set_xlim(0, 0.5)

In [None]:
# Plotnine for ggplot
from plotnine import *

## Packages for computation and data manipulation

In [None]:
import numpy as np # for number crunching
import pandas as pd # for data loading and manipulation
import time
from itertools import groupby

## Jupyter notebook display settings

In [None]:
# Make sure all columns of a dataframe are displayed
# https://stackoverflow.com/questions/47022070/
from IPython.display import display
pd.options.display.max_columns = None

In [None]:
# Make sure that a dataframe column value (e.g., a large text field) is not truncated
# https://stackoverflow.com/questions/25351968
# NOTE(review): pandas 1.0 deprecated -1 for display.max_colwidth in favour of None;
#   confirm which value the installed pandas version accepts.
pd.set_option('display.max_colwidth', -1)
# Show at most 100 rows when a dataframe is displayed
pd.set_option('display.max_rows', 100)

In [1]:
# Configure slide scrolling
# from hfinger at https://github.com/damianavila/RISE/issues/185
#### NOTE: Have to restart notebook server after running it the first time ####
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {'width': 1024, 'height': 768, 'scroll': True})

{'height': 768, 'scroll': True, 'width': 1024}

## Data preprocessing functions

In [None]:
# Get a list of each attribute and the first n values for that attribute in the data set
#### SET n HERE ####
display_n = 3

def get_first_n_vals(dataFrame, n=display_n):
    """Return an enumerated list of (feature name, first n values) pairs.

    dataFrame is the dataset in pandas dataframe format
    n is the number of leading values to list per feature; its default is
      the value display_n had when this function was defined
    """
    preview = []
    for attribute in list(dataFrame):
        # The first n values of this feature, in row order
        leading_vals = list(dataFrame[attribute][0:n])
        preview.append((attribute, leading_vals))
    return list(enumerate(preview))

In [None]:
# For each feature, how many/what percentage of rows are missing values?
# From https://datascience.stackexchange.com/questions/12645/

def num_missing_values_per_feature(dataFrame, display='percentage'):
    """Count the missing values in each feature (column).

    dataFrame is the dataset in pandas dataframe format
    display is 'count' for the raw number of missing values per feature;
      any other value (default 'percentage') returns that count divided by
      the number of rows, i.e., a fraction between 0 and 1
    """
    missing_counts = dataFrame.isnull().sum(axis=0)
    if display == 'count':
        return missing_counts
    return missing_counts / len(dataFrame)

In [None]:
# For each row, how many/what percentage of features are missing values?
# From https://datascience.stackexchange.com/questions/12645/

def num_missing_values_per_row(dataFrame, display='percentage'):
    """Count the missing values in each row.

    dataFrame is the dataset in pandas dataframe format
    display is 'count' for the raw number of missing values per row;
      any other value (default 'percentage') returns that count divided by
      the number of features (columns), i.e., a fraction between 0 and 1
    """
    missing_per_row = dataFrame.isnull().sum(axis=1)
    if display == 'count':
        return missing_per_row
    # A row can be missing at most one value per feature, so the denominator
    # is the number of columns, not the number of rows
    return missing_per_row / dataFrame.shape[1]

## Data Display Functions

### Display Categorical Features

In [None]:
# get the unique values of the categorical attributes/features
def get_cat_values(dataFrame, cat_feature_set):
    """Pair each categorical feature name with the unique values it takes.

    dataFrame is the dataset in pandas dataframe format
    cat_feature_set is a list of categorical feature names
    Returns a list of (feature name, array of unique values) tuples
    """
    name_value_pairs = []
    for cat_feature in cat_feature_set:
        unique_vals = dataFrame[cat_feature].unique()
        name_value_pairs.append((cat_feature, unique_vals))
    return name_value_pairs

In [None]:
# Set up the plot - use for categorical features
# Show how a categorical feature's values are distributed across the possible values it can take
def cat_value_dist(dataFrame, feature, display='percentage', orient='vert'):
    """Plot how a categorical feature's values are distributed.

    dataFrame is the dataset in pandas dataframe format
    feature is the name of the categorical feature to plot
    display can be 'percentage' (default) or 'count'; any other value
      leaves the axes empty
    orient can be 'vert' (default) for vertical bars; any other value
      draws horizontal bars
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    # The unique values the feature takes
    feat_values = dataFrame[feature].unique()
    vertical = (orient == 'vert')

    if display == 'count':
        if vertical:
            sns.countplot(x=feature, data=dataFrame, ax=ax)
        else:
            sns.countplot(y=feature, data=dataFrame, ax=ax)
    elif display == 'percentage':
        column = dataFrame[feature]
        # Share of rows (in percent) taking each unique value
        pct = [(column == val).sum() / len(column) * 100 for val in feat_values]
        # x and y are passed by keyword: newer seaborn releases reject
        # positional data arguments, and keywords work in older releases too
        if vertical:
            sns.barplot(x=feat_values, y=pct, ax=ax)
        else:
            # orient='h' keeps the bars horizontal even when the category
            # values themselves are numeric
            sns.barplot(x=pct, y=feat_values, orient='h', ax=ax)

    # If the number of distinct values is greater than n, rotate the labels
    n = 3
    if len(feat_values) > n:
        plt.xticks(rotation=90)

    if vertical:
        ax.set_ylabel(display)
        ax.set_title(feature)
    else:
        ax.set_ylabel(feature)
        ax.set_title(display)

    # If %matplotlib inline is invoked, we don't need to return plt.show()

### Display the Relationship Between Categorical Features

In [None]:
# Contingency table to track the relationship between any two categorical variables
def contingency_table(dataFrame, row_feat, col_feat):
    """Cross-tabulate two categorical features.

    dataFrame is the complete dataset
    row_feat is the feature whose values are displayed as rows
    col_feat is the feature whose values are displayed across columns
    Returns the pandas crosstab of the two features
    """
    row_values = dataFrame[row_feat]
    col_values = dataFrame[col_feat]
    return pd.crosstab(index=row_values, columns=col_values)

In [None]:
# Plot a contingency table as a stacked bar chart
def plot_contingency_table(dataFrame, row_feat, col_feat, stacked=True):
    """Plot the contingency table of two categorical features as a bar chart.

    dataFrame is the complete dataset
    row_feat and col_feat are passed on to contingency_table
    stacked=True (default) stacks the bars; False places them side by side
    """
    # For horizontal chart use kind='barh'
    # For vertical chart use kind='bar'
    plot_options = {'kind': "barh", 'figsize': (10,8), 'stacked': stacked}
    contingency_table(dataFrame, row_feat, col_feat).plot(**plot_options)

In [None]:
# Another way to visualize the relationship between 2 categorical features
# Requires the plotnine package

def cat_2_bars(dataFrame, x_feat, y_feat):
    """Filled bar chart relating two categorical features (plotnine).

    dataFrame is the complete dataset
    x_feat is the feature placed on the x axis
    y_feat is the feature used to fill (colour) each bar
    Returns the plotnine plot object
    """
    base = ggplot(dataFrame, aes(x=x_feat, fill=y_feat))
    # Rotate the x axis labels so long category names stay readable
    rotated_x_labels = theme(axis_text_x=element_text(rotation=90, hjust=1))
    return base + geom_bar(position='fill') + ylab('Percentage') + rotated_x_labels

In [None]:
# Visualize the relationship between 3 categorical variables
# Requires the plotnine package

def cat_3_bars(dataFrame, x_feat, y_feat, z_feat):
    """Filled bar chart relating three categorical features (plotnine).

    dataFrame is the complete dataset
    x_feat is the feature placed on the x axis
    y_feat is the feature used to fill (colour) each bar
    z_feat is the feature by which the chart is split into facets
    Returns the plotnine plot object
    """
    layers = [
        geom_bar(position='fill'),
        facet_wrap('~' + z_feat),
        ylab('Percentage'),
        theme(axis_text_x=element_text(rotation=90, hjust=1)),
    ]
    disp = ggplot(dataFrame, aes(x=x_feat, fill=y_feat))
    # Add the layers in the order listed above
    for layer in layers:
        disp = disp + layer
    return disp

### Display Numerical Features

In [None]:
# Skyline of a single numerical feature
def num_skyline(dataFrame, num_feature_name, x_axis_label, y_axis_label='Count'):
    """Bar plot ("skyline") of how often each distinct value of a numerical feature occurs.

    dataFrame is the entire dataset
    num_feature_name is the name of a single numerical feature
    x_axis_label and y_axis_label are the axis titles
    Missing values are left out of the plot
    """
    # Drop missing values first: NaN never compares equal to itself, so it
    # cannot be grouped and counted like an ordinary value
    feat_vals = np.asarray(dataFrame[num_feature_name].dropna())

    # One call returns the sorted distinct values together with their counts,
    # so the tick labels and the bar heights always have the same length
    feat_labels, feat_vals_freq = np.unique(feat_vals, return_counts=True)

    # Create the plot
    fig, ax = plt.subplots(figsize=(10,6))
    x_pos = np.arange(len(feat_labels))

    # Create bars
    ax.bar(x_pos, feat_vals_freq)

    # Add axis names
    ax.set_xlabel(x_axis_label)
    ax.set_ylabel(y_axis_label)

    # Create names on the x-axis
    plt.xticks(x_pos, feat_labels);

In [None]:
# Bar plot with customized bin widths
def num_skyline_hist(dataFrame, 
                     num_feature_name,
                     bin_width, 
                     bin_min, 
                     bin_max, 
                     x_axis_label, 
                     y_axis_label='Count'):
    """Bar plot of a numerical feature counted into equal-width bins.

    dataFrame is the entire dataset
    num_feature_name is the name of a single numerical feature
    bin_width is the width of each bin in the histogram
    bin_min is the lowest value of the feature (left edge of the first bin)
    bin_max is the highest value of the feature (bins are added until reached)
    x_axis_label and y_axis_label are the axis titles

    Every bin covers [start, start + bin_width); the last bin also includes
    its right edge so that a value equal to bin_max is counted.
    Raises ValueError if bin_width is not greater than 0.
    """
    if bin_width <= 0:
        # A non-positive width would never advance past bin_max
        raise ValueError('bin_width must be greater than 0')

    feat_vals = np.array(dataFrame[num_feature_name])

    bin_freqs = []
    bin_labels = []
    bin_start = bin_min
    while bin_start < bin_max:
        bin_next = bin_start + bin_width
        bin_labels.append(' '.join([str(bin_start), 'to', str(bin_next)]))
        in_bin = (feat_vals >= bin_start) & (feat_vals < bin_next)
        if bin_next >= bin_max:
            # Last bin: close it on the right so the top edge is not dropped
            in_bin = in_bin | (feat_vals == bin_next)
        bin_freqs.append(int(np.count_nonzero(in_bin)))
        bin_start = bin_next

    # Create the plot
    fig, ax = plt.subplots(figsize=(12,8))
    x_pos = np.arange(len(bin_labels))

    # Create bars
    ax.bar(x_pos, bin_freqs)

    # Add axis names
    ax.set_xlabel(x_axis_label)
    ax.set_ylabel(y_axis_label)

    # Create names on the x-axis
    plt.xticks(x_pos, bin_labels);

In [None]:
# Plot a single numerical feature as a histogram, 
# optionally overlaid with its estimated probability density (kde)
def num_hist(dataFrame, num_feature_name, kde=False):
    """Plot a 7-bin histogram of a single numerical feature.

    dataFrame is the entire dataset
    num_feature_name is the name of a single numerical feature, e.g., 'numerical_feature'
    kde is passed to sns.distplot; True adds the density curve
    """
    feature_values = dataFrame[num_feature_name]
    fig, ax = plt.subplots(figsize=(12,8))
    # Draw on the axes just created
    sns.distplot(feature_values, bins=7, kde=kde, ax=ax)

In [None]:
#### NOTE: Boxplot of a single numerical feature ####
# Use num_boxplot_mult(dataFrame, ['num_feature_name'])
# num_boxplot_mult is defined below

In [None]:
# Time series for a single numerical feature
def time_series_plot(dataFrame, time_feature_name, num_feature_name):
    """Plot how a single numerical feature evolves over time.

    dataFrame is the entire dataset
    time_feature_name is the name of the time feature, e.g., 'PUBLISHED_DATE'
    num_feature_name is the name of the numerical feature that evolves in time, e.g., 'COUNT_IVR'
    """
    # First sort the dataframe in ascending order of the time_feature_name
    df_sorted = dataFrame.sort_values(by=[time_feature_name])

    fig, ax = plt.subplots(figsize=(10,6))
    x = df_sorted[time_feature_name]
    y = df_sorted[num_feature_name]
    # Label the line so that the legend below has an entry to show
    ax.plot(x, y, marker='o', label=num_feature_name)
    plt.xticks(rotation=90) # rotate the x axis tick labels
    ax.set_xlabel(time_feature_name)
    ax.legend()

In [1]:
# Time series evolution of a list of numerical features
def time_series_mult_plot(dataFrame, time_feature_name, num_feature_list, highlighted_feature=''):
    """Plot how a list of numerical features evolves over time on one set of axes.

    dataFrame is the entire dataset
    time_feature_name is the name of the time feature, e.g., 'PUBLISHED_DATE'
    num_feature_list contains the names of the numerical features that evolve in time,
      e.g., ['COUNT_IVR', ..., 'Avg_Dwell_Time'] or quality_feats
    highlighted_feature is the feature in the num_feature_list to highlight in the plot
      ('' means no feature is highlighted)
    """
    # Sort the rows in ascending order of the time feature
    df_sorted = dataFrame.sort_values(by=[time_feature_name])
    has_highlight = (highlighted_feature != '')

    # The highlighted feature is drawn separately, so leave it out of the first pass
    if has_highlight:
        background_feats = [feat for feat in num_feature_list if feat != highlighted_feature]
    else:
        background_feats = num_feature_list

    # Set up the plot and draw the non-highlighted features
    fig, ax = plt.subplots(figsize=(14,10))
    time_vals = df_sorted[time_feature_name]
    plt.plot(time_vals, df_sorted[background_feats])

    plt.xticks(rotation=90) # rotate the x axis tick labels
    ax.set_xlabel(time_feature_name)

    if not has_highlight:
        plt.legend(num_feature_list)
        return

    # Draw the highlighted feature last - bigger, with a distinct color
    plt.plot(time_vals, df_sorted[highlighted_feature], marker='o', color='purple', linewidth=3, alpha=0.7)
    plt.legend(background_feats + [highlighted_feature])

In [None]:
# Time series plots for a list of numerical features displayed side by side
# TO DO

### Display Relationships Between Numerical Features

In [None]:
# Simple contour plot of the KDE for any two numerical features
def kde_contour(dataFrame, num_feature_1, num_feature_2):
    """Draw the joint KDE of two numerical features with sns.kdeplot.

    dataFrame is the entire dataset
    num_feature_1 and 2 are individual numerical feature names, e.g., 'numerical_feature'
    """
    first_vals, second_vals = dataFrame[num_feature_1], dataFrame[num_feature_2]
    sns.kdeplot(first_vals, second_vals)

In [None]:
# KDE contour + distribution for any two numerical features
def kde_contour_dist(dataFrame, num_feature_1, num_feature_2, kind='kde'):
    """Joint plot of two numerical features, drawn with sns.jointplot.

    dataFrame is the entire dataset
    num_feature_1 and 2 are individual numerical feature names, e.g., 'numerical_feature'
    kind = 'kde' or 'hex'
    """
    x_vals = dataFrame[num_feature_1]
    y_vals = dataFrame[num_feature_2]
    # Use the 'white' axes style for this plot only
    with sns.axes_style('white'):
        sns.jointplot(x=x_vals, y=y_vals, kind=kind)

In [None]:
# Multiple histograms
def num_hist_mult(dataFrame, num_feature_list):
    """Overlay normalized histograms of several numerical features.

    dataFrame is the entire dataset
    num_feature_list is a list of numerical features, e.g., ['num_feat1', ..., 'num_feat_N']
    """
    fig, ax = plt.subplots(figsize=(12,8))
    for num_feature in num_feature_list:
        # density=True replaces the 'normed' argument, which matplotlib removed
        ax.hist(dataFrame[num_feature], density=True, alpha=0.5, label=num_feature)

    ax.legend()
    

In [None]:
# Distribution density curves overlayed
# Distributions of a set of numerical features
def num_kde_mult(dataFrame, num_feature_list):
    """Overlay the density curves of several numerical features.

    dataFrame is the entire dataset
    num_feature_list is a list of numerical features, e.g., ['num_feat1', ..., 'num_feat_N']
    """
    fig, ax = plt.subplots(figsize=(8,6))
    for feat_name in num_feature_list:
        feat_series = dataFrame[feat_name]
        sns.kdeplot(feat_series, label=feat_name, ax=ax)

    ax.legend()

In [None]:
# Both histograms and density curves overlayed
# Distributions of a set of numerical features
def num_hist_kde_mult(dataFrame, num_feature_list):
    """Overlay sns.distplot plots (default settings) of several numerical features.

    dataFrame is the entire dataset
    num_feature_list is a list of numerical features, e.g., ['num_feat1', ..., 'num_feat_N']
    """
    fig, ax = plt.subplots(figsize=(8,6))
    for feat_name in num_feature_list:
        feat_series = dataFrame[feat_name]
        sns.distplot(feat_series, label=feat_name, ax=ax)

    ax.legend()

In [None]:
# Boxplots for a set of numerical features
# The swarmplot shows the data points jittered for better visibility
# Another option instead of the jitter is to use a violinplot (for large datasets)
def num_boxplot_mult(dataFrame, num_feature_list):
    """Draw a boxplot per numerical feature with the data points overlaid as a swarm.

    dataFrame is the entire dataset
    num_feature_list is a list of numerical features, e.g., ['num_feat1', ..., 'num_feat_N']
    """
    selected = dataFrame[num_feature_list]
    fig, ax = plt.subplots(figsize=(8,6))
    sns.boxplot(data=selected, palette='Set2', ax=ax)
    sns.swarmplot(data=selected, color='grey', ax=ax)

    # With more than this many features, rotate the x labels
    max_unrotated = 3
    if len(num_feature_list) > max_unrotated:
        plt.xticks(rotation=90)

In [None]:
# Correlation table -- display the pairwise correlations between a set of numerical features
# as an annotated heat map
def num_corr_table(dataFrame, num_feature_list):
    """Draw an annotated heat map of the correlations between numerical features.

    dataFrame is the entire dataset
    num_feature_list is a list of numerical features, e.g., ['num_feat1', ..., 'num_feat_N']
    """
    corr_matrix = dataFrame[num_feature_list].corr()

    # Use font_scale=1 for this figure only. Calling sns.set(font_scale=1) here
    # would overwrite the notebook-wide seaborn settings for every later plot.
    with sns.plotting_context('notebook', font_scale=1):
        fig, ax = plt.subplots(figsize=(8,6))
        #### NOTE: fmt directive controls number of decimal points displayed in the correlation value. ####
        sns.heatmap(corr_matrix,
                    cbar=True,
                    annot=True,
                    square=False,
                    fmt='.2f',
                    annot_kws={'size':14},
                    yticklabels=num_feature_list,
                    xticklabels=num_feature_list,
                    ax=ax
                   )

        ax.set_title('Correlation Heat Map')

In [None]:
# Bubble chart showing the relationship between any three numerical features
def bubble_chart(dataFrame, x_feature, y_feature, bubble_size_feature):
    """Bubble chart relating three numerical features.

    dataFrame is the entire dataset
    x_feature and y_feature are numerical features on the x and y axis respectively
    bubble_size_feature is represented by the size of the bubble
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    x_vals = dataFrame[x_feature]
    y_vals = dataFrame[y_feature]
    raw_size = dataFrame[bubble_size_feature]
    # Only 'UNIQUE_USERS' is scaled down (by 100); other features are used as-is
    bubble_size = raw_size/100. if bubble_size_feature == 'UNIQUE_USERS' else raw_size
    plt.scatter(x_vals, y_vals, s=bubble_size*2000, c=x_vals, cmap="Blues", alpha=0.4, edgecolors="orange", linewidth=2)
    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    ax.set_title("Bubble Size = " + bubble_size_feature)

## Display Relationships Between Numerical and Categorical Features

In [None]:
# Looking at how a single numerical feature varies across a single categorical feature
# Box plot display
def box_plot(dataFrame, cat_feature, num_feature, orient='h'):
    """Box plot of a numerical feature for each value of a categorical feature.

    dataFrame is the entire dataset
    cat_feature is the name of a single categorical feature
    num_feature is the name of a single numerical feature
    orient is 'h' (default, categories on the y axis) or 'v' (categories on the x axis);
      any other value leaves the axes empty
    """
    fig, ax = plt.subplots(figsize=(12,8))
    cat_vals = dataFrame[cat_feature]
    num_vals = dataFrame[num_feature]

    # (x data, y data) for each supported orientation
    axis_data = {'v': (cat_vals, num_vals), 'h': (num_vals, cat_vals)}
    if orient in axis_data:
        x_vals, y_vals = axis_data[orient]
        sns.boxplot(x=x_vals, y=y_vals, palette="Set2")

    # With more than 3 categories, rotate the x tick labels
    if len(cat_vals.unique()) > 3:
        plt.xticks(rotation=90)

In [None]:
# Looking at how a single numerical feature varies across a single categorical feature
# Jitter plot display
def jitter_plot(dataFrame, cat_feature, num_feature, orient='h'):
    """Jittered strip plot of a numerical feature for each value of a categorical feature.

    dataFrame is the entire dataset
    cat_feature is the name of a single categorical feature
    num_feature is the name of a single numerical feature
    orient is 'h' (default, categories on the y axis) or 'v' (categories on the x axis);
      any other value leaves the axes empty
    """
    fig, ax = plt.subplots(figsize=(10,6))

    # Which feature goes on which axis for the requested orientation
    if orient == 'v':
        axis_feats = {'x': cat_feature, 'y': num_feature}
    elif orient == 'h':
        axis_feats = {'y': cat_feature, 'x': num_feature}
    else:
        axis_feats = None

    if axis_feats is not None:
        sns.stripplot(data=dataFrame, jitter=0.1, **axis_feats)

    # With more than 3 categories, rotate the x tick labels
    num_categories = len(dataFrame[cat_feature].unique())
    if num_categories > 3:
        plt.xticks(rotation=90)

In [None]:
# Visualize the relationship between a set of numerical features and 
# a given categorical feature
# Scatter plot format

def num_cat_scatter(dataFrame, num_feats_list, cat_feat_name):
    """Scatter pairplot of a set of numerical features, coloured by a categorical feature.

    dataFrame is the entire dataset
    num_feats_list is the list of numerical features, e.g., doc_feats
    cat_feat_name is the name of the single categorical feature, e.g., 'AUTHOR_NAME'
    """
    # Keep only the columns the pairplot needs: the numerical features plus the hue feature
    pairplot_data = dataFrame[num_feats_list + [cat_feat_name]]
    sns.pairplot(pairplot_data, hue=cat_feat_name, kind='scatter')

In [None]:
# Visualize the relationship between a set of numerical features and 
# a given categorical feature
# Regression plot format

def num_cat_regress(dataFrame, num_feats_list, cat_feat_name):
    """Regression pairplot of a set of numerical features, coloured by a categorical feature.

    dataFrame is the entire dataset
    num_feats_list is the list of numerical features, e.g., doc_feats
    cat_feat_name is the name of the single categorical feature, e.g., 'AUTHOR_NAME'
    """
    # Keep only the columns the pairplot needs: the numerical features plus the hue feature
    pairplot_data = dataFrame[num_feats_list + [cat_feat_name]]
    sns.pairplot(pairplot_data, hue=cat_feat_name, kind='reg')

In [None]:
# Scatter plot of two numerical features grouped by a categorical feature
def scatter_plot(dataFrame, num_feat_x, num_feat_y, cat_feat):
    """Scatter plot of two numerical features with the dots coloured by a categorical feature.

    dataFrame is the entire dataset
    num_feat_x and num_feat_y are the numerical features on the x and y axis
    cat_feat is the categorical feature that determines the colour of each dot
    """
    # See https://xkcd.com/color/rgb/ for xkcd named colors
    xkcd_colors = ["blue", "hot pink", "violet", "olive", "lime green", "lemon yellow", "goldenrod", "dark orange"]
    dot_palette = sns.xkcd_palette(xkcd_colors)

    # NOTE(review): seaborn 0.9 renamed the 'size' argument to 'height' -- confirm
    #   which name the installed seaborn version expects.
    lmplot_options = dict(x=num_feat_x,
                          y=num_feat_y,
                          data=dataFrame,
                          fit_reg=False,   # dots only, no regression line
                          hue=cat_feat,    # the factor variable
                          size=8,
                          aspect=1.5,
                          legend_out=True,
                          palette=dot_palette,
                          scatter_kws={'s':200}
                         )
    sns.lmplot(**lmplot_options)

In [None]:
# Scatter plot that accommodates the classification of the scatter dots into a large number of items
# Use when the number of the items in a group is > 5
def scatter_plot_large(dataFrame, num_feat_x, num_feat_y, cat_feat, slice_num, slicing_feat='AUTHOR_NAME'):
    """Annotated scatter plot of two numerical features for one slice of the dataset.

    dataFrame is the entire dataset
    num_feat_x is the numerical feature for the x axis
    num_feat_y is the numerical feature for the y axis
    cat_feat is the categorical feature whose value is written next to each dot
    slice_num is the index number of the slice for which we want to create the scatter plot
      For example, slice_list[0] might be CIOLC, slice_list[1] might be ALC, etc.
    slicing_feat is the feature by which the dataset is sliced; for example,
      'AUTHOR_NAME' creates one slice per distinct AUTHOR_NAME value
    """
    # The distinct values of the slicing feature, in np.unique (sorted) order
    slice_list = np.unique(dataFrame[slicing_feat].values)
    chosen_slice = slice_list[slice_num]

    # Rows of data belonging to the chosen slice
    df_slice = dataFrame[dataFrame[slicing_feat] == chosen_slice]

    # The plot title is the slicing value found in those rows
    plt_title = np.unique(df_slice[slicing_feat].values)[0]

    # One annotation per row of the slice (values are not de-duplicated)
    cat_titles = df_slice[cat_feat].values

    fig, ax = plt.subplots(figsize=(18,10))
    # Basic plot: dots only, no regression line
    p1 = sns.regplot(data=df_slice,
                     x=df_slice[num_feat_x],
                     y=df_slice[num_feat_y],
                     fit_reg=False,
                     marker="o",
                     color="blue",
                     scatter_kws={'s':100},
                     ax=ax
                    )

    # Write each row's label slightly to the upper right of its dot
    x_vals = df_slice[num_feat_x].values
    y_vals = df_slice[num_feat_y].values
    for x_pt, y_pt, label in zip(x_vals, y_vals, cat_titles):
        p1.text(x_pt+0.4,
                y_pt+0.2,
                label,
                horizontalalignment='left',
                size='medium',
                color='black',
                weight='normal')

    ax.set_title(plt_title)
    ax.set_xlim(0, 60)

In [None]:
# Looking at how a single numerical feature varies across two categorical features
# Grouped boxplot display
def grouped_boxplot(dataFrame, x_cat_feature, y_num_feature, z_cat_feature):
    """Grouped box plot of a numerical feature across two categorical features.

    dataFrame is the entire dataset
    x_cat_feature is the categorical feature on the x axis
    y_num_feature is the numerical feature on the y axis
    z_cat_feature is the categorical feature that groups (colours) the boxes
    """
    fig, ax = plt.subplots(figsize=(14,8))
    boxplot_options = {'x': x_cat_feature,
                       'y': y_num_feature,
                       'hue': z_cat_feature,
                       'data': dataFrame,
                       'palette': "Set3"}
    sns.boxplot(**boxplot_options)

    # With more than 3 categories on the x axis, rotate the tick labels
    num_x_categories = len(dataFrame[x_cat_feature].unique())
    if num_x_categories > 3:
        plt.xticks(rotation=90)

## Prepare Dataset for Use in ML Models

In [None]:
#### This is a single longish function that creates inputs for any ML model ####
#### For unsupervised models, the entire dataset is returned ####

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import math

def create_model_input(df_dataset, 
                       target_feature,
                       target_feature_type, 
                       categorical_features_list, 
                       numerical_features_list, 
                       scaler='Standard'
                      ):
    """Build the input (X) and target (y) arrays for an ML model from a dataset.

    The requested features are selected, the categorical ones are one-hot encoded,
    the rows are shuffled with a fixed seed and split by position into train,
    validation, and test sets, and the numerical features are scaled with a
    scaler fitted on the training rows.

    df_dataset is a complete pre-processed input dataset, e.g., df_ig
    target_feature is the feature that is the target for the model
      NOTE: if target_feature is '', there is no target; the whole dataset is
      additionally returned one-hot encoded and scaled as 'X_full'
    target_feature_type is 'categorical' or 'numerical' or '' (when there is no
      target specified); a 'categorical' target is label encoded
    categorical_features_list is the list of categorical features used by the model
      (will include the target feature if the target feature is categorical)
    numerical_features_list is the list of numerical features used by the model
      (will include the target feature if the target feature is numerical)
    scaler can be 'Standard' (default) or 'MinMax'; any other value falls back
      to the standard scaler

    Returns a dict with the keys 'X_train', 'X_val', 'X_test', 'X_val_test',
    'X_train_val', 'X_full', 'y_train', 'y_val', 'y_test', 'y_val_test', and
    'y_train_val'. The y entries are float32 arrays when a target is given and
    empty lists otherwise; 'X_full' is an empty list when a target is given.
    """
    
    #### Step 1: Decide how to split the dataset into train, validate, and test datasets ####
    # VAL_PCT_SPLIT can be set to 0.0 if needed
    # NOTE: TEST_PCT_SPLIT is not used below; the test set is whatever rows remain
    #  after the train and validation rows have been taken
    TRAIN_PCT_SPLIT = 0.8
    VAL_PCT_SPLIT = 0.0
    TEST_PCT_SPLIT = 0.2
    
    #### Step 2: Separate the target feature from the other features ####
    # The target is removed from both lists, so it is neither one-hot encoded nor scaled
    if target_feature != '':
        categorical_features = [x for x in categorical_features_list if x != target_feature]
        numerical_features = [x for x in numerical_features_list if x != target_feature]
    else:
        categorical_features = categorical_features_list
        numerical_features = numerical_features_list
    
    #### Step 3: Create a dataset with the requisite features for the model from the full dataset ####
    if target_feature != '':
        df_model = df_dataset[categorical_features + numerical_features + [target_feature]]
    else:
        df_model = df_dataset[categorical_features + numerical_features]
    
    #### Step 4: One-hot-encode the categorical features ####
    df_model = pd.get_dummies(df_model, columns=categorical_features)
    
    #### Step 5: Label encode the target feature if it's a categorical feature ####
    if target_feature_type == 'categorical':
        le = LabelEncoder()
        df_model[target_feature] = le.fit_transform(df_model[target_feature])
        
    #### df_model now contains all the features and the target we need
    ####  in addition, df_model has its categorical features one-hot-encoded and 
    ####  its label/target encoded if needed
    
    #### Step 6: Shuffle the dataset and split it into train, val, and test ####
    # Shuffle the one-hot-encoded and label-encoded dataset
    df_shuff = shuffle(df_model, random_state=42) # set seed for replicability
    
    (num_rows, num_cols) = df_shuff.shape
    
    num_train = math.floor(TRAIN_PCT_SPLIT * num_rows)
    num_val = math.floor(VAL_PCT_SPLIT * num_rows)
    # num_test consists of the remaining rows of the dataset
    # (computed for reference only; the slices below do not use it)
    num_test = num_rows - (num_train + num_val)
    
    # Train, val, and test dataframes
    df_train = df_shuff.iloc[0:num_train]
    df_val = df_shuff.iloc[num_train:num_train+num_val]
    df_test = df_shuff.iloc[num_train+num_val: ]
    
    # df_val_test combines df_val and df_test in case we don't need them separately
    #  e.g., when using k-fold cross validation with a scikit classifier
    # Typically used when the dataset is small
    df_val_test = pd.concat([df_val, df_test], axis=0)
    
    # Use df_train_val to (re)train the optimal model once the optimal model 
    #  has been determined using grid search
    df_train_val = pd.concat([df_train, df_val], axis=0)
    
    # And finally, this is the entire dataset (for unsupervised learning, e.g., clustering analysis)
    df_full = pd.concat([df_train_val, df_test], axis=0)
     
    #### Step 8: Scale the numerical features OVER THE TRAINING DATASET ONLY ####
    #### (the step numbering skips 7) ####
    if scaler == 'Standard':
        sc = StandardScaler()
    elif scaler == 'MinMax':
        sc = MinMaxScaler()
    else:
        sc = StandardScaler() # use StandardScaler as the default scaler
    
    #### NOTE: a copy is made to avoid the pandas SettingWithCopy warning ####
    #### See https://www.dataquest.io/blog/settingwithcopywarning/ ####
    if target_feature == '':
        # Scale the entire dataset's numerical features
        # (the scaler is fitted here on all rows, then refitted on the training rows below)
        df_full_scaled = df_full.copy()
        df_full_scaled[numerical_features] = sc.fit_transform(df_full[numerical_features])
    else:
        # With a target, the full dataset is left unscaled and X_full is returned empty
        df_full_scaled = df_full
    
    # Scale just the training dataset and use these scaler values to scale the val and test datasets
    df_train_scaled = df_train.copy()
    df_train_scaled[numerical_features] = sc.fit_transform(df_train[numerical_features])

    
    #### Step 9: Scale the numerical features of the other datasets using the scaler values
    ####  of the training dataset ####
    #### NOTE: a copy is made to avoid the pandas SettingWithCopy warning ####
    #### See https://www.dataquest.io/blog/settingwithcopywarning/ ####
    
    # Check to make sure that the validation slice % is not 0
    if len(df_val) > 0:
        df_val_scaled = df_val.copy()
        df_val_scaled[numerical_features] = sc.transform(df_val[numerical_features])
    else:
        # Empty validation set: nothing to scale
        df_val_scaled = df_val
    
    df_test_scaled = df_test.copy()
    df_test_scaled[numerical_features] = sc.transform(df_test[numerical_features])
    
    df_val_test_scaled = df_val_test.copy()
    df_val_test_scaled[numerical_features] = sc.transform(df_val_test[numerical_features])
    
    df_train_val_scaled = df_train_val.copy()
    df_train_val_scaled[numerical_features] = sc.transform(df_train_val[numerical_features])
    
    #### Step 10: Get the targets for SciKit Learn models as a (num, ) shape array of reals ####
    #### there are no y values for the full dataset because there is no target ####
    if target_feature != '':
        y_train = df_train_scaled[target_feature].values.astype('float32')
        y_val = df_val_scaled[target_feature].values.astype('float32')
        y_test = df_test_scaled[target_feature].values.astype('float32')
        y_val_test = df_val_test_scaled[target_feature].values.astype('float32')
        y_train_val = df_train_val_scaled[target_feature].values.astype('float32')
    else:
        y_train = []
        y_val = []
        y_test = []
        y_val_test = []
        y_train_val = []
    
    #### Step 11: Create the input and target arrays ####
    # Get the feature list as it currently exists in the scaled training dataframe
    #### NOTE: The feature names may have changed when the categorical features
    #### are one-hot-encoded
    # So features are now all column names EXCEPT for the Target
    features_list = list(df_train_scaled)
    if target_feature != '':
        features_list.remove(target_feature)
    
    X_train = df_train_scaled[features_list].values
    X_val = df_val_scaled[features_list].values
    X_test = df_test_scaled[features_list].values
    X_val_test = df_val_test_scaled[features_list].values
    X_train_val = df_train_val_scaled[features_list].values
    if target_feature == '':
        X_full = df_full_scaled[features_list].values
    else:
        X_full = []
    
    
    #### OUTPUTS ####
    dict_model_inputs = {'X_train': X_train, 
                         'X_val': X_val, 
                         'X_test': X_test, 
                         'X_val_test': X_val_test, 
                         'X_train_val': X_train_val,
                         'X_full': X_full, 
                         'y_train': y_train, 
                         'y_val': y_val, 
                         'y_test': y_test, 
                         'y_val_test': y_val_test, 
                         'y_train_val': y_train_val
                        }
    
    return dict_model_inputs