In [None]:
import numpy as np
from sklearn import datasets
from matplotlib import pyplot as plt, rcParams
from zaps.eda import UniStat, Dist, NumAna, Olrs

# set matplotlib style: automatic layout, no right/top axes spines
rcParams.update({
    'figure.autolayout': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
})

#### Test Data

In [None]:
# load dataset - take the combined frame from the returned bunch
df = datasets.load_breast_cancer(as_frame=True).frame

df.head()

In [None]:
# column types
cat_cols = ['target']
# numeric features are all columns that precede `target`; anchoring on the
# label instead of `df.columns[:-1]` keeps this cell idempotent when it is
# re-run after the extra categorical columns below have been appended
num_cols = df.columns[:df.columns.get_loc('target')]

# outliers - fitted on the numeric features only
lrs = Olrs(num_cols, hide_p_bar = True)
lrs_df = lrs.fit_transform(df)

# more categories for grouping (seeded so the random columns are reproducible)
np.random.seed(45)

df['multi_cat'] = np.random.choice(['a', 'b', 'c', 'd'], p = [.4, .3, .2, .1], size = len(df))
df['cat_feat'] = np.random.choice(['e', 'f'], p = [.4, .6], size = len(df))
# includes `None` so that missing categories are also covered
df['multi_cat_none'] = np.random.choice(['g', 'h', None], p = [.1, .4, .5], size = len(df))

### Univariate Analysis plots

In [None]:
# univariate stats helper; `target` is excluded by name rather than by a
# positional slice, so the selection does not depend on how many columns
# were appended to `df` after it
u_s = UniStat(df = df, 
              col_drop = df.columns[df.columns == 'target'], # `target` column
              card_thresh = 2,
              rare_thresh = .3,  
              figsize = (5,5),
              n_rows = 2,
              n_cols = 1,
              hide_p_bar = True)

In [None]:
# missing values, high cardinality and rare categories
u_s.stats_plot(height=300, width=800)

In [None]:
# skewed feats - skew plot from the `UniStat` instance `u_s`
u_s.skew_plot()

In [None]:
# adjusting thresholds: remove the stored `z_hc_data_` attribute first,
# then set the new thresholds and plot again
delattr(u_s, 'z_hc_data_')

u_s.card_thresh, u_s.rare_thresh = 4, .1
u_s.stats_plot(height=300, width=800)

In [None]:
# attributes - every lookup is resolved before anything is displayed
display(*[
    getattr(u_s, attr_name)
    for attr_name in ('z_hc_data_', 'z_miss_data_', 'z_rare_cat_', 'z_summary_')
])

### Distribution plots

In [None]:
# generic setup - keyword values reused by the distribution plot calls
hue = 'target' # None
hue_agg = ['count']
log_scale, tight = False, True
color, palette, theme = 'red', 'Set1', 'brown'
nbins, axis = 'auto', 'both'
x_ax_rotation = 25

In [None]:
# iterative plotting - histograms
_ = Dist(df=df, cols=num_cols[:2], hide_p_bar=True)

_.hs(
    # histogram-specific options
    bins='fd',
    stat='probability',
    multiple='layer',
    element='step',
    fill=False,
    discrete=False,
    # generic setup
    hue_agg=hue_agg,
    log_scale=log_scale,
    color=color,
    palette=palette,
    nbins=nbins,
    axis=axis,
    tight=tight,
    x_ax_rotation=x_ax_rotation,
    theme=theme,
)

In [None]:
# iterative plotting - kernel density
_.kd(
    # kde-specific options
    cut=3,
    bw_method='silverman',
    bw_adjust=3,
    warn_singular=True,
    # generic setup
    hue_agg=hue_agg,
    log_scale=log_scale,
    color=color,
    palette=palette,
    nbins=nbins,
    axis=axis,
    tight=tight,
    x_ax_rotation=x_ax_rotation,
    theme=theme,
)

In [None]:
# iterative plotting - box plot
_.bo(
    # box-plot-specific options
    hue=hue,
    fill=True,
    showmeans=True,
    meanprops={'linewidth': 2, 'color': 'black'},
    medianprops={'linewidth': 1.5, 'color': 'cyan'},
    whis=3,
    fliersize=10,
    # generic setup
    hue_agg=hue_agg,
    log_scale=log_scale,
    color=color,
    palette=palette,
    nbins=nbins,
    axis=axis,
    tight=tight,
    x_ax_rotation=x_ax_rotation,
    theme=theme,
)

In [None]:
# iterative plotting - violin plot
_.vi(
    # violin-specific options
    hue=hue,
    fill=False,
    inner='box',
    split=True,
    cut=3,
    bw_method='silverman',
    bw_adjust=1,
    density_norm='count',
    # generic setup
    hue_agg=hue_agg,
    log_scale=log_scale,
    color=color,
    palette=palette,
    nbins=nbins,
    axis=axis,
    tight=tight,
    x_ax_rotation=x_ax_rotation,
    theme=theme,
)

In [None]:
# iterative plotting - count plot
_ = Dist(
    df=df,
    cols=['multi_cat', 'cat_feat', 'multi_cat_none', 'target'],
    hide_p_bar=True,
)

_.cp(
    # count-plot-specific options
    stat='count',
    native_scale=False,
    legend='auto',
    # generic setup
    hue_agg=hue_agg,
    log_scale=log_scale,
    color=color,
    palette=palette,
    nbins=nbins,
    axis=axis,
    tight=tight,
    x_ax_rotation=x_ax_rotation,
    theme=theme,
)

In [None]:
# iterative plotting - cat feats vs discat target
_ = Dist(
    df=df,
    cols=['multi_cat_none', 'cat_feat'],
    target='target',
    hide_p_bar=True,
)

_.cp()

In [None]:
# iterative plotting - cat feats vs cat target
_ = Dist(
    df=df,
    cols=['multi_cat_none', 'target'],
    target='multi_cat',
    hide_p_bar=True,
)

_.cp()

In [None]:
# iterative plotting - num feats vs cat target
_ = Dist(
    df=df,
    cols=num_cols[:2],
    target='cat_feat',
    hide_p_bar=True,
)

_.hs()

In [None]:
# with hue - box plot on the current `Dist` instance, passing the `hue`
# value from the generic setup
_.bo(hue = hue)

In [None]:
# iterative plotting - num feats vs discat target
_ = Dist(
    df=df,
    cols=num_cols[:2],
    target='target',
    hide_p_bar=True,
)

_.hs()

In [None]:
# finding best fitting distribution among the listed candidates
_.best_fit(
    method='parametric',
    distr=['norm', 'expon', 'uniform', 'lognorm'],
    stats='RSS',
    n_boots=50,
)

In [None]:
# visualize fit results of the `best_fit` call made on this `Dist` instance
_.best_vis()

In [None]:
# fitted model stored for 'mean radius' in the best-fit results
_.z_best_fit_results_['mean radius'].model

### Numeric Analysis plots

In [None]:
# correlation plots
n_a = NumAna(df, num_cols, 'target', hide_p_bar=True)
corr_mtrx, feat_corr_mtrx = n_a.corr(
    disp_corr='spearman',
    quant=.25,
    thresh=.8,
    alpha=.05,
    plot=True,
)

# inspect two features in both returned matrices
display(
    corr_mtrx[['area error', 'mean radius']],
    feat_corr_mtrx[['area error', 'mean radius']].T,
)

In [None]:
# logistic fit plots
n_a = NumAna(
    df=df,
    cols=num_cols[:5],
    target='target',
    # fit = 'logit',  (left unset, so the class default applies)
    method='cg',
    figsize=(15, 3),
    n_rows=1,
    n_cols=4,
    silent=True,
    hide_p_bar=True,
    theme='darkorange',
    warn_convergence=True,
    gtol=1e-6,  # kwarg for 'cg' method
)
# keep whatever `fit_models` returns under the same name, as the chained form did
n_a = n_a.fit_models()

In [None]:
# attributes of the fit stored for 'mean radius'
mean_radius_fit = n_a.z_fit_results_['mean radius']
display(
    mean_radius_fit.params,
    mean_radius_fit.prsquared,
    mean_radius_fit.llr_pvalue,
)

# plot
n_a.vis_fit(x_jitter=.01, tc_color='r')

In [None]:
# OLS fit plots - numeric target, explicit `fit`
n_a = NumAna(
    df=df,
    cols=num_cols[:4],
    target='mean concavity',
    fit='ols',
    figsize=(15, 3),
    n_rows=1,
    n_cols=4,
    hide_p_bar=True,
    theme='black',
)

n_a.vis_fit(x_ax_rotation=20)

In [None]:
# OLS assumptions plots for the `fit = 'ols'` instance held in `n_a`
n_a.vis_ols_fit()

In [None]:
# lowess fit plots
n_a = NumAna(
    df=df,
    cols=num_cols[:4],
    target='mean concavity',
    fit='lws',
    figsize=(15, 3),
    n_rows=1,
    n_cols=4,
    hide_p_bar=True,
    theme='orange',
)
# pass the outlier mapping from `lrs`; `n_a` ends up holding the return
# value of `vis_fit`, exactly as in the chained form
n_a = n_a.vis_fit(olrs_mapping=lrs.z_olrs_, olrs_color='b')

In [None]:
# polynomial fit plots
n_a = NumAna(
    df=df,
    cols=num_cols[:4],
    target='mean concavity',
    degree=3,
    fit='lws',  # should be ignored
    figsize=(15, 3),
    n_rows=1,
    n_cols=4,
    hide_p_bar=True,
    theme='red',
)
# `n_a` ends up holding the return value of `vis_fit`, as in the chained form
n_a = n_a.vis_fit()

In [None]:
# multivariate plot - logistic fit

# what's tested?
# `category_orders` passing specific order overriding default one
# `hoverdata` numeric format, removal of facets and inclusion of index
# `z_plotly_fit_` attribute
# ignoring `facet_col_wrap`
# overlaying outliers and fit assignment per facet
# title, theme, traces

n_a = NumAna(df, num_cols, 'target', hide_p_bar=True)

n_a.vis_multi(
    col='mean radius',
    olrs_idx=lrs.z_olrs_['mean radius'],
    facet_col='multi_cat',
    facet_row='cat_feat',
    category_orders=dict(
        multi_cat=['c', 'a', 'd', 'b'],
        cat_feat=['f', 'e'],
    ),
    facet_col_wrap=3,  # ignored in favor of "facet_row"
    title='test pass',
    height=600,
    theme='azure',
)

In [None]:
# confirm fit results of 2nd column and row [a,e]
# ('a' and 'e' are the second entries in the `category_orders` passed to `vis_multi`)
n_a.z_plotly_fit_.iloc[6]['fit'].summary()

In [None]:
# confirm outlier points mapping
in_facet = (df.multi_cat == 'a') & (df.cat_feat == 'e') # [a,e]
subset = df[in_facet]

# keep only the 'mean radius' outlier labels that fall inside the facet
idx = lrs.z_olrs_["mean radius"][lrs.z_olrs_["mean radius"].isin(subset.index)]

subset.loc[idx, ['mean radius', 'target']]

In [None]:
# test multivariate plot - Poly fit

# what's tested?
# `facet_col_wrap`
# fit assignment per facet
# log checks
# trace text
# `z_plotly_fit_` attribute

n_a = NumAna(df, num_cols, 'mean radius', degree=3, hide_p_bar=True)

n_a.vis_multi(
    col='mean concavity',
    olrs_idx=lrs.z_olrs_['mean concavity'],
    facet_col='multi_cat',
    category_orders=dict(multi_cat=['b', 'a', 'c', 'd']),
    facet_col_wrap=2,
    trendline_options=dict(log_x=True),
)

In [None]:
# confirm fit results [a] - list indexer selects position 2 only
n_a.z_plotly_fit_.iloc[[2]]

In [None]:
# ignoring `facet_col_wrap`
n_a.vis_multi(
    col='mean concavity',
    olrs_idx=lrs.z_olrs_['mean concavity'],
    facet_col='multi_cat',
    facet_col_wrap=2,  # ignored in favor of 'trendline_scope'
    trendline_scope='overall',
)

In [None]:
# confirm fit results - overall (`trendline_scope = "overall"` in the preceding call)
n_a.z_plotly_fit_

In [None]:
# more grouping [color, symbol]
n_a.vis_multi(
    col='mean concavity',
    olrs_idx=lrs.z_olrs_['mean concavity'],
    # grouping
    color='mean texture',
    symbol='cat_feat',
    facet_col='multi_cat',
    facet_row='target',
    facet_col_wrap=0,
    # layout
    width=1100,
    height=500,
)

In [None]:
# confirm fit results [e,c,0] - entry at position 11 of the stored plotly fit results
n_a.z_plotly_fit_.iloc[11]

In [None]:
# test 3D multivariate plot - logistic fit

# what's tested?
# overlaying outliers
# z-axis assigned to 'target'
# hovertext
# layout

n_a = NumAna(df, num_cols, 'target', hide_p_bar=True)

n_a.vis_multi_d(
    x='mean concavity',
    y='mean texture',
    olrs_idx=lrs.z_olrs_['mean concavity'],
    color='mean radius',
)

### Input validation not covered in pytest

In [None]:
# Dist(df, num_cols).cp() # feature categorization in count plot

In [None]:
# Dist(df, cat_cols, target = 'area error').cp() # target categorization in count plot

In [None]:
# Dist(df, ['multi_cat', 'cat_feat']).kd() # missing `target` in kde plot

In [None]:
# Dist(df, num_cols).best_vis() # Dist instance not fitted

In [None]:
# _ = Dist(df, cat_cols)

# _.best_fit(method = 'discrete', distr = ['expon'])

# _.best_vis() # visualizing `discrete` results

In [None]:
# NumAna(df, num_cols, target = 'mean concavity').vis_fit(olrs_idx = lrs.z_olrs_) # passing olrs mapping to index param

In [None]:
# # passing olrs idx to mapping param
# NumAna(df, num_cols, target = 'mean concavity').vis_fit(olrs_mapping = lrs.z_olrs_['mean concavity'])

In [None]:
# NumAna(df, num_cols, target = 'target').vis_ols_fit() # calling ols vis on logistic fit