In [1]:
import pandas as pd

In [2]:
from pdexplore.numeric import (
    BasicNumericExploration,
)

# Checking the precondition

In [3]:
# Small demo frame used by every cell below.
#   * 'age'  - one entry is missing (only three non-null values) and one
#              entry is 2^64 - 1, the largest unsigned 64-bit integer.
#   * 'name' - plain string labels.
#   * 'rank' - fully populated; one entry is 2^16 - 1 (65535), the largest
#              unsigned 16-bit integer.
df = pd.DataFrame(
    columns=['age', 'name', 'rank'],
    data=[
        [23, 'Jo', 1],
        [2**64 - 1, 'Bo', 4],
        [82, 'Go', 2**16 - 1],
        [None, 'No', 1],
    ],
)

In [4]:
# Display the whole frame (it has only four rows) to inspect the test data
# before running any exploration on it.
df

Unnamed: 0,age,name,rank
0,23.0,Jo,1
1,1.844674e+19,Bo,4
2,82.0,Go,65535
3,,No,1


In [5]:
# Instantiate BasicNumericExploration with no arguments; the same instance is
# applied to the 'age' and 'rank' columns in the following cells.
bexp = BasicNumericExploration()

In [6]:
# 'age' contains only three non-null values (the last row is None), so this
# call is expected to report that the exploration was skipped.
bexp.apply(df['age'])

[38;5;244mBasic numeric exploration skipped. Reason: Less than four non-null values.[0m


In [7]:
# 'rank' has four non-null values, so here the exploration actually runs and
# prints its summary statistics.
bexp.apply(df['rank'])


--- Starting numeric data exploration ---
Data min=1.00, max=65,535.00.
Data mean is 16,385.25, std is 32,766.50
It's also usefull to examine the two corresponding outlier-robust stats:
Median=2.50, median absolute deviation (MAD)=2.22.
Data skewness is 1.15. For normally distributed data, the skewness should be about 0. A skewness value > 0 means that there is more weight in the left tail of the distribution.


# Checking the pipeline

In [8]:
from pdexplore.numeric import (
    run_numeric_exploration_pipeline,
)

In [9]:
# Run the numeric exploration pipeline on the 'rank' column.
# NOTE(review): judging by the recorded output, the pipeline runs the basic
# numeric exploration first and then further stages (normality tests and a
# suspicious-value check), each of which may be skipped - confirm against
# the pdexplore.numeric implementation.
run_numeric_exploration_pipeline(df['rank'])


--- Starting numeric data exploration ---
Data min=1.00, max=65,535.00.
Data mean is 16,385.25, std is 32,766.50
It's also usefull to examine the two corresponding outlier-robust stats:
Median=2.50, median absolute deviation (MAD)=2.22.
Data skewness is 1.15. For normally distributed data, the skewness should be about 0. A skewness value > 0 means that there is more weight in the left tail of the distribution.
[38;5;244mSkewness test skipped. Reason: Less than eight non-null values.[0m
Performing the Shapiro-Wilk test for normality...
Null hypothesis (H0): The data comes from a normal dist.
Test statistic: 0.630 p-value: 0.001
The p-value is smaller than the set α; the null hypothesis can be rejected: the data isn't normally distributed.
[38;5;244mD’Agostino’s K^2 normality test skipped. Reason: Less than eight non-null values.[0m
[38;5;3m65,535 found 1 times. It is suspicious, as it is exactly 2^16-1; i.e. the highest number that can be represented by an unsigned 16-bit binary n

# Try the whole thing

In [10]:
from pdexplore import explore, explore_series

In [11]:
# Explore the whole frame in one call.
# NOTE(review): per the recorded output this reports frame-level information
# (column and row counts) and then explores the columns as individual series,
# starting with 'age' - confirm against the pdexplore implementation.
explore(df)

Starting to explore a dataframe with pdexplore.
The dataframe contains 3 columns.
The dataframe contains 4 rows.
[1m
Starting to explore series age with pdexplore.
dtype: float64
4 unique values over 4 entries.
25.00% missing values (1).
[38;5;244mValue counts plot skipped. Reason: Low frequency of most-frequent value.[0m
[38;5;244mBasic numeric exploration skipped. Reason: Less than four non-null values.[0m
[38;5;244mSkewness test skipped. Reason: Less than eight non-null values.[0m
[38;5;244mShapiro-Wilk normality test skipped. Reason: Less than three non-null values.[0m
[38;5;244mD’Agostino’s K^2 normality test skipped. Reason: Less than eight non-null values.[0m
[38;5;3m18,446,744,073,709,551,615 found 1 times. It is suspicious, as it is exactly 2^64-1; i.e. the highest number that can be represented by an unsigned 64-bit binary number. It is therefore the max value for variables declared as integers in many programming language. The appearance of the number may reflect