In [1]:
import great_expectations as gx
import pandas as pd

In [2]:
# Load the cleaned shift records from the project-relative CSV.
# NOTE(review): the previews further down show an "Unnamed: 0" column, which
# looks like a saved index — confirm whether it is meant to be kept.
df = pd.read_csv(filepath_or_buffer="_data/df-clean.csv")

## Create the Data Context

In [3]:
# Create the Data Context; the data source in the next section is registered
# on this object.
# NOTE(review): called with no arguments, so the library chooses the context
# type — presumably an in-memory one for this notebook; confirm.
context = gx.get_context()

## Data Sources & Data Assets

In [4]:
# Register a pandas data source called "pizza_data" on the context.
data_source = context.data_sources.add_pandas(name="pizza_data")

# Add a dataframe asset called "pizza_asset" to that data source.
data_asset = data_source.add_dataframe_asset(name="pizza_asset")

## Batch Definitions & Batches

In [5]:
# Define a batch that covers the whole dataframe.
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    name="my_batch_definition"
)

# A batch is the group of records a validation runs against. The dataframe
# itself is handed over at this point through the batch parameters.
batch_parameters = {"dataframe": df}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

In [7]:
# Preview the start of the batch; with no arguments the output below shows
# five rows.
batch.head()

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 149.56it/s]

   Unnamed: 0       Job Clocked In Clocked Out  Duration  Earnings  \
0           0  Macianos   16:40:00    18:45:00      2.08        59   
1           1  Macianos   16:08:00    21:52:00      5.73       108   
2           2  Macianos   17:01:00    21:15:00      4.23        81   
3           3  Macianos   16:25:00    22:17:00      5.87       124   
4           4  Macianos   16:25:00    21:30:00      5.08       122   

         Date  Month  Day  Year  Is Holiday Holiday Name Day of Week  \
0  2020-08-03      8    3  2020       False          NaN      Monday   
1  2020-08-04      8    4  2020       False          NaN     Tuesday   
2  2020-08-05      8    5  2020       False          NaN   Wednesday   
3  2020-08-06      8    6  2020       False          NaN    Thursday   
4  2020-08-07      8    7  2020       False          NaN      Friday   

   Hourly Rate  Season  
0        28.37  Summer  
1        18.85  Summer  
2        19.15  Summer  
3        21.12  Summer  
4        24.02  Summer  




In [8]:
# fetch_all=True requests every row rather than the short preview; the printed
# frame below is abbreviated with ".." between the first and last rows.
batch.head(fetch_all=True)

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 125.54it/s]


     Unnamed: 0        Job Clocked In Clocked Out  Duration  Earnings  \
0             0   Macianos   16:40:00    18:45:00      2.08        59   
1             1   Macianos   16:08:00    21:52:00      5.73       108   
2             2   Macianos   17:01:00    21:15:00      4.23        81   
3             3   Macianos   16:25:00    22:17:00      5.87       124   
4             4   Macianos   16:25:00    21:30:00      5.08       122   
..          ...        ...        ...         ...       ...       ...   
303         303  Maciano 2   16:23:00    21:33:00      5.17       178   
304         304  Maciano 2   16:48:00    19:42:00      2.90        63   
305         305  Maciano 2   17:10:00    20:30:00      3.33       113   
306         306  Maciano 2   15:49:00    19:28:00      3.65        93   
307         307  Maciano 2   16:40:00    21:31:00      4.85       224   

           Date  Month  Day  Year  Is Holiday Holiday Name Day of Week  \
0    2020-08-03      8    3  2020       False    

In [9]:
# List the batch's column names; the output below has 15 entries.
batch.columns()

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 293.38it/s]


['Unnamed: 0',
 'Job',
 'Clocked In',
 'Clocked Out',
 'Duration',
 'Earnings',
 'Date',
 'Month',
 'Day',
 'Year',
 'Is Holiday',
 'Holiday Name',
 'Day of Week',
 'Hourly Rate',
 'Season']

## Expectations

- Should be 15 columns (the 14 fields listed below plus the `Unnamed: 0` column)

| Field | Type | Notes |
|----------------|--------|--------------------------------|
| Job | String | Should only be *Macianos* or *Macianos 2* |
| Clocked In   | String |                                  |
| Clocked Out | String |                                   |
| Duration    | Float  | Should be greater than 0. Not shift. |
| Earnings    | Int    | Should be greater than 0 and less than 500 |
| Date        | —      |                                            |
| Month       | Int    | Should be any number 1–12 (inclusive)      |
| Day         | Int    | Should be any number 1–31 (inclusive)      |
| Year        | Int    | Should be in {2020, 2021, 2022, 2023}      |
| Is Holiday  | Bool   | Should be in {True, False}                 |
| Holiday Name | String | Should be in list of known holidays (e.g. Christmas Eve, Independence Day, etc.) |
| Day of Week    | String | Should be in {Monday, Tuesday, Wednesday, Thursday, ..., Sunday}    |
| Hourly Rate    | Float  | Should be greater than 0 and less than 60       |
| Season         | String | Should be in {Summer, Spring, Fall, Winter}     |


In [23]:
# Expect the table to hold between 0 and 500 rows.
# NOTE(review): with min_value=0 an empty table presumably passes as well —
# raise the lower bound if an empty file should count as a failure.
row_count_between = gx.expectations.ExpectTableRowCountToBeBetween(
    min_value=0,
    max_value=500
)

# Validate the batch against this single expectation.
validation_results = batch.validate(
    expect=row_count_between
)

# End the cell on the bare expression so the pass/fail flag is shown as the
# cell result, the same way the column-count cell does it.
validation_results.success

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 79.36it/s] 

True





In [26]:
# The expectations list above says the table should have 15 columns, so both
# bounds are pinned to 15. The previous 10–20 range would still pass a table
# with missing or extra columns.
column_count_between = gx.expectations.ExpectTableColumnCountToBeBetween(
    min_value=15,
    max_value=15
)

# Validate the batch against this single expectation.
validation_results = batch.validate(
    expect=column_count_between
)

# Show the pass/fail flag as the cell result.
validation_results.success

Calculating Metrics: 100%|██████████| 3/3 [00:00<00:00, 408.83it/s]


True