# Great Expectations Task

##0. Clean Up

In [None]:
!pip uninstall -y numpy pandas great_expectations


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.1.4
Uninstalling pandas-2.1.4:
  Successfully uninstalled pandas-2.1.4
Found existing installation: great-expectations 1.3.12
Uninstalling great-expectations-1.3.12:
  Successfully uninstalled great-expectations-1.3.12


In [None]:
!rm -rf /usr/local/lib/python3.11/dist-packages/numpy*
!rm -rf /usr/local/lib/python3.11/dist-packages/pandas*

## 1. Install Great Expectations Library


In [None]:
!pip install --no-cache-dir numpy pandas great_expectations
#!pip install great_expectations

Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting great_expectations
  Downloading great_expectations-1.3.12-py3-none-any.whl.metadata (8.6 kB)
Collecting pandas
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m136.0 MB/s[0m eta [36m0:00:00[0m
Downloading great_expectations-1.3.12-p

##2. Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import great_expectations as gx



##3. Load Labels.csv

Download and upload the [Labels.csv](https://github.com/zubxxr/SOFE3980U-Lab5/blob/main/Labels.csv) into this notebook, and then load the file.

In [None]:
from google.colab import files

uploaded = files.upload()  # This will prompt you to upload Labels.csv
df = pd.read_csv("Labels.csv")  # Load the uploaded file

Saving Labels.csv to Labels (1).csv


##4. Preview the Dataset

In [None]:
df.head()

Unnamed: 0,Timestamp,Car1_Location_X,Car1_Location_Y,Car1_Location_Z,Car2_Location_X,Car2_Location_Y,Car2_Location_Z,Occluded_Image_view,Occluding_Car_view,Ground_Truth_View,pedestrianLocationX_TopLeft,pedestrianLocationY_TopLeft,pedestrianLocationX_BottomRight,pedestrianLocationY_BottomRight
0,1736796157,-51.402977,143,0.596902,-59.32027,140,0.596902,A_001.png,B_001.png,C_001.png,593,361,610,410
1,1736796167,-53.819637,143,0.596902,-59.196568,140,0.596902,A_002.png,B_002.png,C_002.png,579,368,594,415
2,1736796178,-50.239144,143,0.596902,-56.744479,140,0.596902,A_003.png,B_003.png,C_003.png,854,720,854,720
3,1736796188,-53.70722,143,0.596902,-57.30938,140,0.596902,A_004.png,B_004.png,C_004.png,549,368,567,425
4,1736796198,-52.053721,143,0.596902,-59.545897,140,0.596902,A_005.png,B_005.png,C_005.png,524,368,537,413


##5. Set Up Great Expectations Context and Data Source

In [21]:


context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="labels_asset")

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpo9d_xs94' for ephemeral docs site


##6. Define and Create a Data Batch

In [22]:
batch_definition = data_asset.add_batch_definition_whole_dataframe("labels_batch")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})


##7. Define Three Expectations for Column Values

Using this [link](https://greatexpectations.io/expectations/), choose three expectation functions and apply them to the labels dataset in a relevant manner.

You should replace the 'ExpectColumnValuesToBeBetween' function with other functions you select from the link.

You can also check the format/parameters required of each function when you click "See more" on the function.

In [None]:
## Original Function
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="column", min_value=0, max_value=20
)

## Example Function

## This function only requires a column parameter, and not a max or min value
expectation = gx.expectations.ExpectColumnValuesToBeUnique(
    column="column"
)

### Expectation 1

In [23]:
expectation1 = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="label"
)


### Validate Data Against Expectation 1

In [24]:
result1 = batch.validate(expectation1)
print(result1)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "column": "label",
      "batch_id": "pandas-labels_asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "MetricConfigurationID(metric_name='column_values.nonnull.condition', metric_domain_kwargs_id='d93e8b477aeb45ca7c3e64c5f396a01c', metric_value_kwargs_id=())": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/execution_engine/execution_engine.py\", line 534, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/expectations/metrics/metric_provider.py\", line 99, in inner_func\n    return metric_fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/usr/local/lib/p

### Expectation 2

In [25]:


expectation2 = gx.expectations.ExpectColumnValuesToBeInSet(
    column="label",
    value_set=["cat", "dog", "bird"]
)


### Validate Data Against Expectation 2

In [26]:


result2 = batch.validate(expectation2)
print(result2)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "label",
      "value_set": [
        "cat",
        "dog",
        "bird"
      ],
      "batch_id": "pandas-labels_asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "MetricConfigurationID(metric_name='column_values.nonnull.condition', metric_domain_kwargs_id='8e71924e7ee2b200202d51461bcf2326', metric_value_kwargs_id=())": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/execution_engine/execution_engine.py\", line 534, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/expectations/metrics/metric_provider.py\", line 99, in inner_func\n    return metric_fn(*args, **

### Expectation 3

In [27]:


expectation3 = gx.expectations.ExpectColumnValuesToBeUnique(
    column="filename"
)

### Validate Data Against Expectation 3

In [28]:


result3 = batch.validate(expectation3)
print(result3)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "filename",
      "batch_id": "pandas-labels_asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "MetricConfigurationID(metric_name='column_values.unique.condition', metric_domain_kwargs_id='bb28fd93dc8072f7ec5aea209b8bdabb', metric_value_kwargs_id=())": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/execution_engine/execution_engine.py\", line 534, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/expectations/metrics/metric_provider.py\", line 99, in inner_func\n    return metric_fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/usr/local/lib/p