In [2]:
!pip3 install great_expectations



In [3]:
import great_expectations as gx
import pandas as pd

In [4]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
)

# Using Great Expectations

In [5]:
## Example Setup

context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=6
)

validation_result = batch.validate(expectation)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmptibqrx75' for ephemeral docs site


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
car_df = pd.read_csv("/content/data-a1.csv")

Column Level Validation

In [7]:
context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas_source")
data_asset = data_source.add_dataframe_asset(name="car_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch_def")
batch = batch_definition.get_batch(batch_parameters={"dataframe": car_df})

validator = context.get_validator(batch=batch)

expectations = [
    #column level
    gx.expectations.ExpectColumnValuesToBeBetween(column="year", min_value=1980, max_value=2025),
    gx.expectations.ExpectColumnValuesToBeBetween(column="engine", min_value=0, max_value=16000),
    gx.expectations.ExpectColumnValuesToBeBetween(column="price", min_value=1, max_value=1e9),
    gx.expectations.ExpectColumnValuesToBeBetween(column="mileage", min_value=0, max_value=1e6),
    #row level
    gx.expectations.ExpectTableRowCountToBeBetween(min_value=100, max_value=62302),
    #table level
    gx.expectations.ExpectColumnToExist(column="year"),
    gx.expectations.ExpectColumnToExist(column="engine"),
    gx.expectations.ExpectColumnToExist(column="price"),
    gx.expectations.ExpectColumnToExist(column="mileage"),
    gx.expectations.ExpectColumnValuesToBeNull(column="price")

]

validation_results = []
for expectation in expectations:
    validation_result = batch.validate(expectation)
    validation_results.append(validation_result)


display(validation_results)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpuvr18sl0' for ephemeral docs site


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

[{
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_be_between",
     "kwargs": {
       "batch_id": "pandas_source-car_data_asset",
       "column": "year",
       "min_value": 1980.0,
       "max_value": 2025.0
     },
     "meta": {}
   },
   "result": {
     "element_count": 62302,
     "unexpected_count": 0,
     "unexpected_percent": 0.0,
     "partial_unexpected_list": [],
     "missing_count": 3786,
     "missing_percent": 6.076851465442521,
     "unexpected_percent_total": 0.0,
     "unexpected_percent_nonmissing": 0.0,
     "partial_unexpected_counts": [],
     "partial_unexpected_index_list": []
   },
   "meta": {},
   "exception_info": {
     "raised_exception": false,
     "exception_traceback": null,
     "exception_message": null
   }
 },
 {
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_be_between",
     "kwargs": {
       "batch_id": "pandas_source-car_data_asset",
       "column": "engine",
    

In [8]:
matches_df = pd.read_csv("/content/matches.csv")

In [9]:
!pip install dbt-core



In [13]:
context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas_source_matches")
data_asset = data_source.add_dataframe_asset(name="matches_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("matches_batch_def")
batch = batch_definition.get_batch(batch_parameters={"dataframe": matches_df})

validator = context.get_validator(batch=batch)

expectations = [
    # Column Level Expectations
    gx.expectations.ExpectColumnValuesToBeBetween(column="season", min_value=2008, max_value=2024),
    gx.expectations.ExpectColumnValuesToNotBeNull(column="city"),
    gx.expectations.ExpectColumnValuesToMatchRegex(column="team1", regex=r"[A-Za-z\s]+"),

    # Row Level Expectations
    gx.expectations.ExpectTableRowCountToBeBetween(min_value=100, max_value=1100),

    #table level
    gx.expectations.ExpectTableColumnCountToEqual(value=20),

    #distribution level expectation
    gx.expectations.ExpectColumnStdevToBeBetween(column="season", min_value=0, max_value=1)
]


# Validate each expectation individually and store results
validation_results = []
for expectation in expectations:
    validation_result = batch.validate(expectation)
    validation_results.append(validation_result)

display(validation_results)

#table level expectation
validator.expect_compound_columns_to_be_unique(['season', 'city', 'date'])


INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpfbye8rth' for ephemeral docs site


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

[{
   "success": false,
   "expectation_config": {
     "type": "expect_column_values_to_be_between",
     "kwargs": {
       "column": "season",
       "min_value": 2008.0,
       "max_value": 2024.0,
       "batch_id": "pandas_source_matches-matches_data_asset"
     },
     "meta": {}
   },
   "result": {},
   "meta": {},
   "exception_info": {
     "MetricConfigurationID(metric_name='column_values.between.condition', metric_domain_kwargs_id='8ec39d99c73ab9462ba930a1d357bbbe', metric_value_kwargs_id='77e18d0ac4ff6a5bc87b9c170d3b87f7')": {
       "exception_traceback": "Traceback (most recent call last):\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/execution_engine/execution_engine.py\", line 533, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"/usr/local/lib/python3.11/dist-packages/great_expectations/expectations/metrics/metric_provider.py\", lin



Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "type": "expect_compound_columns_to_be_unique",
    "kwargs": {
      "batch_id": "pandas_source_matches-matches_data_asset",
      "column_list": [
        "season",
        "city",
        "date"
      ]
    },
    "meta": {}
  },
  "result": {
    "element_count": 1095,
    "unexpected_count": 46,
    "unexpected_percent": 4.200913242009133,
    "partial_unexpected_list": [
      {
        "season": "2009",
        "city": "Cape Town",
        "date": "2009-04-18"
      },
      {
        "season": "2009",
        "city": "Cape Town",
        "date": "2009-04-18"
      },
      {
        "season": "2009",
        "city": "Cape Town",
        "date": "2009-04-19"
      },
      {
        "season": "2009",
        "city": "Cape Town",
        "date": "2009-04-19"
      },
      {
        "season": "2009",
        "city": "Durban",
        "date": "2009-04-29"
      },
      {
        "season": "2009",
        "city": "Durban",
      

Row Level Validation

In [14]:
reddit_df = pd.read_csv("/content/reddit_posts.csv")

In [16]:
context = gx.get_context()

data_source = context.data_sources.add_pandas("pandas_source_reddit")
data_asset = data_source.add_dataframe_asset(name="reddit_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("reddit_batch_def")
batch = batch_definition.get_batch(batch_parameters={"dataframe": reddit_df})

validator = context.get_validator(batch=batch)

expectations = [
    # Column Level Expectations
    gx.expectations.ExpectColumnValuesToNotBeNull(column="subreddit"),
    gx.expectations.ExpectColumnValuesToNotBeNull(column="title"),
    gx.expectations.ExpectColumnValuesToNotBeNull(column="url"),

    gx.expectations.ExpectColumnValuesToBeBetween(column="ups", min_value=0),
    gx.expectations.ExpectColumnValuesToBeBetween(column="downs", min_value=0),
    gx.expectations.ExpectColumnValuesToBeBetween(column="score", min_value=0),
    gx.expectations.ExpectColumnValuesToBeBetween(column="num_comments", min_value=0),

    gx.expectations.ExpectColumnValuesToBeBetween(column="upvote_ratio", min_value=0, max_value=1),
    gx.expectations.ExpectColumnValuesToMatchRegex(column="created_utc", regex=r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}"),

    # Table Level Expectations
    gx.expectations.ExpectTableColumnCountToEqual(value=10),

    # Distribution Level Expectations
    gx.expectations.ExpectColumnStdevToBeBetween(column="upvote_ratio", min_value=0, max_value=0.5),
    gx.expectations.ExpectColumnMeanToBeBetween(column="score", min_value=0, max_value=1000)
]

# Validate each expectation
validation_results = []
for expectation in expectations:
    validation_result = batch.validate(expectation)
    validation_results.append(validation_result)

display(validation_results)


INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmp8n93126l' for ephemeral docs site


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

[{
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_not_be_null",
     "kwargs": {
       "batch_id": "pandas_source_reddit-reddit_data_asset",
       "column": "subreddit"
     },
     "meta": {}
   },
   "result": {
     "element_count": 70,
     "unexpected_count": 0,
     "unexpected_percent": 0.0,
     "partial_unexpected_list": [],
     "partial_unexpected_counts": [],
     "partial_unexpected_index_list": []
   },
   "meta": {},
   "exception_info": {
     "raised_exception": false,
     "exception_traceback": null,
     "exception_message": null
   }
 },
 {
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_not_be_null",
     "kwargs": {
       "batch_id": "pandas_source_reddit-reddit_data_asset",
       "column": "title"
     },
     "meta": {}
   },
   "result": {
     "element_count": 70,
     "unexpected_count": 0,
     "unexpected_percent": 0.0,
     "partial_unexpected_list": [],
     "partial_unexpecte

#**To-do:**

1.   Fetch any dataset from online source. I'll recommend using reddit API used in the assignment #1.
2.   Run 5 expectations of your choice to validate the dataset. They should cover row, column, multi-column, table, distribution operation.
3.   Submit the notebook file on LMS before Monday 11:55 PM.
4.   Mention your group number in the name of the file.

