# I. Introduction
Name : Reido Vidaya Mahardika  
Batch : RMT-40


In [1]:
# Install the pinned libraries. Uncomment the line below on a fresh environment;
# in a notebook, `%pip` is preferable to `!pip` because it targets the kernel's environment.

# !pip install -q "great-expectations==0.18.19" "numpy==1.24.3"

In [2]:
# Display the installed NumPy version to confirm it matches the version pinned in the install cell.
import numpy as np
np.__version__

'1.24.3'

# II. Instantiate Data Context

In [3]:
# Instantiate a file-based Great Expectations data context rooted at the current directory.
from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

In [4]:
# Register the cleaned CSV as a pandas Datasource with a single CSV data asset.

# Datasource name; it must be unique among the Datasources in this context.
datasource_name = 'csv-data-clean'
# `add_or_update_pandas` (rather than `add_pandas`) keeps this cell re-runnable:
# the file-based context persists its configuration, so on a second
# "Restart & Run All" the Datasource name is already registered.
datasource = context.sources.add_or_update_pandas(datasource_name)

# Name of the data asset and the CSV file it reads.
asset_name = 'car_sales'
# NOTE(review): absolute Colab path -- adjust when running outside Colab.
path_to_data = '/content/P2M3_Reido_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to fetch data from the asset.
batch_request = asset.build_batch_request()

# III. Create an Expectation Suite

In [5]:
# Register the expectation suite under a fixed name (added or updated).
expectation_suite_name = 'expectation-car-sales'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Bind the batch request to that suite through a validator.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows of the batch to confirm the data loaded.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,car_id,date,customer_name,gender,annual_income,dealer_name,company,model,engine,transmission,color,price,dealer_no,body_style,phone,dealer_region
0,C_CND_000001,2022-01-02,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown
1,C_CND_000002,2022-01-02,Gia,Male,1480000,C & M Motors Inc,Dodge,Durango,DoubleÂ Overhead Camshaft,Auto,Black,19000,60504-7114,SUV,6848189,Aurora
2,C_CND_000003,2022-01-02,Gianna,Male,1035000,Capitol KIA,Cadillac,Eldorado,Overhead Camshaft,Manual,Red,31500,38701-8047,Passenger,7298798,Greenville
3,C_CND_000004,2022-01-02,Giselle,Male,13500,Chrysler of Tri-Cities,Toyota,Celica,Overhead Camshaft,Manual,Pale White,14000,99301-3882,SUV,6257557,Pasco
4,C_CND_000005,2022-01-02,Grace,Male,1465000,Chrysler Plymouth,Acura,TL,DoubleÂ Overhead Camshaft,Auto,Red,24500,53546-9427,Hatchback,7081483,Janesville


# IV. Expectations

In [6]:
# Expectation 1: every value in column `car_id` must be unique.
validator.expect_column_values_to_be_unique(column='car_id')




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "car_id",
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "car_id" column meets the expect_column_values_to_be_unique expectation, which means every value in it is unique.

In [7]:
# Expectation 2: every `price` value must fall within the range 0 to 100000.
validator.expect_column_values_to_be_between(
    column='price',
    min_value=0,
    max_value=100000,
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "price",
      "min_value": 0,
      "max_value": 100000,
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "price" column meets the expectation, meaning every value in it is within the range of 0 - 100000.

In [8]:
# Expectation 3: every `gender` value must be either 'Male' or 'Female'.
validator.expect_column_values_to_be_in_set(
    column='gender',
    value_set=['Male', 'Female'],
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "gender",
      "value_set": [
        "Male",
        "Female"
      ],
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "gender" column meets the expectation, meaning that the column only contains the two values "Male" and "Female".

In [9]:
# Expectation 4: values in column `model` must be of type `str`.
validator.expect_column_values_to_be_in_type_list(
    column='model',
    type_list=['str'],
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "model",
      "type_list": [
        "str"
      ],
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "model" column meets the expectation, meaning the column only contains string data types.

In [10]:
# Expectation 5: the mean of column `price` must be between 100 and 50000.
validator.expect_column_mean_to_be_between(
    column='price',
    min_value=100,
    max_value=50000,
)




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_mean_to_be_between",
    "kwargs": {
      "column": "price",
      "min_value": 100,
      "max_value": 50000,
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 28236.4144
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "price" column has an average value of 28236.4144, which meets the established expectation of being between 100 and 50000.

In [11]:
# Expectation 6: the median of column `price` must be between 100 and 50000.
validator.expect_column_median_to_be_between(column='price', min_value=100, max_value=50000)




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_median_to_be_between",
    "kwargs": {
      "column": "price",
      "min_value": 100,
      "max_value": 50000,
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 23000.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "price" column has a median value of 23,000, which means it meets the established expectation of being between 100 and 50,000.

In [12]:
# Expectation 7: each `transmission` value must be at most 10 characters long.
validator.expect_column_value_lengths_to_be_between(column='transmission', max_value=10)




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_value_lengths_to_be_between",
    "kwargs": {
      "column": "transmission",
      "max_value": 10,
      "batch_id": "csv-data-clean-car_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

From the results above, it can be concluded that the "transmission" column does not contain any string longer than 10 characters; it therefore meets the established expectation of a maximum length of 10.

In [13]:
# Persist the expectations defined above into the expectation suite.
# `discard_failed_expectations=False` asks the validator to keep every expectation,
# including any that did not pass on this batch.

validator.save_expectation_suite(discard_failed_expectations=False)

# V. Checkpoint

In [14]:
# Register (add or update) a checkpoint named 'checkpoint_1' built from the validator.
checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [15]:
# Run the checkpoint to validate the batch against the expectation suite.
checkpoint_result = checkpoint_1.run()

# Surface the overall outcome so a failed validation is visible in the notebook
# instead of passing silently.
checkpoint_result.success

Calculating Metrics:   0%|          | 0/41 [00:00<?, ?it/s]

# VI. Data Docs

In [16]:
# Build the Data Docs site; the call's return value (site name mapped to its index URL)
# is displayed as the cell output.

context.build_data_docs()

{'local_site': 'file:///content/gx/uncommitted/data_docs/local_site/index.html'}