#1 - Introduction

* Name : Rahardiansyah Fatoni
* Batch : RMT-027
* Objective : To validate the data cleaned through airflow using Great Expectation (GX).

#2 - Setting up Great Expectation

In [1]:
# Install the library

!pip install -q great-expectations

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25h

##2.1 - Create Data Context

In [2]:
# Create a data context

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

##2.2 - Connect to a `Datasource`

In [3]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'p2m3_rahardiansyah_fatoni_data_clean'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'videogame_sales'
path_to_data = 'https://raw.githubusercontent.com/rahardianfatoni/milestone/main/P2M3_rahardiansyah_fatoni_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

  and should_run_async(code)



##2.3 - Create `Expectation Suite`

In [4]:
# Creat an expectation suite
expectation_suite_name = 'expectation-videgame-sales'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


##2.4 - Expectations

An `Expectation` is a verifiable assertion about source data. Similar to assertions in traditional Python unit tests, Expectations provide a flexible, declarative language for describing expected behaviors.

**Expectations is basically what do we expect from the data.**

For example we expect our columns:
- not to be empty
- to be unique
- should be between x and y
- should match with regex
- and many more

###2.4.1 - Expectation 1

In [5]:
# Expectation 1 : Column `rank` must be unique

validator.expect_column_values_to_be_unique('rank')

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "rank",
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.2 - Expectation 2

In [6]:
# Expectation 2 : Column `global_sales` must be more than 0

validator.expect_column_values_to_be_between(
    column='global_sales', min_value=0
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "global_sales",
      "min_value": 0,
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.3 - Expectation 3

In [7]:
# Expectation 3 : Column `genre` must contain one of the following 6 things :
# 1 = Sports
# 2 = Platform
# 3 = Racing
# 4 = Role-Playing
# 5 = Puzzle
# 6 = Misc
# 7 = Shooter
# 8 = Simulation
# 9 = Action
# 10 = Fighting
# 11 = Adventure
# 12 = Strategy

validator.expect_column_values_to_be_in_set('genre', ['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure', 'Strategy'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])



{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "genre",
      "value_set": [
        "Sports",
        "Platform",
        "Racing",
        "Role-Playing",
        "Puzzle",
        "Misc",
        "Shooter",
        "Simulation",
        "Action",
        "Fighting",
        "Adventure",
        "Strategy"
      ],
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.4 - Expectation 4

In [8]:
# Expectation 4 : Column `year` must in form of integer or float

validator.expect_column_values_to_be_in_type_list('year', ['integer', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "year",
      "type_list": [
        "integer",
        "float"
      ],
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.5 - Expectation 5

In [9]:
# Expectation 5 : The row count (entries) to equal 16291

validator.expect_table_row_count_to_equal(16291)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_equal",
    "kwargs": {
      "value": 16291,
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 16291
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.6 - Expectation 6

In [10]:
# Expectation 6 : The column count (entries) to equal 11

validator.expect_table_column_count_to_equal(11)

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_column_count_to_equal",
    "kwargs": {
      "value": 11,
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 11
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

###2.4.7 - Expectation 7

In [11]:
# Expectation 6 : The `global_sales` must be decreasing as the rank of the videogame increases,
# mostly is added since the `global_sales` column is in million dollars and therefore a difference in 0.1 is about $ 100,000 difference.

validator.expect_column_values_to_be_decreasing('global_sales', mostly = 0.1)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_decreasing",
    "kwargs": {
      "mostly": 0.1,
      "column": "global_sales",
      "batch_id": "p2m3_rahardiansyah_fatoni_data_clean-videogame_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 3,
    "unexpected_percent": 0.018415075808728743,
    "partial_unexpected_list": [
      1.59,
      1.17,
      0.27
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.018415075808728743,
    "unexpected_percent_nonmissing": 0.018415075808728743
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

##2.5 - Data Docs

In [17]:
# Build data docs

context.build_data_docs()

{'local_site': 'file:///content/gx/uncommitted/data_docs/local_site/index.html'}