In [2]:
!pip install -q great-expectations

In [5]:
from great_expectations.data_context import FileDataContext

# File-backed Great Expectations context rooted at the current directory.
project_root = './'
context = FileDataContext.create(project_root_dir=project_root)

# 1 - Connect to a Datasource

In [6]:
# Register the pandas Datasource. Datasource names must be unique; using
# add-or-update keeps this cell re-runnable, because a plain `add_pandas`
# fails once a Datasource with this name is already stored in the context.
datasource_name = 'superstoremc'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Attach the cleaned CSV file as a named data asset of that Datasource.
asset_name = 'marketingcampaign'
path_to_data = 'P2M3_samuel_christian_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator uses to load the data.
batch_request = asset.build_batch_request()

# 2 - Create an Expectation Suite

In [7]:
# Register the expectation suite under a fixed name
expectation_suite_name = 'expectation-superstore-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Bind the batch request to that suite through a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm the data loaded
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,response,complain
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,104,379,111,189,218,1,4,4,6,1,1,0
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,5,64,7,0,37,1,7,3,7,5,1,0
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,11,59,15,2,30,1,3,2,5,2,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,11/5/2014,0,10,0,1,0,0,0,1,1,0,2,7,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,8/4/2014,0,6,16,24,11,0,34,2,3,1,2,7,1,0


# 3 - Expectations

## 3.1 - Expectation 1 : Column `id` must be unique

In [8]:
# Every customer id must appear exactly once.
validator.expect_column_values_to_be_unique(column='id')



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2216,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.2 - Expectation 2 : The maximum value of column year_birth must be between 1893, 1997

In [9]:
# The largest year_birth must fall within 1893-1997, matching the expectation
# stated in the heading (the lower bound was previously 1892).
validator.expect_column_max_to_be_between(
    column='year_birth',
    min_value=1893,
    max_value=1997,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 1996
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.3 - Expectation 3 : Column `complain` values must be in the set {0, 1}

In [17]:
# `complain` is a binary flag: only 0 and 1 are allowed.
validator.expect_column_values_to_be_in_set(
    column="complain",
    value_set=[0, 1],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2216,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.4 - Expectation 4 : Column `income` must be of type integer or float

In [10]:
# `income` must be stored as a numeric type.
validator.expect_column_values_to_be_in_type_list(
    column='income',
    type_list=['integer', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.5 - Expectation 5 : The mean value of column `income` must be between 50000 and 53000.

In [20]:
# The average income must stay within the 50000-53000 band.
validator.expect_column_mean_to_be_between(
    column='income',
    min_value=50000,
    max_value=53000,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 52247.25135379061
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.6 - Expectation 6 : The median value of column `income` must be between 50000 and 53000.

In [18]:
# The median income must stay within the 50000-53000 band.
validator.expect_column_median_to_be_between(
    column='income',
    min_value=50000,
    max_value=53000,
)



Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 51381.5
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 3.7 - Expectation 7 : The most common value (mode) of column `recency` must be 56.


In [28]:
# The most frequent `recency` value must be 56.
validator.expect_column_most_common_value_to_be_in_set(
    column="recency",
    value_set=[56],
)



Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      56
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# 4 - Save into Expectation Suite

In [29]:

# Persist the expectations defined above, keeping any that failed.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

# 5 - Create a checkpoint

In [30]:
# Register a checkpoint that re-runs the validator's expectation suite.
checkpoint_name = 'checkpoint_1'
checkpoint_1 = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

In [31]:
# Execute the checkpoint and keep the validation result for inspection
checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/42 [00:00<?, ?it/s]

# 6 - Data Docs

In [32]:
# Render the Data Docs site from the stored suites and validation results
context.build_data_docs()

{'local_site': 'file://c:\\Users\\Chris\\Documents\\HACKTIVE\\PHASE 2\\Prepare data Milestone 3\\gx\\uncommitted/data_docs/local_site/index.html'}