## Data Validation with Voluptuous (Schema Definitions)


In [75]:
import logging
import pandas as pd
from datetime import datetime
from voluptuous import Schema, Required, Range, All, ALLOW_EXTRA
from voluptuous.error import MultipleInvalid, Invalid


In [76]:
# Calling getLogger() with no name returns the root logger, which is the
# logger the module-level logging.warning(...) calls in this notebook use.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [77]:
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# where this exact file exists; consider a configurable data directory.
path = r'C:\Users\risha\Documents\KRMU\AIML_assigment\datasets\sales_data.csv'

# Load the raw sales records and preview the first rows.
sales = pd.read_csv(path)
sales.head()

Unnamed: 0.1,Unnamed: 0,timestamp,city,store_id,sale_number,sale_amount,associate
0,0,2018-09-10 05:00:45,Williamburgh,6,1530,1167.0,Gary Lee
1,1,2018-09-12 10:01:27,Ibarraberg,1,2744,258.0,Daniel Davis
2,2,2018-09-13 12:01:48,Sarachester,2,1908,266.0,Michael Roth
3,3,2018-09-14 20:02:19,Caldwellbury,14,771,-108.0,Michaela Stewart
4,4,2018-09-16 01:03:21,Erikaland,11,1571,-372.0,Mark Taylor


In [78]:
# 'Unnamed: 0' merely repeats the row index in the preview above, so drop it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second run is a no-op instead of raising KeyError.
sales = sales.drop(columns=['Unnamed: 0'], errors='ignore')

In [79]:
# Inspect the column dtypes; note that timestamp is held as plain strings (object).
sales.dtypes

timestamp       object
city            object
store_id         int64
sale_number      int64
sale_amount    float64
associate       object
dtype: object

In [80]:
# Preview only: the parsed Series is displayed but not assigned back, so
# sales['timestamp'] itself stays as strings for the schema checks further down.
sales['timestamp'].apply(datetime.strptime, args=('%Y-%m-%d %H:%M:%S',))

0     2018-09-10 05:00:45
1     2018-09-12 10:01:27
2     2018-09-13 12:01:48
3     2018-09-14 20:02:19
4     2018-09-16 01:03:21
              ...        
208   2019-09-01 06:46:44
209   2019-09-03 12:47:26
210   2019-09-05 18:47:30
211   2019-09-07 23:48:08
212   2018-09-09 04:48:48
Name: timestamp, Length: 213, dtype: datetime64[ns]

### Data Quality Check

In [81]:
# Re-check the first rows now that the 'Unnamed: 0' column is gone.
sales.head()

Unnamed: 0,timestamp,city,store_id,sale_number,sale_amount,associate
0,2018-09-10 05:00:45,Williamburgh,6,1530,1167.0,Gary Lee
1,2018-09-12 10:01:27,Ibarraberg,1,2744,258.0,Daniel Davis
2,2018-09-13 12:01:48,Sarachester,2,1908,266.0,Michael Roth
3,2018-09-14 20:02:19,Caldwellbury,14,771,-108.0,Michaela Stewart
4,2018-09-16 01:03:21,Erikaland,11,1571,-372.0,Mark Taylor


In [82]:
# dtypes are unchanged: the strptime preview earlier was never assigned back,
# so timestamp is still object (strings).
sales.dtypes

timestamp       object
city            object
store_id         int64
sale_number      int64
sale_amount    float64
associate       object
dtype: object

## Defining our first schema

In [83]:
# Each record must carry a float sale_amount between 2.50 and 1450.99;
# ALLOW_EXTRA lets the remaining columns through without validation.
# NOTE(review): the bounds are hard-coded -- confirm they reflect the real business rule.
schema = Schema(
    {
        Required('sale_amount'): All(float, Range(min=2.50, max=1450.99)),
    },
    extra=ALLOW_EXTRA,
)

In [84]:
# Validate every row against `schema`, logging and counting each failure.
# to_dict(orient='index') produces the {row_label: {column: value}} mapping
# directly, without first building a transposed, all-object copy of the frame.
error_count = 0
for s_id, sale in sales.to_dict(orient='index').items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s', s_id, sale['sale_amount'], e)
        error_count += 1



In [85]:
# Number of rows whose sale_amount failed the schema check.
error_count

69

In [86]:
# Total (rows, columns), to put the failure count in proportion.
sales.shape

(213, 6)

### Questions we might want to answer:
- Do we have an improperly defined schema?
- Are negative values possibly returns or falsely marked? (data entry procedures)
- Are higher values combined purchases or special sales? (or potentially fraud?)
- What should we do with our schema and our failing data points?

### Adding a custom Validation Case

In [87]:
def ValidDate(fmt='%Y-%m-%d %H:%M:%S'):
    """Build a validator that parses a string with ``datetime.strptime``.

    Args:
        fmt: ``strptime`` format the value must match.

    Returns:
        A one-argument callable returning the parsed ``datetime``. A value
        that does not match ``fmt`` makes ``strptime`` raise ``ValueError``.
    """
    def parse(v):
        return datetime.strptime(v, fmt)

    return parse

In [88]:
# Only the timestamp column is checked here; every other column passes
# through untouched because of ALLOW_EXTRA.
schema = Schema(
    {Required('timestamp'): All(ValidDate())},
    extra=ALLOW_EXTRA,
)

In [89]:
# Validate every row's timestamp against `schema`, logging and counting failures.
# to_dict(orient='index') produces the {row_label: {column: value}} mapping
# directly, without first building a transposed, all-object copy of the frame.
error_count = 0
for s_id, sale in sales.to_dict(orient='index').items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s', s_id, sale['timestamp'], e)
        error_count += 1

In [90]:
# Number of rows whose timestamp did not parse with the expected format.
error_count

0

## So we have valid date structures, what about actual valid dates?

In [91]:
def ValidDate(fmt='%Y-%m-%d %H:%M:%S'):
    """Build a validator for timestamps that match ``fmt`` and are not in the future.

    Args:
        fmt: ``strptime`` format the value must match.

    Returns:
        A one-argument callable that returns the parsed ``datetime``. It raises
        ``Invalid`` when the parsed time is later than ``datetime.now()``; a
        value that does not match ``fmt`` makes ``strptime`` raise ``ValueError``.
    """
    def validation_func(v):
        parsed = datetime.strptime(v, fmt)
        # Explicit comparison rather than ``assert``: assertions are stripped
        # under ``python -O``, which would silently disable this check.
        # NOTE(review): this compares against naive local time -- confirm the
        # timezone the recorded timestamps are expressed in.
        if parsed > datetime.now():
            raise Invalid('date is in the future! %s' % v)
        # Return the parsed value so the validated record keeps a timestamp
        # instead of having it replaced by None.
        return parsed
    return validation_func

In [92]:
# Rebuild the schema so it picks up the ValidDate defined in the cell above;
# only timestamp is checked, other columns pass through via ALLOW_EXTRA.
schema = Schema(
    {Required('timestamp'): All(ValidDate())},
    extra=ALLOW_EXTRA,
)

In [93]:
# Validate every row's timestamp against `schema`, logging and counting failures.
# to_dict(orient='index') produces the {row_label: {column: value}} mapping
# directly, without first building a transposed, all-object copy of the frame.
error_count = 0
for s_id, sale in sales.to_dict(orient='index').items():
    try:
        schema(sale)
    except MultipleInvalid as e:
        logging.warning('issue with sale: %s (%s) - %s',
                        s_id, sale['timestamp'], e)
        error_count += 1

In [94]:
# Number of rows rejected by the timestamp schema built with the latest ValidDate.
error_count

0