In [15]:
import boto3
import json
from time import time, ctime
import os
import psycopg2
import pandas as pd
from tabulate import tabulate

In [None]:
# Downloading the data files
!aws s3 cp s3://wysde-datasets/cars/billing-datawarehouse.tgz data/billing-datawarehouse.tgz

In [4]:
# Extracting files
!cd data && tar -xvzf billing-datawarehouse.tgz

x DimCustomer.sql
x DimMonth.sql
x FactBilling.sql
x star-schema.sql
x verify.sql


In [6]:
def get_secret(secret_name, region_name="us-east-1"):
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name)
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)
    get_secret_value_response = json.loads(get_secret_value_response['SecretString'])
    return get_secret_value_response

In [9]:
creds = get_secret("wysde")
USERNAME = creds["RDS_POSTGRES_USERNAME"]
PASSWORD = creds["RDS_POSTGRES_PASSWORD"]
HOST = creds["RDS_POSTGRES_HOST"]
DATABASE = "sparsh"

you might need to run rds psql commands in shell if facing difficulty in entering responses in ipython notebook

In [None]:
# Creating schema
!psql -h $HOST -U $USERNAME -p 5432 $DATABASE < data/star-schema.sql

In [None]:
# Loading data
!psql -h $HOST -U $USERNAME -p 5432 $DATABASE < data/DimCustomer.sql
!psql -h $HOST -U $USERNAME -p 5432 $DATABASE < data/DimMonth.sql
!psql -h $HOST -U $USERNAME -p 5432 $DATABASE < data/FactBilling.sql

In [12]:
print("Finished loading data")

Finished loading data


In [None]:
# Verifying data
!psql -h $HOST -U $USERNAME -p 5432 $DATABASE < data/verify.sql

In [11]:
print("Successfully setup the staging area")

Successfully setup the staging area


## Data Quality

In [13]:
# Check for nulls
conn = None

def run_data_quality_check(**options):
    print("*" * 50)
    print(ctime(time()))
    start_time = time()
    testname = options.pop("testname")
    test = options.pop("test")
    print(f"Starting test {testname}")
    status = test(**options)
    print(f"Finished test {testname}")
    print(f"Test Passed {status}")
    end_time = time()
    options.pop("conn")
    print("Test Parameters")
    for key,value in options.items():
        print(f"{key} = {value}")
    print()
    print("Duration : ", str(end_time - start_time))
    print(ctime(time()))
    print("*" * 50)
    return testname,options.get('table'),options.get('column'),status


def check_for_nulls(column,table,conn=conn):
    SQL=f'SELECT count(*) FROM "{table}" where {column} is null'
    cursor = conn.cursor()
    cursor.execute(SQL)
    row_count = cursor.fetchone()
    cursor.close()
    return bool(row_count)


#Check for min max range

def check_for_min_max(column,table,minimum,maximum,conn=conn):
    SQL=f'SELECT count(*) FROM "{table}" where  {column} < {minimum} or {column} > {maximum}'
    cursor = conn.cursor()
    cursor.execute(SQL)
    row_count = cursor.fetchone()
    cursor.close()
    return not bool(row_count)

#Check for any invalid entries

def check_for_valid_values(column, table, valid_values=None,conn=conn):
    SQL=f'SELECT distinct({column}) FROM "{table}"'
    cursor = conn.cursor()
    cursor.execute(SQL)
    result = cursor.fetchall()
    #print(result)
    actual_values = {x[0] for x in result}
    print(actual_values)
    status = [value in valid_values for value in actual_values]
    #print(status)
    cursor.close()
    return all(status)

#Check for duplicate entries

def check_for_duplicates(column,table,conn=conn):
    SQL=f'SELECT count({column}) FROM "{table}" group by {column} having count({column}) > 1'
    cursor = conn.cursor()
    cursor.execute(SQL)
    row_count = cursor.fetchone()
    #print(row_count)
    cursor.close()
    return not bool(row_count)

In [20]:
test1={
	"testname":"Check for nulls",
	"test":check_for_nulls,
	"column": "monthid",
	"table": "DimMonth"
}

test2={
	"testname":"Check for min and max",
	"test":check_for_min_max,
	"column": "month",
	"table": "DimMonth",
	"minimum":1,
	"maximum":12
}

test3={
	"testname":"Check for valid values",
	"test":check_for_valid_values,
	"column": "category",
	"table": "DimCustomer",
	"valid_values":{'Individual','Company'}
}

test4={
	"testname":"Check for duplicates",
	"test":check_for_duplicates,
	"column": "monthid",
	"table": "DimMonth"
}

test5={
    "testname":"Check for nulls",
    "test":check_for_nulls,
    "column": "year",
    "table": "DimMonth"
}

test6={
    "testname":"Check for min and max",
    "test":check_for_min_max,
    "column": "quarter",
    "table": "DimMonth",
    "minimum":1,
    "maximum":4
}

test7={
    "testname":"Check for valid values",
    "test":check_for_valid_values,
    "column": "quartername",
    "table": "DimMonth",
    "valid_values":{'Q1','Q2','Q3','Q4'}
}

test8={
    "testname":"Check for duplicates",
    "test":check_for_duplicates,
    "column": "customerid",
    "table": "DimCustomer"
}

test9 = {
    "testname":"Check for nulls",
    "test":check_for_nulls,
    "column": "billedamount",
    "table": "FactBilling"
}

test10 = {
    "testname":"Check for duplicates",
    "test":check_for_duplicates,
    "column": "billid",
    "table": "FactBilling"
}

test11 = {
    "testname":"Check for valid values",
    "test":check_for_valid_values,
    "column": "quarter",
    "table": "DimMonth",
    "valid_values":{'Q1','Q2','Q3','Q4'}
}

tests = [test1, test2, test3, test4,
         test5, test6, test7, test8,
         test9, test10, test11]

In [16]:
# Connect to database
conn = psycopg2.connect(
		user = USERNAME,
	    password = PASSWORD,
	    host = HOST,
	    port = "5432",
	    database = DATABASE)

print("Connected to data warehouse")

Connected to data warehouse


In [21]:
# Start of data quality checks
results = []
for test in tests:
    test['conn'] = conn
    results.append(run_data_quality_check(**test))

**************************************************
Sat Dec 17 22:26:18 2022
Starting test Check for nulls
Finished test Check for nulls
Test Passed True
Test Parameters
column = monthid
table = DimMonth

Duration :  0.9150419235229492
Sat Dec 17 22:26:19 2022
**************************************************
**************************************************
Sat Dec 17 22:26:19 2022
Starting test Check for min and max
Finished test Check for min and max
Test Passed False
Test Parameters
column = month
table = DimMonth
minimum = 1
maximum = 12

Duration :  0.40681910514831543
Sat Dec 17 22:26:19 2022
**************************************************
**************************************************
Sat Dec 17 22:26:19 2022
Starting test Check for valid values
{'Company', 'Individual'}
Finished test Check for valid values
Test Passed True
Test Parameters
column = category
table = DimCustomer
valid_values = {'Company', 'Individual'}

Duration :  0.40922069549560547
Sat Dec 17 22:26:20 

In [22]:
# Print results
df=pd.DataFrame(results)
df.index+=1
df.columns = ['Test Name', 'Table','Column','Test Passed']
print(tabulate(df,headers='keys',tablefmt='psql'))

+----+------------------------+-------------+--------------+---------------+
|    | Test Name              | Table       | Column       | Test Passed   |
|----+------------------------+-------------+--------------+---------------|
|  1 | Check for nulls        | DimMonth    | monthid      | True          |
|  2 | Check for min and max  | DimMonth    | month        | False         |
|  3 | Check for valid values | DimCustomer | category     | True          |
|  4 | Check for duplicates   | DimMonth    | monthid      | True          |
|  5 | Check for nulls        | DimMonth    | year         | True          |
|  6 | Check for min and max  | DimMonth    | quarter      | False         |
|  7 | Check for valid values | DimMonth    | quartername  | True          |
|  8 | Check for duplicates   | DimCustomer | customerid   | True          |
|  9 | Check for nulls        | FactBilling | billedamount | True          |
| 10 | Check for duplicates   | FactBilling | billid       | True          |

In [23]:
# End of data quality checks
conn.close()
print("Disconnected from data warehouse")

Disconnected from data warehouse
