# SQL Analysis Development Notebook

This notebook is used for testing code concepts for the SQL Analysis module.

## Table of Contents

<ol>
    <li><a href='#start'>Modifying Class Dictionary</a></li>
</ol>

<a id='start'></a>
## Modifying Class Dictionary

In [1]:
class DictTest:
    def __init__(self):
        self.my_dict = {}
        self.label = 'count'
    
    def run_test(self, value):
        if value > 10:
            self.my_dict[value] = 'This is an update.'

In [9]:
tester = DictTest()

my_list = [1, 15, 6, 8, 20, 13]
my_labels = ['abc', 'xyz', 'def', 'lmn', 'ijk', 'str']

for value, label in zip(my_list, my_labels):
    tester.label = label
    tester.run_test(value)

In [10]:
tester.my_dict

{15: 'This is an update.', 20: 'This is an update.', 13: 'This is an update.'}

## Summary Storage

In [26]:
from importlib import reload
import pandas as pd
import sql_analysis as san

In [78]:
reload(san)

<module 'sql_analysis' from 'C:\\Users\\rbarnes\\Documents\\sql_analysis\\sql_analysis.py'>

In [79]:
# Specify count fields
comparison_fields = ['application_id', 'application_id']
groupby_fields = ['ss_dt', 'snapshot_date']
table_names = ['EDASTG.ods_hc_application_active_stg', 'EDA.etl_applicant_legacy']
table_alias = ['ods', 'etl']

# Create gatherer
gatherer = san.SQLGatherData(comparison_fields, groupby_fields, 
                             table_names, table_alias, db_server='dev', 
                             test_type='high_distinct')

# Create tester
tester = san.SQLUnitTest(count_df,
                         comparison_names=table_alias, 
                         save_location='Q:/Project Work/MS Access/Testing/Applicant',
                         summary_field='ss_dt')

high_distinct_fields = [('adjusted_application_date', 'adjusted_application_date'),
                        ('household_id', 'household_id'),
                        ('original_application_date', 'original_application_date'),
                        ('postal_code', 'postal_code')]

for comparison_fields in high_distinct_fields:
    # Update gatherer fields for low_distinct
    gatherer.comparison_fields = comparison_fields

    low_df = gatherer.gather_data()
    
    # Update tester fields for low_distinct
    tester.data = low_df
    tester.test_field = comparison_fields[0]

    tester.run_test()

Commencing adjusted_application_date query...
Query for adjusted_application_date complete.

Commencing test for adjusted_application_date...
Test for adjusted_application_date complete.

Commencing household_id query...
Query for household_id complete.

Commencing test for household_id...
Test for household_id complete.

Commencing original_application_date query...
Query for original_application_date complete.

Commencing test for original_application_date...
Test for original_application_date complete.

Commencing postal_code query...
Query for postal_code complete.

Commencing test for postal_code...
Test for postal_code complete.



In [53]:
tester._priority_review

{'has_no_fixed_address_etl': 'MISSING VALUE for etl_has_no_fixed_address',
 'hh_principle_age_group_etl': 'PRIORITY REVIEW on etl_hh_principle_age_group: 10.0',
 'principal_income_source_etl': 'PRIORITY REVIEW on etl_principal_income_source: inf'}

In [38]:
tester._exceptions

{}

In [71]:
tester._results.columns[1]

'ss_dt'

In [69]:
tester._results

Unnamed: 0,date,ss_dt,ods_count,etl_count,ods_minus_etl,perc_diff_etl
0,02-Aug-19,2009-03-31,9873,9839,34,0.344374
1,02-Aug-19,2010-03-31,10870,10846,24,0.220791
2,02-Aug-19,2011-03-31,12419,12397,22,0.177148
3,02-Aug-19,2012-03-31,13701,13684,17,0.124079
4,02-Aug-19,2013-03-31,13936,13913,23,0.16504
5,02-Aug-19,2014-03-31,14569,14553,16,0.109822
6,02-Aug-19,2014-06-30,14660,14635,25,0.170532
7,02-Aug-19,2014-09-30,15106,15082,24,0.158877
8,02-Aug-19,2014-12-31,14778,14756,22,0.14887
9,02-Aug-19,2015-03-31,14870,14848,22,0.147949


In [80]:
tester._summary

Unnamed: 0,ss_dt,adjusted_application_date_etl,original_application_date_etl,postal_code_etl,perc_diff_etl
0,2008-06-30,,,,
1,2008-09-30,,,,
2,2008-12-31,,,,
3,2009-03-31,,0.344374,,100.000000
4,2009-06-30,,,,
5,2009-09-30,,,,
6,2009-12-31,,,,
7,2010-03-31,,0.220791,,100.000000
8,2010-06-30,,,,
9,2010-09-30,,,,


In [16]:
df_a = pd.DataFrame({'a_field': [1, 2, 3]})
df_b = pd.DataFrame({'b_field': [7, 8, 9]})

test = pd.concat([df_a, df_b])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [17]:
test

Unnamed: 0,a_field,b_field
0,1.0,
1,2.0,
2,3.0,
0,,7.0
1,,8.0
2,,9.0
