In [1]:
import os

from databaker.framework import *

In [33]:
# There needs to be a lot of thought put into the conventions on how
# files are handled, named, and tracked so we know what the previous
# file is supposed to be

# Here we just do the process against two versions of the same file:
# inputfile1 carries a date stamp in its name, inputfile2 does not.
inputfile1 = "ilchtablestemplatesa2016-06-17.xls"
inputfile2 = "ilchtablestemplatesa.xls"


In [34]:
# Shared functions

# Get the growth period
def get_growthPeriod(tab):
    """Work out the growth period of a tab from the title in cell A1.

    The title is tested against a fixed list of phrases, in order, and the
    first phrase that matches decides the result:

    * "year on year"        -> "Annual"
    * "quarter on quarter"  -> "Quarterly"
    * "growth rates"        -> "Annual"

    Parameters:
        tab: a tab object providing ``excel_ref``.

    Returns:
        "Annual" or "Quarterly".

    Raises:
        ValueError: if the title matches none of the phrases.  Previously
            this fell through to ``return gp`` with ``gp`` unassigned and
            surfaced as an UnboundLocalError with no hint about which tab
            was at fault.
    """
    tab_title = tab.excel_ref('A1')

    # Order matters: a title may contain both a specific phrase and the
    # generic "growth rates", so the specific phrases are tried first.
    phrase_to_period = (
        ("year on year", "Annual"),
        ("quarter on quarter", "Quarterly"),
        ("growth rates", "Annual"),
    )
    for phrase, period in phrase_to_period:
        if tab_title.filter(contains_string(phrase)):
            return period

    raise ValueError(
        "Cannot determine the growth period for tab %r: cell A1 contains none of %r"
        % (getattr(tab, "name", tab), [phrase for phrase, _ in phrase_to_period])
    )


# Get the measure type
def get_measureType(tab):
    """Return the measure type of a tab, decided by the title in cell A1.

    A title containing "year on year", "quarter on quarter" or
    "growth rates" yields "Percent"; any other title yields "Index".
    """
    title_cell = tab.excel_ref('A1')

    # Phrases are tried in this order and testing stops at the first hit.
    growth_phrases = ("year on year", "quarter on quarter", "growth rates")
    is_growth_table = any(
        title_cell.filter(contains_string(phrase)) for phrase in growth_phrases
    )
    return "Percent" if is_growth_table else "Index"



In [35]:

# !!! You should pass parameters in (like tabs_growth), rather than relying on 
# global values to work it out for you

def growth_recipe(saOrNsa, tabs_growth):
    """Build one ConversionSegment per growth tab.

    Parameters:
        saOrNsa: value stored in the constant "SA / NSA" dimension of every
            segment.
        tabs_growth: iterable of tabs to convert.

    Returns:
        A list of ConversionSegment objects, one per tab, in input order.
        An empty input gives an empty list.

    Each segment has these dimensions, in this order: MEASURETYPE (constant),
    TIME, "Costs", "SIC", "Growth Period" (constant), "SA / NSA" (constant).
    """
    conversionsegments = []

    for tab in tabs_growth:

        # Anchor on the single cell whose text contains "eriod"
        # (presumably the "Period" column header - confirm against the sheet).
        anchor = tab.filter(contains_string("eriod")).assert_one()

        # set up a waffle: non-blank cells below the anchor crossed with the
        # non-blank cells to the right of the cell just under the anchor
        datarows = anchor.fill(DOWN).is_not_blank()
        datacols = anchor.shift(DOWN).fill(RIGHT).is_not_blank()
        obs = datarows.waffle(datacols).is_not_blank()

        # set the growth period & measuretype
        gp = get_growthPeriod(tab)
        mt = get_measureType(tab)

        # The TIME dimension is kept in its own variable so the clean-up
        # below does not depend on its position in the dimensions list.
        time_dim = HDim(datarows, TIME, DIRECTLY, LEFT)

        # TIME has weird data markings, get them out by overriding the
        # marked value with its first six characters.
        # NOTE(review): the original condition was written ('p'), which is
        # just the string 'p', so any value containing a lower-case "p" is
        # truncated.  If only the "(p)" marker was meant, tighten this test.
        for cell in time_dim.hbagset:
            if '(r)' in cell.value or 'p' in cell.value:
                time_dim.cellvalueoverride[cell.value] = cell.value[:6]

        dimensions = [
            HDimConst(MEASURETYPE, mt),
            time_dim,
            HDim(datacols.parent(), "Costs", DIRECTLY, ABOVE),
            HDim(anchor.fill(RIGHT).parent(), "SIC", CLOSEST, LEFT),
            HDimConst("Growth Period", gp),
            HDimConst("SA / NSA", saOrNsa),
        ]

        conversionsegments.append(ConversionSegment(tab, dimensions, obs))

    return conversionsegments


In [36]:
# List the tab names of the first workbook.
# NOTE(review): `tabs1` is only assigned in the following cell (In [37]), so on a
# fresh kernel this cell has to be run after that one.
[t.name for t in tabs1]

['INTRODUCTION',
 'DEFINITIONS',
 '1. Industry level SA',
 '2. Sector level SA',
 '3. Industry annual growth SA',
 '4. Sector annual growth SA',
 '5. Industry quarterly growth SA',
 '6. Sector quarterly growth SA']

In [37]:
# Here we write the two files into the two different filenames 
# (note use of os.path.splitext)

# The ConversionSegments lists are separated out rather than 
# passed straight into the writetechnicalCSV function so we can 
# refer to them later on.

# Perhaps we will need a new class representing a file, which will manage 
# the list of ConversionSegments, filename, previous filename 
# and the error comparisons

def load_growth_tabs(inputfile, expected_count=4):
    """Load a workbook and pick out the tabs with 'growth' in their name.

    Parameters:
        inputfile: path of the spreadsheet passed to loadxlstabs.
        expected_count: number of growth tabs the workbook must contain.

    Returns:
        (tabs, tabs_growth): every loaded tab, and the subset whose name
        contains the word 'growth'.

    Raises:
        AssertionError: if the number of growth tabs is not expected_count.
    """
    tabs = loadxlstabs(inputfile)
    tabs_growth = [x for x in tabs if 'growth' in x.name]
    assert len(tabs_growth) == expected_count, (
        "Expected %d tabs with the word 'growth' in their name in %s, found %d"
        % (expected_count, inputfile, len(tabs_growth))
    )
    return tabs, tabs_growth


tabs1, tabs_growth1 = load_growth_tabs(inputfile1)
tabs2, tabs_growth2 = load_growth_tabs(inputfile2)

# NOTE(review): the output names and the label passed to growth_recipe say
# "NSA" / "Not seasonally adjusted", while every growth tab listed for these
# workbooks is named "... SA".  Left unchanged because later cells read
# outputfile1 from disk under this name - confirm which label is intended.
outputfile1 = 'Output-NSA-growth-' + os.path.splitext(inputfile1)[0] + '.csv'
outputfile2 = 'Output-NSA-growth-' + os.path.splitext(inputfile2)[0] + '.csv'

growthsegments1 = growth_recipe("Not seasonally adjusted", tabs_growth1)
growthsegments2 = growth_recipe("Not seasonally adjusted", tabs_growth2)

# The writes are disabled, so the cells below that read outputfile1 rely on
# a CSV written by an earlier run already being present on disk.
#writetechnicalCSV(outputfile1, growthsegments1)
#writetechnicalCSV(outputfile2, growthsegments2)


Loading ilchtablestemplatesa2016-06-17.xls which has size 399360 bytes
Table names: ['INTRODUCTION', 'DEFINITIONS', '1. Industry level SA', '2. Sector level SA', '3. Industry annual growth SA', '4. Sector annual growth SA', '5. Industry quarterly growth SA', '6. Sector quarterly growth SA']
Loading ilchtablestemplatesa.xls which has size 407552 bytes
Table names: ['INTRODUCTION', 'DEFINITIONS', '1. Industry level SA', '2. Sector level SA', '3. Industry annual growth SA', '4. Sector annual growth SA', '5. Industry quarterly growth SA', '6. Sector quarterly growth SA']


In [38]:
# Diff the segments built from the second input file against the CSV named
# after the first input file.
msglistperseg = CompareConversionSegments(growthsegments2, outputfile1, bprintwarnings=False)

print("This gives the type of differences we're getting")

# For each segment keep only the first element of every message.
difference_types = []
for iseg, msglist in msglistperseg.items():
    difference_types.append((iseg, [msg[0] for msg in msglist]))
difference_types



segment 0 completed with 5760 rows
segment 1 completed with 2160 rows
segment 2 completed with 6048 rows
segment 3 completed with 2268 rows
conversionwrite segment size 5952 table '3. Industry annual growth SA; TIMEUNIT='Quarter'
conversionwrite segment size 2232 table '4. Sector annual growth SA; TIMEUNIT='Quarter'
conversionwrite segment size 6240 table '5. Industry quarterly growth SA; TIMEUNIT='Quarter'
conversionwrite segment size 2340 table '6. Sector quarterly growth SA; TIMEUNIT='Quarter'


[(0, ['NEWVALUESINSEGMENT', 'WDAEXTRAVALUES']),
 (1, ['NEWVALUESINSEGMENT', 'WDAEXTRAVALUES']),
 (2, ['NEWVALUESINSEGMENT', 'WDAEXTRAVALUES']),
 (3, ['NEWVALUESINSEGMENT', 'WDAEXTRAVALUES'])]

In [45]:
# If you print the contents of msglistperseg, it's way too much, 
# so I'm going to break down the CompareConversionSegments function a bit 

# msglistperseg

In [19]:
# Let's start by reading the segments from the file
# NOTE(review): this cell's execution count (In [19]) is lower than its
# neighbours', i.e. it was last run out of order; it needs `outputfile1`
# from the cell that builds the output file names.
# The meaning of the second positional argument (True) is not visible in
# this notebook - TODO confirm against readtechnicalCSV.
wdasegs = readtechnicalCSV(outputfile1, True)

segment 0 completed with 5760 rows
segment 1 completed with 2160 rows
segment 2 completed with 6048 rows
segment 3 completed with 2268 rows


In [39]:
# Pair up the first segment of the new output with the first segment read
# back from the WDA file.
wdaseg = wdasegs[0]
conversionsegment = growthsegments2[0]
msglist = []

# Both checks receive msglist; the second one takes the headers returned by
# the first and returns the headers used for the comparison.
headers = extraheaderscheck(conversionsegment, wdaseg, msglist)
headers = checktheconstantdimensions(conversionsegment, headers, wdaseg, msglist)
oheaders = [OBS] + list(headers)


def _as_record(row):
    """Project a row onto oheaders, giving a hashable tuple."""
    return tuple(row.get(h) for h in oheaders)


# Despite the names these hold one record tuple per row, not counts.
ccounts = list(map(_as_record, conversionsegment.processedrows))
wcounts = list(map(_as_record, wdaseg))


In [46]:
# There is a considerable lack of overlap between the two record sets.
# ccounts holds the records of the new data, wcounts those of the old data.
print("New data size", len(ccounts), "Old size", len(wcounts))  # new is bigger than old
print("Records in new data not in old data", len(set(ccounts).difference(wcounts)))
# Label fixed: this line counts old records that are missing from the new data.
print("Records in old data not in new data", len(set(wcounts).difference(ccounts)))


New data size 5952 Old size 5760
Records in new data not in old data 4489
Records in old data not in old data 4297


In [48]:
# Print the first ten record pairs, new record first and old record second
# (the two lists are assumed to be aligned because they were sorted by cell
# position - confirm if the ordering of either source changes).
# The previous `if i > 10: break` let twelve pairs through, not ten.
PREVIEW_PAIRS = 10
for i, (c, w) in enumerate(zip(ccounts[:PREVIEW_PAIRS], wcounts[:PREVIEW_PAIRS])):
    if i:
        print()  # blank line between pairs, none after the last
    print(c)
    print(w)


('16.9', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Labour Costs per Hour', '2001 Q1')
('16.5', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Labour Costs per Hour', '2001 Q1')

('17.7', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Wage Costs per Hour', '2001 Q1')
('17.7', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Wage Costs per Hour', '2001 Q1')

('9.7', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Other Costs per Hour', '2001 Q1')
('9.4', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Other Costs per Hour', '2001 Q1')

('16.4', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Labour Costs per Hour Excluding Bonuses and Arrears', '2001 Q1')
('16.3', 'ILCH_A\nAgriculture, Forestry and Fishing', 'Labour Costs per Hour Excluding Bonuses and Arrears', '2001 Q1')

('19.7', 'ILCH_B\nMining and Quarrying', 'Labour Costs per Hour', '2001 Q1')
('19.7', 'ILCH_B\nMining and Quarrying', 'Labour Costs per Hour', '2001 Q1')

('18.2', 'ILCH_B\nMining and Quarrying', 'Wage Costs per Hour', '2001

In [None]:
# There are subtle changes in the 2001 numbers between a spreadsheet released in June 2016
# and the latest version.  Really???