In [1]:
from databaker.framework import *
# Input workbook, and the files this notebook writes.
inputfile = "constructionoutputseptemberq32016.xls"
outputfile = "constructionoutputtab4.csv"
previewfile = "preview.html"

# loadxlstabs returns a sequence of tabs; only "Table 4" is requested, so keep the first.
tabs = loadxlstabs(inputfile, "Table 4")
tab = tabs[0]


Loading constructionoutputseptemberq32016.xls which has size 1051648 bytes
Table names: ['Table 4']


In [2]:
# Pick out the pieces of the table, all relative to the bold "Period" header cell.
anchor = tab.is_bold().regex("Period")

# Non-blank cells below the header are the years; the column to its right holds
# the sub-year labels (blank cells are kept in that selection).
year = anchor.fill(DOWN).is_not_blank()
subyear = anchor.shift(RIGHT).fill(DOWN)

# Step up one row from the by_index(1) year cell and extend to the right to get
# the row of codes that heads the data columns.
first_year = year.is_not_blank().by_index(1)
coderow = first_year.shift(UP).fill(RIGHT)

# Observations: every non-blank cell underneath the code row.
obs = coderow.fill(DOWN).is_not_blank()

# year is looked up as the closest value above; subyear directly to the left.
hd1 = HDim(year, "year", CLOSEST, ABOVE)
hd2 = HDim(subyear, "subyear", DIRECTLY, LEFT)

#savepreviewhtml([hd1, subyear, coderow, obs])
cs = ConversionSegment(tab, [hd1, hd2], obs)
savepreviewhtml(cs, previewfile)



tablepart 'Table 4' written #injblock1001
javascript calculated


In [3]:
# This looks like a classic job for subdimensions, implemented either by
# dimensions that look up other dimensions or by concatenating dimension values.
#
# But cellvalueoverride gives us the same feature with more power: it also lets
# us clean up footnotes and the other things that often mess up dates, which we
# would need the cellvalueoverride system for anyway.  So let's try and see if
# we can live without the subdims feature.

# Smaller OBS set of one column: the cells under the by_index(1) non-blank code cell.
first_code = coderow.is_not_blank().by_index(1)
obs = first_code.fill(DOWN).is_not_blank()

# But first check that the time-unit handling works on the years alone
# (including fixing up the .0 value).
hdt = HDim(year, TIME, CLOSEST, ABOVE)
cs = ConversionSegment(tab, [hdt], obs)
writetechnicalCSV(outputfile, cs)

writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/constructionoutput/constructionoutputtab4.csv
conversionwrite segment size 389 table 'Table 4; TIMEUNIT='Year'


In [4]:
# The years don't have the .0 problem, because the conversion is done in
# stages of post-processing.  Run the stages one at a time to watch the
# first few rows change:
cs1 = ConversionSegment(tab, [hdt], obs)

def show_first_rows(segment):
    """Print the first three processed rows of a conversion segment."""
    print(segment.processedrows[:3])

print("Initial processing")
cs1.process()
show_first_rows(cs1)

print("\n\nTIMEUNIT adding")
st = cs1.guesstimeunit()
print("timeunitmessage:", st)
show_first_rows(cs1)

# this function only does something with years, but we can talk about doing it more systematically
print("\n\nForce TIME to match TIMEUNIT")
cs1.fixtimefromtimeunit()
show_first_rows(cs1)


Initial processing
[{-2: 1955.0, -9: 331.0}, {-2: 1956.0, -9: 319.0}, {-2: 1957.0, -9: 303.0}]


TIMEUNIT adding
timeunitmessage: TIMEUNIT='Year'
[{-1: 'Year', -2: 1955.0, -9: 331.0}, {-1: 'Year', -2: 1956.0, -9: 319.0}, {-1: 'Year', -2: 1957.0, -9: 303.0}]


Force TIME to match TIMEUNIT
[{-1: 'Year', -2: '1955', -9: 331.0}, {-1: 'Year', -2: '1956', -9: 319.0}, {-1: 'Year', -2: '1957', -9: 303.0}]


In [5]:
# Now back to the waterfall-like structure, where we're going to
# apply the year to the month column.
hdt = HDim(year, TIME, CLOSEST, ABOVE)

# unordered_cells gives bare cell objects, which we can use in cellvalueoverride.
lsubyear = list(subyear.unordered_cells)
sample = lsubyear[:10]

print("Here are some subyear column elements (some are blank)")
for cell in sample:
    print(cell)

print("\nHere are how they lookup to the year column (it's a local lookup) to cell and value of cell")
for cell in sample:
    lookup = hdt.cellvalobs(cell)
    print(cell, lookup)


Here are some subyear column elements (some are blank)
<B37 ''>
<B94 'Q1'>
<B229 'Q4'>
<B192 'Q3'>
<B170 'Q1'>
<B191 'Q2'>
<B265 'Q4'>
<B33 ''>
<B390 'Sep'>
<B379 'Oct'>

Here are how they lookup to the year column (it's a local lookup) to cell and value of cell
<B37 ''> (<A37 1981.0>, 1981.0)
<B94 'Q1'> (<A94 1960.0>, 1960.0)
<B229 'Q4'> (<A226 1993.0>, 1993.0)
<B192 'Q3'> (<A190 1984.0>, 1984.0)
<B170 'Q1'> (<A170 1979.0>, 1979.0)
<B191 'Q2'> (<A190 1984.0>, 1984.0)
<B265 'Q4'> (<A262 2002.0>, 2002.0)
<B33 ''> (<A33 1977.0>, 1977.0)
<B390 'Sep'> (<A382 2015.0>, 2015.0)
<B379 'Oct'> (<A370 2014.0>, 2014.0)


In [6]:
# Build the override table: each subyear cell maps to its looked-up year
# concatenated with the cell's own value.  Cells whose year lookup comes back
# empty are left out of the table.
cvodate = { }
for cell in lsubyear:
    yearcell, yearvalue = hdt.cellvalobs(cell)
    if not yearvalue:
        continue
    cvodate[cell] = str(int(yearvalue)) + cell.value
        

In [7]:
# just in case you want to do that all in one expression...
# Cells whose year lookup comes back empty are skipped, as in the loop version,
# instead of being given a made-up year of 0.  Year and subyear are separated
# by a single space; a blank subyear contributes an empty string.
cvodate = {
    sy: "%s %s" % (int(yearvalue), sy.value or "")
    for sy, (yearcell, yearvalue) in ((c, hdt.cellvalobs(c)) for c in subyear.unordered_cells)
    if yearvalue
}


In [8]:
# TIME dimension taken from the subyear column, with each cell's value replaced
# by the year+subyear string held in cvodate.
hdd = HDim(subyear, TIME, DIRECTLY, LEFT, cellvalueoverride=cvodate)
cs2 = ConversionSegment(tab, [hdd], obs)
writetechnicalCSV(outputfile, cs2)
# Alternative call with no output file (presumably prints rather than writes -- TODO confirm):
#writetechnicalCSV(None, cs2)
# see lots of errors

In [14]:
# But we get some date matching that doesn't work out.
# We could either fix up the datematch code in the base library, or hack in this
# lookup table to get the month in front of the year when it's there.

def make_time_label(yearvalue, subyearvalue):
    """Combine a year with an optional sub-year label into one TIME string.

    yearvalue    -- the year as a number (e.g. 1984.0); rendered without decimals
    subyearvalue -- the text of the subyear cell; may be blank or None

    Surrounding whitespace is stripped from the label before it is inspected,
    so a padded quarter label is still recognised as a quarter and no stray
    spaces end up in the result.

    Returns "<year> <label>" when the label starts with "Q", "<label> <year>"
    for any other non-blank label (the month goes in front of the year), and
    just "<year>" when the label is blank.
    """
    year_text = str(int(yearvalue))
    label = (subyearvalue or "").strip()
    if not label:
        return year_text
    if label.startswith("Q"):
        return year_text + " " + label
    return label + " " + year_text

# this creates the concatenation of the year and subyear;
# cells whose year lookup comes back empty get no override
cvodate = { }
for sy in lsubyear:
    yearcell, yearvalue = hdt.cellvalobs(sy)
    if yearvalue:
        cvodate[sy] = make_time_label(yearvalue, sy.value)

hdd = HDim(subyear, TIME, DIRECTLY, LEFT, cellvalueoverride=cvodate)
cs2 = ConversionSegment(tab, [hdd], obs)
writetechnicalCSV(outputfile, cs2)

# and this comes out quite nicely


writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/constructionoutput/constructionoutputtab4.csv
conversionwrite segment size 389 table 'Table 4; multiple TIMEUNITs: 'Month'(81), 'Quarter'(247), 'Year'(61)
