In [2]:
from databaker.framework import *
# Source workbook, plus the two artefacts this notebook writes.
inputfile = "constructionoutputtab4.xls"
previewfile = "preview.html"
outputfile = "constructionoutputtab4.csv"

# Ask for the "Table 4" sheet by name and keep the first tab returned.
tab = loadxlstabs(inputfile, "Table 4")[0]


Loading constructionoutputtab4.xls which has size 85504 bytes
Table names: ['Table 4']


In [3]:
# Build every selection relative to the bold header cell matching "Period".
anchor = tab.is_bold().regex("Period")

# Non-blank cells below the anchor, and the whole column one step to the right.
year = anchor.fill(DOWN).is_not_blank()
subyear = anchor.shift(RIGHT).fill(DOWN)

# Pick one year cell by index, move up a row, then extend to the right.
coderow = (
    year.is_not_blank()
    .by_index(1)
    .shift(UP)
    .fill(RIGHT)
)

# Observations: the non-blank cells underneath the code row.
obs = coderow.fill(DOWN).is_not_blank()

# One dimension per header column.
hd1 = HDim(year, "year", CLOSEST, ABOVE)
hd2 = HDim(subyear, "subyear", DIRECTLY, LEFT)

# Earlier variant, kept for reference:
#savepreviewhtml([hd1, subyear, coderow, obs])
cs = ConversionSegment(tab, [hd1, hd2], obs)
savepreviewhtml(cs, previewfile)



tablepart 'Table 4' written #injblock1001
javascript calculated


In [4]:
# A year column with a sub-period column next to it classically looks like a
# job for subdimensions, implemented either by dimensions looking up other
# dimensions or by concatenating dimension values.
#
# cellvalueoverride implements the same feature more powerfully: it also lets
# us clean up footnotes and the other things that often mess up dates, which
# we would need cellvalueoverride for anyway.  So we'll try to see whether we
# can live without the subdims feature.

# Smaller OBS set of one column.
obs = (
    coderow.is_not_blank()
    .by_index(1)
    .fill(DOWN)
    .is_not_blank()
)

# First check that the dateunit handling works on the years alone
# (including fixing up the .0 value).
hdt = HDim(year, TIME, CLOSEST, ABOVE)
cs = ConversionSegment(tab, [hdt], obs)
writetechnicalCSV(outputfile, cs)

writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/constructionoutput/constructionoutputtab4.csv
conversionwrite segment size 389 table 'Table 4; TIMEUNIT='Year'


In [5]:
# The years don't have the .0 problem.  That has been done in stages of
# post-processing, which are replayed here one at a time on a fresh
# ConversionSegment so the rows can be inspected after every stage.
cs1 = ConversionSegment(tab, [hdt], obs)

print("Initial processing")
cs1.process()  # cs1.processedrows can be read after this; years are still floats such as 1955.0
print(cs1.processedrows[:3])

print("\n\nTIMEUNIT adding")
st = cs1.guesstimeunit()  # returns a message (here TIMEUNIT='Year'); each row gains a 'Year' entry
print("timeunitmessage:", st)
print(cs1.processedrows[:3])

# This function only does something with years (1955.0 becomes '1955'),
# but we can talk about doing it more systematically.
print("\n\nForce TIME to match TIMEUNIT")
cs1.fixtimefromtimeunit()
print(cs1.processedrows[:3])


Initial processing
[{-2: 1955.0, -9: 331.0}, {-2: 1956.0, -9: 319.0}, {-2: 1957.0, -9: 303.0}]


TIMEUNIT adding
timeunitmessage: TIMEUNIT='Year'
[{-1: 'Year', -2: 1955.0, -9: 331.0}, {-1: 'Year', -2: 1956.0, -9: 319.0}, {-1: 'Year', -2: 1957.0, -9: 303.0}]


Force TIME to match TIMEUNIT
[{-1: 'Year', -2: '1955', -9: 331.0}, {-1: 'Year', -2: '1956', -9: 319.0}, {-1: 'Year', -2: '1957', -9: 303.0}]


In [6]:
# Now back to the waterfall-like structure, where we're going to
# apply the year into the month column.
hdt = HDim(year, TIME, CLOSEST, ABOVE)
# unordered_cells are bare cell type, which we can use as keys in
# cellvalueoverride.  As the name says, they come in no particular order.
lsubyear = list(subyear.unordered_cells)

print("Here are some subyear column elements (some are blank)")
for sy in lsubyear[:10]:
    print(sy)

# cellvalobs(cell) gives a (year cell, year value) pair for the given cell;
# both are None when there is no year for it to look up.
print("\nHere are how they lookup to the year column (it's a local lookup) to cell and value of cell")
for sy in lsubyear[:10]:
    print(sy, hdt.cellvalobs(sy))


Here are some subyear column elements (some are blank)
<B168 'Q3'>
<B191 'Q2'>
<B251 'Q2'>
<B9 ''>
<B351 'Jun'>
<B18 ''>
<B67 ''>
<B100 'Q3'>
<B307 'Q2'>
<B176 'Q3'>

Here are how they lookup to the year column (it's a local lookup) to cell and value of cell
<B168 'Q3'> (<A166 1978.0>, 1978.0)
<B191 'Q2'> (<A190 1984.0>, 1984.0)
<B251 'Q2'> (<A250 1999.0>, 1999.0)
<B9 ''> (None, None)
<B351 'Jun'> (<A346 2012.0>, 2012.0)
<B18 ''> (<A18 1962.0>, 1962.0)
<B67 ''> (<A67 2011.0>, 2011.0)
<B100 'Q3'> (<A98 1961.0>, 1961.0)
<B307 'Q2'> (<A306 2013.0>, 2013.0)
<B176 'Q3'> (<A174 1980.0>, 1980.0)


In [15]:
# Map each subyear cell to "<year><subyear>" (e.g. '1978Q3'); cells for which
# no year is found are left out of the mapping.
cvodate = {}
for sy in lsubyear:
    h, v = hdt.cellvalobs(sy)
    if not v:
        continue
    cvodate[sy] = "%d%s" % (int(v), sy.value.strip())

{<B168 'Q3'>: '1978Q3',
 <B191 'Q2'>: '1984Q2',
 <B238 'Q1'>: '1996Q1',
 <B275 'Q2'>: '2005Q2',
 <B133 'Q4'>: '1969Q4',
 <B251 'Q2'>: '1999Q2',
 <B271 'Q2'>: '2004Q2',
 <B222 'Q1'>: '1992Q1',
 <B351 'Jun'>: '2012Jun',
 <B18 ''>: '1962',
 <B67 ''>: '2011',
 <B136 'Q3'>: '1970Q3',
 <B223 'Q2'>: '1992Q2',
 <B395 'Feb '>: '2016Feb',
 <B100 'Q3'>: '1961Q3',
 <B307 'Q2'>: '2013Q2',
 <B176 'Q3'>: '1980Q3',
 <B400 'Jul'>: '2016Jul',
 <B101 'Q4'>: '1961Q4',
 <B204 'Q3'>: '1987Q3',
 <B283 'Q2'>: '2007Q2',
 <B221 'Q4'>: '1991Q4',
 <B38 ''>: '1982',
 <B198 'Q1'>: '1986Q1',
 <B188 'Q3'>: '1983Q3',
 <B372 'Mar '>: '2014Mar',
 <B232 'Q3'>: '1994Q3',
 <B302 'Q1'>: '2012Q1',
 <B42 ''>: '1986',
 <B405 ''>: '2016',
 <B52 ''>: '1996',
 <B144 'Q3'>: '1972Q3',
 <B45 ''>: '1989',
 <B310 'Q1 '>: '2014Q1',
 <B210 'Q1'>: '1989Q1',
 <B280 'Q3'>: '2006Q3',
 <B79 'Q2'>: '1956Q2',
 <B286 'Q1'>: '2008Q1',
 <B203 'Q2'>: '1987Q2',
 <B62 ''>: '2006',
 <B384 'Mar '>: '2015Mar',
 <B390 'Sep'>: '2015Sep',
 <B164 'Q3'>: '1

In [16]:
# Just in case you want to do that all in one expression...
# Cells with no year to look up are skipped, exactly as in the explicit loop;
# without the `if v` filter they would receive a bogus "0..." override.
# The inner generator calls cellvalobs once per cell and pairs the cell with
# its (year cell, year value) result.
cvodate = {
    sy: "%s%s" % (int(v), (sy.value or "").strip())
    for sy, (h, v) in ((c, hdt.cellvalobs(c)) for c in subyear.unordered_cells)
    if v
}


In [17]:
# Use the subyear column as the TIME dimension, with each cell's value
# replaced by the concatenated string held in cvodate.
hdd = HDim(
    subyear,
    TIME,
    DIRECTLY,
    LEFT,
    cellvalueoverride=cvodate,
)
cs2 = ConversionSegment(tab, [hdd], obs)
writetechnicalCSV(outputfile, cs2)
# Alternative call, passing None in place of the file name:
#writetechnicalCSV(None, cs2)
# Running this we see lots of errors.

writing 1 conversion segments into /home/goatchurch/sensiblecode/quickcode-ons-recipes/constructionoutput/constructionoutputtab4.csv
conversionwrite segment size 389 table 'Table 4; multiple TIMEUNITs: ''(81), 'Quarter'(247), 'Year'(61)




In [19]:
# Some of the concatenated dates are not matched by the date-matching code.
# We could either fix up the datematch code in the base library, or hack in
# this lookup table so that the month goes in front of the year when there is one.

# Build the override text for every subyear cell that has a year:
#   quarter label   -> "<year><label>"   e.g. '1978Q3'
#   any other label -> "<label> <year>"  e.g. 'Jun 2012'
#   blank cell      -> "<year>"
cvodate = {}
for sy in lsubyear:
    h, v = hdt.cellvalobs(sy)
    if not v:
        continue  # no year for this cell, so leave its value alone
    y = str(int(v))
    # Strip once and branch on the stripped text, so that stray leading or
    # trailing spaces (e.g. 'Feb ', 'Q1 ') cannot send a label down the wrong
    # branch or leave a whitespace-only cell looking like a month.
    label = (sy.value or "").strip()
    if not label:
        cvodate[sy] = y
    elif label.startswith("Q"):
        cvodate[sy] = y + label
    else:
        cvodate[sy] = label + " " + y

hdd = HDim(subyear, TIME, DIRECTLY, LEFT, cellvalueoverride=cvodate)
cs2 = ConversionSegment(tab, [hdd], obs)
writetechnicalCSV(None, cs2)

# and this comes out quite nicely


observation,data_marking,statistical_unit_eng,statistical_unit_cym,measure_type_eng,measure_type_cym,observation_type,empty,obs_type_value,unit_multiplier,unit_of_measure_eng,unit_of_measure_cym,confidentuality,empty1,geographic_area,empty2,empty3,time_dim_item_id,time_dim_item_label_eng,time_dim_item_label_cym,time_type,empty4,statistical_population_id,statistical_population_label_eng,statistical_population_label_cym,cdid,cdiddescrip,empty5,empty6,empty7,empty8,empty9,empty10,empty11,empty12
331.0,,,,,,,,,,,,,,,,,1955,1955,,Year,,,,,,,,,,,,,,
319.0,,,,,,,,,,,,,,,,,1956,1956,,Year,,,,,,,,,,,,,,
303.0,,,,,,,,,,,,,,,,,1957,1957,,Year,,,,,,,,,,,,,,
256.0,,,,,,,,,,,,,,,,,1958,1958,,Year,,,,,,,,,,,,,,
249.0,,,,,,,,,,,,,,,,,1959,1959,,Year,,,,,,,,,,,,,,
241.0,,,,,,,,,,,,,,,,,1960,1960,,Year,,,,,,,,,,,,,,
253.0,,,,,,,,,,,,,,,,,1961,1961,,Year,,,,,,,,,,,,,,
288.0,,,,,,,,,,,,,,,,,1962,1962,,Year,,,,,,,,,,,,,,
325.0,,,,,,,,,,,,,,,,,1963,1963,,Year,,,,,,,,,,,,,,
412.0,,,,,,,,,,,,,,,,,19