In [10]:
import os
import xlutils
import databaker
import databaker.constants
from databaker.constants import *
import xypath
import databaker.databakersolo as ds
from databaker.jupybakeutils import savepreviewhtml, writetechnicalCSV, procrows, TechnicalCSV, yield_dimension_values
# Module-level aliases for two attributes of ds.overrides, so that later cells
# can refer to them by their short names.
Dcelllookup, batchcelllookup = ds.overrides.Dcelllookup, ds.overrides.batchcelllookup


In [4]:
# Workbook to read, and the file names derived from or used alongside it.
inputfile = "gdpq3m2reftables_tcm77-425649.xls"
# Same base name as the workbook, with the extension swapped for .csv.
outputfile = os.path.splitext(inputfile)[0] + ".csv"
previewfile = "preview.html"


In [5]:
# Report which workbook is being read and how large it is on disk.
print("Loading %s which has size %d bytes" % (inputfile, os.path.getsize(inputfile)))

# Open the workbook and collect every sheet matched by the "*" pattern.
tableset = xypath.loader.table_set(inputfile, extension='xls')
tabs = [sheet for sheet in xypath.loader.get_sheets(tableset, "*")]

print("Table names", [sheet.name for sheet in tabs])


Loading gdpq3m2reftables_tcm77-425649.xls which has size 2115072 bytes
Table names ['Content', 'Index', 'A1 AGGREGATES', 'A2 AGGREGATES', 'B1 CVM OUTPUT', 'B2 CVM OUTPUT', 'C1 EXPENDITURE', 'C2 EXPENDITURE', 'D INCOME', 'H1 TRADE', 'H2 TRADE', 'L GVAbp', 'M Alignment adjustments', 'N Financial Year Variables', 'O Selected imp def', 'P GDP per head']


In [7]:
# Build one conversion segment -- a (tab, dimensions, observations) tuple -- for
# each selected sheet, then write an HTML preview of the last segment built.

# Sheets converted on this run.  Only the "A" and "B" sheets have a layout
# described in the loop below; any other name listed here is skipped by the
# final `else: continue`.
selected_tabs = ["B1 CVM OUTPUT"]

# Measure type recorded against each sheet, keyed by tab name.  Built once here
# instead of being rebuilt on every pass through the loop.
mtype = {
    'A1 AGGREGATES': 'Number', 'A2 AGGREGATES': 'Value',
    'B1 CVM OUTPUT': 'Number', 'B2 CVM OUTPUT': 'Number',
    'C1 EXPENDITURE': 'Value', 'C2 EXPENDITURE': 'Value',
    'D INCOME': 'Value', 'L GVAbp': 'Number',
    'M Alignment adjustments': 'Value', 'N Financial Year Variables': 'Value',
    'O Selected imp def': 'Number', 'P GDP per head': 'Number',
    'H1 TRADE': 'Value', 'H2 TRADE': 'Value',
}


def _nonblank(bag):
    """Return `bag` filtered through is_not_blank() and then is_not_whitespace()."""
    return bag.is_not_blank().is_not_whitespace()


conversionsegments = [ ]

for tab in tabs:

    if tab.name not in selected_tabs:
        continue

    # The anchor is picked by index from the non-blank cells of column A
    # (from A2 down): index 2 on the "B" sheets, index 1 everywhere else.
    skipcount = 2 if tab.name in ['B1 CVM OUTPUT', 'B2 CVM OUTPUT'] else 1
    anchor = _nonblank(tab.excel_ref('A2').expand(DOWN)).by_index(skipcount)

    # Candidate observations: non-blank cells to the right of the anchor and
    # below, minus the region swept out from the non-blank cells of column B.
    obs = _nonblank(anchor.fill(RIGHT).expand(DOWN))
    obs = obs - _nonblank(tab.excel_ref('B2').expand(DOWN)).expand(RIGHT).expand(LEFT).expand(DOWN)

    if tab.name in ["A1 AGGREGATES", "A2 AGGREGATES"]:
        # anchor/obs computed above already use index 1 for these sheets, so
        # they are not recomputed here.
        # NOTE(review): LEFTRIGHT is not exercised by the current tab
        # selection; confirm the name is provided by databaker.constants
        # before enabling these sheets.
        dimensions = [
            (_nonblank(tab.excel_ref('A3').expand(RIGHT)), "Cat1", DIRECTLY, ABOVE),
            (_nonblank(tab.excel_ref('A2').expand(RIGHT)).parent(), "Cat2", CLOSEST, LEFTRIGHT),
        ]
    elif tab.name in ['B1 CVM OUTPUT', 'B2 CVM OUTPUT']:
        top = _nonblank(tab.excel_ref('A2').expand(RIGHT))
        dimensions = [
            (top, "Category", CLOSEST, LEFT),
            (_nonblank(tab.excel_ref('A3').expand(RIGHT)), "Cat2", DIRECTLY, ABOVE),
        ]
    else:
        continue

    # further dimensions common to all pages
    # Non-bold cells in the anchor's column supply TIME.
    dimensions.append((_nonblank(anchor.expand(DOWN)).is_not_bold(), TIME, DIRECTLY, LEFT))
    dimensions.append((mtype[tab.name], MEASURETYPE, None, None))

    # Not reachable with the branches above (the "C" sheets hit `continue`);
    # kept so the units are already in place if those sheets gain a layout.
    if tab.name == 'C1 EXPENDITURE':
        dimensions.append(('£ million', UNITOFMEASURE, None, None))
    if tab.name == 'C2 EXPENDITURE':
        dimensions.append(('Reference year 2012, £ million', UNITOFMEASURE, None, None))

    # CDID cells: the non-blank cells in the row directly above the anchor,
    # plus non-blank cells from waffling that row with the blank cells of the
    # anchor's column (presumably the row/column intersections -- confirm
    # against the xypath waffle documentation).
    cdid_horizontal = _nonblank(anchor.shift(UP).fill(RIGHT))
    cdid_vertical = anchor.expand(DOWN).is_blank()
    cdid = _nonblank(cdid_horizontal.waffle(cdid_vertical)) | cdid_horizontal

    # CDID cells and cells matched by contains_string('-') are not observations.
    obs = obs - cdid
    obs = obs - obs.filter(contains_string('-'))
    dimensions.append((cdid, 'CDID', DIRECTLY, ABOVE))
    # Bold cells in the anchor's column supply the 'Output' dimension.
    dimensions.append((_nonblank(anchor.expand(DOWN)).is_bold(), 'Output', CLOSEST, ABOVE))

    conversionsegments.append((tab, dimensions, obs))

# this is the preview system
# Fail with an explicit message rather than a bare IndexError when no sheet
# was selected or matched.
if not conversionsegments:
    raise ValueError("no conversion segments were built; check selected_tabs against the sheet names")
conversionsegment = conversionsegments[-1]
# None is passed in place of batchcelllookup for the preview.
savepreviewhtml(conversionsegment, None, previewfile)


opening file preview.html
table written


In [11]:
# Stage 1: turn every conversion segment into its rows via procrows.
print("converting and writing %d conversion segments into %s" % (len(conversionsegments), outputfile))
convertedrows = [ ]
for segment in conversionsegments:
    print("conversion segment size %d" % len(segment[2]))
    convertedrows.append(procrows(segment, batchcelllookup))

# here we can do further sorting and substitution
# Stage 2: write the rows of every segment through a single TechnicalCSV.
csvout = TechnicalCSV(outputfile, False)
for segment_rows, segment in zip(convertedrows, conversionsegments):
    # Position 0 holds None; the non-integer dimension labels of this segment
    # follow in their original order.
    headernames = [None]
    for dimension in segment[1]:
        label = dimension[1]
        if type(label) is not int:
            headernames.append(label)

    for row in segment_rows:
        # Integer keys are kept as they are; every other key is replaced by
        # its position in headernames.
        values = { }
        for key, value in row.items():
            column = key if type(key) is int else headernames.index(key)
            values[column] = value
        csvout.output(yield_dimension_values(values, headernames))
csvout.footer()



converting and writing 1 conversion segments into gdpq3m2reftables_tcm77-425649.csv
conversion segment size 3533


In [None]:


# Alternative exploration: load the values into pandas, pivot them and plot one
# series (made by Peter).
# NOTE(review): `mallvalues` is not defined in any visible cell of this notebook
# and this cell has no execution count, so on a fresh kernel it presumably
# fails with a NameError -- confirm where `mallvalues` is meant to come from.
# NOTE(review): the "Mergers" and "Area" columns used below are not among the
# dimension names built in the visible cells ("Cat1", "Cat2", "Category",
# "CDID", "Output"); this looks carried over from another notebook -- confirm.
import pandas as pd
df = ( 
    pd.DataFrame.from_records(mallvalues)
    # Give the integer column labels -6, -2 and -9 readable names.
    .rename(columns={-6:"MEASURETYPE", -2:"TIME", -9:"OBS"})
    # Cast TIME to int.
    .assign(TIME=lambda df:df.TIME.astype(int))
    #.assign(TIME2=lambda df:df.TIME.astype(int)*2)
)

# Pivot OBS with TIME as the index and a three-level column index
# ("Mergers", "MEASURETYPE", "Area").
pdf = df.pivot_table("OBS", ["TIME"], ["Mergers", "MEASURETYPE", "Area"])
# ["Area Analysis of Acquisitions Abroad by UK Companies"].xs("Number", level="MEASURETYPE", axis=1).plot()
# Distinct values of column levels 0 ("Mergers") and 1 ("MEASURETYPE").
acqtypes = pdf.columns.get_level_values(0).unique()
valtypes = pdf.columns.get_level_values(1).unique()

# Plot the columns under the first value of each of those two levels.
acqtype = acqtypes[0]
valtype = valtypes[0]
pdf[acqtype][valtype].plot(legend=False, title=acqtype)
