In [50]:
import pdftotext, re, pandas as pd
from glob import glob
from datetime import datetime

def parse_visa(f):
    with open(f, "rb") as fp:
        pdf = pdftotext.PDF(fp)

    result = ''
    for page in pdf:
        start = None
        end = None
        possible_starts = ('FECHA',)
        possible_ends = ('¿Quiere pagar', 'SALDO ACTUAL')
        for s in possible_starts:
            if s in page:
                i = page.index(s)
                if start is None or i > start:
                    start = i

        for e in possible_ends:
            if e in page:
                i = page.index(e)
                if end is None or i > end:
                    end = i

        if None not in (start, end):
            result += page[start:end]

    def parse_value(v):
        v = v.strip()
        if v.startswith('FECHA') or v.startswith('SALDO ANTERIOR'):
            return None
        # print(v)
        res = re.match(
            r'^([0-9]{2})\.([0-9]{2})\.([0-9]{2})\s+'
            r'(?:[0-9]{6}.?)?\s*'
            r'(.+?)'
            r'(\s{1,}Cuota\s+\d{2}/\d{2})?\s{2,}'
            r'([A-Z]{3}\s*[\d\,\.]+-?)?\s*'
            r'([\d\,\.]+-?)(?:\s*_)?$'
            , v)
        if res is None:
            # print(v)
            return None
        g = res.groups()
        n = float(g[6].replace('.', '').replace('-', '').replace(',', '.'))
        is_usd = g[5] is None and g[3] != 'SU PAGO EN USD'
        n *= -1 if '-' in g[6] else 1
        return (datetime(2000 + int(g[2]), int(g[1]), int(g[0])), g[3], 0.0 if not is_usd else n, 0.0 if is_usd else n)

    values = [parse_value(v) for v in result.split('\n')]
    return pd.DataFrame([v for v in values if v is not None], columns=('date', 'description', 'ars', 'usd'))

    def dashrepl(matchobj):
        if matchobj.group(0) == '***,**': return ' '
        else: return '***,**'
    return re.sub(r'[0-9\.]+,[0-9]{2}', dashrepl, result)

df = pd.concat([parse_visa(x) for x in glob('data/resumen_cuenta_visa*.pdf')]).reset_index().drop(['index'], axis=1)
df['ars'] = df['ars'].apply(lambda x: '0.00' if x == 0.0 else ('-***.**' if x < 0.0 else '***.**'))
df['usd'] = df['usd'].apply(lambda x: '0.00' if x == 0.0 else ('-***.**' if x < 0.0 else '***.**'))
df

Unnamed: 0,date,description,ars,usd
0,2018-11-05,SU PAGO EN PESOS,-***.**,0.00
1,2018-11-05,SU PAGO EN USD,0.00,-***.**
2,2018-01-12,CLARO EQUIPOS,***.**,0.00
3,2018-04-05,DISTRIBUIDORES INDEPEND PSA,***.**,0.00
4,2018-05-06,DESPEGAR.COM,***.**,0.00
5,2018-05-06,DESPEGAR.COM,***.**,0.00
6,2018-05-06,DESPEGAR.COM,***.**,0.00
7,2018-05-14,WWW.DESPEGAR.COM,***.**,0.00
8,2018-08-30,WWW.JUMBOACASA.COM.AR/5231,***.**,0.00
9,2018-09-06,WWW.JUMBOACASA.COM.AR/5231,***.**,0.00


In [69]:
def parse_icbc(f):
    with open(f, "rb") as fp:
        pdf = pdftotext.PDF(fp)

    # This breaks cross-year, but it doesn't seem to happen
    year = int(re.search(r'PERIODO \d\d-\d\d-(\d{4})', pdf[0]).groups()[0])
    result = ''
    for page in pdf:
        start = None
        end = None
        possible_starts = ('FECHA',)
        possible_ends = ('CONTINUA', 'SALDO FINAL')
        for s in possible_starts:
            if s in page:
                i = page.index(s)
                if start is None or i > start:
                    start = i

        for e in possible_ends:
            if e in page:
                i = page.index(e)
                if end is None or i > end:
                    end = i

        if None not in (start, end):
            result += page[start:end]

    def parse_value(v):
        v = v.strip()
        if v.startswith('FECHA') or v.startswith('SALDO ANTERIOR'):
            return None
        # print(v)
        res = re.match(
            r'^([0-9]{2})-([0-9]{2})\s+'
            r'(?:[0-9]{6}.?)?\s*'
            r'(.+?)'
            r'([\d\,\.]+-?)(?:\s*_)?$'
            , v)
        if res is None:
            # print(v)
            return None
        g = res.groups()
        n = float(g[3].replace('.', '').replace('-', '').replace(',', '.'))
        n *= -1 if '-' in g[3] else 1
        return (datetime(year, int(g[1]), int(g[0])), g[2], n, 0.0)

    values = [parse_value(v) for v in result.split('\n')]
    return pd.DataFrame([v for v in values if v is not None], columns=('date', 'description', 'ars', 'usd'))

    def dashrepl(matchobj):
        if matchobj.group(0) == '***,**': return ' '
        else: return '***,**'
    return re.sub(r'[0-9\.]+,[0-9]{2}', dashrepl, result)

df = pd.concat([parse_icbc(x) for x in glob('data/EXT.DE.MOVIMIENTOS*.pdf')]).reset_index().drop(['index'], axis=1)
df['ars'] = df['ars'].apply(lambda x: '0.00' if x == 0.0 else ('-***.**' if x < 0.0 else '***.**'))
df['usd'] = df['usd'].apply(lambda x: '0.00' if x == 0.0 else ('-***.**' if x < 0.0 else '***.**'))
df

Unnamed: 0,date,description,ars,usd
0,2018-04-03,CRED. CAPITAL POR PAGO DE IPF ...,***.**,0.00
1,2018-04-03,CREDITO INT. DE PLAZO FIJO ...,***.**,0.00
2,2018-04-03,CRED. CAPITAL POR PAGO DE IPF ...,***.**,0.00
3,2018-04-03,CREDITO INT. DE PLAZO FIJO ...,***.**,0.00
4,2018-04-03,PAGO EDESUR ...,-***.**,0.00
5,2018-04-03,PAGO GCBA INM/ABL ...,-***.**,0.00
6,2018-04-03,PAGO METROGAS ...,-***.**,0.00
7,2018-04-03,PAGO AYSA ...,-***.**,0.00
8,2018-04-03,TRANSF. ACC.B. ...,-***.**,0.00
9,2018-04-03,TRANS PAG SUEL ...,***.**,0.00
