## Preprocesamiento de datos

Este notebook muestra cómo limpiar y normalizar los valores de cuota de las AFP en Chile. Los datos de los fondos de pensiones fueron descargados desde el sitio web de la [Superintendencia de Pensiones](https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFP.php).

In [1]:
import pandas as pd
import numpy as np
import os
import re
import requests
import time

from datetime import datetime

# Disable column-width truncation so long string cells (e.g. the download URLs
# built below) are shown in full when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

## Descargar los datos actualizados desde el sitio web

In [2]:
# Folder for the downloaded files and the parameters of the export request.
dataDir = '../data'
today = datetime.today()
yearList = np.arange(2020,2021)
fondoList = ['A', 'B', 'C', 'D', 'E']

# URL placeholders, in order: fund letter, final year, confirmation date (YYYYMMDD).
urlTemplate = 'https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf={}&aaaaini=0000&aaaafin={}&fecconf={}'
confirmStamp = today.strftime("%Y%m%d")

# One record per (year, fondo) combination, years varying slowest.
dataList = [
    {'year': year, 'fondo': fondo, 'URL': urlTemplate.format(fondo, year, confirmStamp)}
    for year in yearList
    for fondo in fondoList
]

dataFileDF = pd.DataFrame(dataList)
dataFileDF.head()

Unnamed: 0,year,fondo,URL
0,2020,A,https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=A&aaaaini=0000&aaaafin=2020&fecconf=20201216
1,2020,B,https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=B&aaaaini=0000&aaaafin=2020&fecconf=20201216
2,2020,C,https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=C&aaaaini=0000&aaaafin=2020&fecconf=20201216
3,2020,D,https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=D&aaaaini=0000&aaaafin=2020&fecconf=20201216
4,2020,E,https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=E&aaaaini=0000&aaaafin=2020&fecconf=20201216


In [3]:
# Download one export per row of dataFileDF and record where each file was saved.

# Create the target folder up front so open() below cannot fail on a fresh checkout.
os.makedirs(dataDir, exist_ok=True)

# Make sure the column always exists (and is reset on re-run): rows whose
# download fails keep None instead of leaving the column undefined.
dataFileDF['fileName'] = None

# Seconds to wait for the server before giving up on one request.
downloadTimeout = 60

# Extracts the file name (with a 3-character extension) from Content-Disposition.
reFileName = re.compile(r"filename=(.+\.\w{3})")

for index, rowFile in dataFileDF.iterrows():

    url = rowFile['URL']
    print("Downloading URL: ", url)

    try:
        r = requests.get(url, timeout=downloadTimeout)
    except requests.RequestException as err:
        # Network problem or timeout: report it and move on to the next file.
        print("  Download failed: ", err)
        continue

    if r.status_code != 200:
        print("  Skipped, unexpected HTTP status: ", r.status_code)
        continue

    # The server announces the file name in the Content-Disposition header;
    # .get() avoids a KeyError when the header is absent.
    content = r.headers.get('content-disposition', '')
    matchName = reFileName.search(content)
    if matchName is None:
        print("  Skipped, no file name in Content-Disposition: ", repr(content))
        continue

    # Strip optional quotes and any directory part so the server-supplied
    # name can never point outside dataDir.
    baseName = os.path.basename(matchName.group(1).strip('"\''))
    fileName = os.path.join(dataDir, baseName)

    with open(fileName, 'wb') as f:
        f.write(r.content)
    dataFileDF.loc[index, 'fileName'] = fileName

    # Random 1-2 second pause between successful downloads to be gentle on the server.
    sleepTime = 1+np.random.random(1)[0]
    time.sleep(sleepTime)

Downloading URL:  https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=A&aaaaini=0000&aaaafin=2020&fecconf=20201216
Downloading URL:  https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=B&aaaaini=0000&aaaafin=2020&fecconf=20201216
Downloading URL:  https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=C&aaaaini=0000&aaaafin=2020&fecconf=20201216
Downloading URL:  https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=D&aaaaini=0000&aaaafin=2020&fecconf=20201216
Downloading URL:  https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFPxls.php?tf=E&aaaaini=0000&aaaafin=2020&fecconf=20201216


## Creamos la carpeta results para guardar el output

In [4]:
# exist_ok=True makes the cell safe to re-run and removes the race between
# checking for the folder and creating it; missing parents are created too.
os.makedirs('../results', exist_ok=True)

In [5]:
# Line classifiers for the downloaded files:
#   reBlock  - a line starting with "Valores" opens a new block of data
#   reHeader - a line starting with "Fecha" is the column header of a block
#   reData   - a line starting with a YYYY-MM-DD date is a data row
reBlock = re.compile(r'^Valores.*')
reHeader = re.compile(r'^Fecha.*')
reData = re.compile(r'^\d{4}-\d{2}-\d{2}.*')
# Number written with '.' as thousands separator and ',' as decimal separator.
# The thousands dot is optional so values below 1.000 are parsed as well.
reCuota = re.compile(r'\d[\d.]*(?:,\d+)?')


def _parse_valor_cuota(text):
    """Convert a quota value such as '50.966,52' to float; NaN if not a number."""
    text = text.strip()
    if not reCuota.fullmatch(text):
        return np.nan
    return float(text.replace('.', '').replace(',', '.'))


def _parse_valor_patrimonio(text):
    """Convert a patrimonio field to float; NaN if it is empty or not numeric.

    A field that starts with a digit but is not a valid float still raises
    ValueError, so an unexpected format change does not pass unnoticed.
    """
    text = text.strip()
    if not re.match(r'[0-9]', text):
        return np.nan
    return float(text)


def _read_blocks(fileName):
    """Split one downloaded file into its blocks.

    Returns a list of dicts with keys 'blockIdx', 'lineIdx' (line where the
    block starts), 'headerRaw' (most recent header line seen in this file when
    the block was closed, or None) and 'dataRaw' (list of raw data lines).
    All state is local, so nothing leaks from one file to the next.
    """
    blocks = []
    current = None      # block being filled; None until one is opened
    headerRaw = None    # last header line seen in this file
    blockIdx = 0

    # NOTE(review): the platform default encoding is used, as before —
    # confirm the encoding of the downloaded files and pass it explicitly.
    with open(fileName, 'r') as f:
        for lineIdx, line in enumerate(f):
            matchHeader = reHeader.match(line)
            matchData = reData.match(line)

            if reBlock.match(line):
                # Close the previous block; empty intermediate blocks are dropped.
                if current is not None and current['dataRaw']:
                    current['headerRaw'] = headerRaw
                    blocks.append(current)
                current = {'blockIdx': blockIdx, 'lineIdx': lineIdx, 'dataRaw': []}
                blockIdx = blockIdx+1
            elif matchHeader:
                headerRaw = matchHeader.group(0)
            elif matchData:
                if current is None:
                    # Data before any "Valores" line: keep it in an implicit
                    # block instead of failing on an undefined block.
                    current = {'blockIdx': blockIdx, 'lineIdx': lineIdx, 'dataRaw': []}
                    blockIdx = blockIdx+1
                current['dataRaw'].append(matchData.group(0))

    # The last block is closed by the end of the file.
    if current is not None:
        current['headerRaw'] = headerRaw
        blocks.append(current)
    return blocks


def _block_records(block, fondo):
    """Turn one block into long-format records, one per (date, administradora).

    The header is read in steps of two starting at column 1: column i holds the
    administradora name and the quota value, column i+1 the patrimonio value.
    """
    header = block['headerRaw'].split(';')
    records = []
    for rowData in block['dataRaw']:
        rowValues = rowData.split(';')
        # Parse the date once per row instead of once per administradora.
        fecha = datetime.strptime(rowValues[0], '%Y-%m-%d')

        for i in range(1, len(header), 2):
            # Rows shorter than the header yield NaN instead of IndexError.
            cuotaRaw = rowValues[i] if i < len(rowValues) else ''
            patrimonioRaw = rowValues[i+1] if i+1 < len(rowValues) else ''
            records.append({'fecha': fecha,
                            'fondo': fondo,
                            'administradora': header[i],
                            'valor cuota': _parse_valor_cuota(cuotaRaw),
                            # Validated on its own field (the original tested the
                            # quota field and reset 'valor cuota' by mistake).
                            'valor patrimonio': _parse_valor_patrimonio(patrimonioRaw)})
    return records


dataList = []
for index, rowFile in dataFileDF.iterrows():

    fileName = rowFile.get('fileName')
    if not isinstance(fileName, str):
        # The download for this row did not produce a file.
        print("Skipping fondo {} ({}): no downloaded file".format(rowFile['fondo'], rowFile['year']))
        continue

    print("Processing file: ", fileName)

    for item in _read_blocks(fileName):
        print(item['blockIdx'], item['lineIdx'], len(item['dataRaw']))

        if item['headerRaw'] is None:
            print("  No 'Fecha' header found for this block; skipping it")
            continue

        dataList.extend(_block_records(item, rowFile['fondo']))

dataDF = pd.DataFrame(dataList)

Processing file:  ../data\vcfA2020-2020.csv
0 1 349
1 355 14
Processing file:  ../data\vcfB2020-2020.csv
0 1 349
1 355 14
Processing file:  ../data\vcfC2020-2020.csv
0 1 349
1 355 14
Processing file:  ../data\vcfD2020-2020.csv
0 1 349
1 355 14
Processing file:  ../data\vcfE2020-2020.csv
0 1 349
1 355 14


In [6]:
# Sanity check: show the last 7 parsed records that belong to fondo A.
dataDF.loc[dataDF['fondo'].eq('A')].tail(7)

Unnamed: 0,fecha,fondo,administradora,valor cuota,valor patrimonio
2534,2020-12-14,A,CAPITAL,50966.52,4079566000000.0
2535,2020-12-14,A,CUPRUM,53034.07,5059062000000.0
2536,2020-12-14,A,HABITAT,54347.4,6502753000000.0
2537,2020-12-14,A,MODELO,52515.25,1283214000000.0
2538,2020-12-14,A,PLANVITAL,49276.5,537196600000.0
2539,2020-12-14,A,PROVIDA,53796.84,3928257000000.0
2540,2020-12-14,A,UNO,54753.52,44448750000.0


In [7]:
# Total number of records parsed across all files.
print(dataDF.shape[0])

12705


In [8]:
# Write the cleaned long-format table to the results folder; the positional
# index is left out of the CSV.
outputFileName = '../results/afp_chile_2020.csv'
dataDF.to_csv(path_or_buf=outputFileName, index=False)