## Preprocesamiento de datos

Este notebook muestra cómo limpiar y normalizar los valores de cuota de las AFP en Chile. Los datos de los fondos de pensiones fueron descargados desde el sitio web de la Superintendencia de AFP:
https://www.spensiones.cl/apps/valoresCuotaFondo/vcfAFP.php

In [22]:
import pandas as pd
import numpy as np
import os
import re

from datetime import datetime

In [23]:
# Lookup table: one row per fund letter with the path of its raw CSV export.
fundFiles = {
    'A': '../data/vcfA2020-2020.csv',
    'B': '../data/vcfB2020-2020.csv',
    'C': '../data/vcfC2020-2020.csv',
    'D': '../data/vcfD2020-2020.csv',
    'E': '../data/vcfE2020-2020.csv',
}
dataFileDF = pd.DataFrame(list(fundFiles.items()), columns=['fondo', 'fileName'])
dataFileDF.head()

Unnamed: 0,fondo,fileName
0,A,../data/vcfA2020-2020.csv
1,B,../data/vcfB2020-2020.csv
2,C,../data/vcfC2020-2020.csv
3,D,../data/vcfD2020-2020.csv
4,E,../data/vcfE2020-2020.csv


In [24]:
# Create the results folder that will hold the output.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs('../results', exist_ok=True)

In [33]:
# Lines starting with "Fecha" carry the column header; lines starting with a
# YYYY-MM-DD date carry the values. Every other line in the file is ignored.
reHeader = re.compile(r'^Fecha.*')
reData = re.compile(r'^\d{4}-\d{2}-\d{2}.*')


def _parseFundFile(fileName, fondo):
    """Parse one semicolon-separated fund file into long-format records.

    The header is split on ';' and read in steps of two starting at index 1:
    ``header[i]`` is used as the 'administradora' name, and the data fields at
    positions ``i`` and ``i + 1`` are read as 'valor cuota' and
    'valor patrimonio' respectively. If the file contains several header
    lines, the last one is applied to every data row.

    Parameters
    ----------
    fileName : str
        Path of the file to read.
    fondo : str
        Fund label stored in the 'fondo' field of every record.

    Returns
    -------
    list of dict
        One dict per (data row, administradora) pair with the keys 'fecha',
        'fondo', 'administradora', 'valor cuota' and 'valor patrimonio'.

    Raises
    ------
    ValueError
        If the file has no header line, or a numeric field cannot be
        converted to float.
    """
    # Reset for every file so a file without a header can never silently
    # inherit the header of the previously processed file.
    headerRaw = None
    dataRaw = []
    with open(fileName, 'r') as f:
        for line in f:
            matchHeader = reHeader.match(line)
            if matchHeader:
                headerRaw = matchHeader.group(0)
                continue
            matchData = reData.match(line)
            if matchData:
                dataRaw.append(matchData.group(0))

    if headerRaw is None:
        raise ValueError("No header line starting with 'Fecha' found in " + str(fileName))

    header = headerRaw.split(';')

    records = []
    for rowData in dataRaw:
        rowValues = rowData.split(';')
        # The date is the same for every administradora in the row: parse it once.
        fecha = datetime.strptime(rowValues[0], '%Y-%m-%d')

        for i in range(1, len(header), 2):
            records.append({
                'fecha': fecha,
                'fondo': fondo,
                'administradora': header[i],
                # '.' is stripped and ',' becomes the decimal point before float()
                # (presumably '.' thousands / ',' decimal formatting - confirm).
                'valor cuota': float(rowValues[i].replace('.', '').replace(',', '.')),
                'valor patrimonio': float(rowValues[i + 1]),
            })

    return records


dataList = []
for fondo, fileName in zip(dataFileDF['fondo'], dataFileDF['fileName']):
    print("Processing file: ", fileName)
    dataList.extend(_parseFundFile(fileName, fondo))

dataDF = pd.DataFrame(dataList)
dataDF.head(10)

Processing file:  ../data/vcfA2020-2020.csv
Processing file:  ../data/vcfB2020-2020.csv
Processing file:  ../data/vcfC2020-2020.csv
Processing file:  ../data/vcfD2020-2020.csv
Processing file:  ../data/vcfE2020-2020.csv


Unnamed: 0,fecha,fondo,administradora,valor cuota,valor patrimonio
0,2020-01-01,A,CAPITAL,48914.38,4075437000000.0
1,2020-01-01,A,CUPRUM,51398.91,5095773000000.0
2,2020-01-01,A,HABITAT,52141.66,5928281000000.0
3,2020-01-01,A,MODELO,50732.63,1202176000000.0
4,2020-01-01,A,PLANVITAL,47650.9,498737000000.0
5,2020-01-01,A,PROVIDA,51292.03,4232862000000.0
6,2020-01-01,A,UNO,52831.96,8902967000.0
7,2020-01-02,A,CAPITAL,49130.11,4093501000000.0
8,2020-01-02,A,CUPRUM,51590.88,5112758000000.0
9,2020-01-02,A,HABITAT,52349.68,5951221000000.0


In [34]:
# Write the normalized long-format table to disk, leaving out the row index.
outputFileName = '../results/afp_chile_2020.csv'
dataDF.to_csv(path_or_buf=outputFileName, index=False)