# ETL for Banxico & INEGI data

The following series are going to be extracted:
- FX: MXN to USD
- TIIE 28 days
- Inflation rate INPC


@roman_avj

18 mar 24

---
# Settings

In [1]:
import os
import boto3
import awswrangler as wr
import pandas as pd
import yaml
from dotenv import load_dotenv
from datetime import datetime
from tqdm import tqdm

from INEGIpy import Indicadores
from sie_banxico import SIEBanxico

In [2]:
# Populate the process environment from a local .env file
load_dotenv()

# Parse the project configuration (one directory above the notebook)
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Run date as a 'YYYY-MM-DD' string
TODAY = f"{datetime.today():%Y-%m-%d}"


---
# Extract

## INEGI

In [3]:
# INEGI client, authenticated with the token read from the environment
inegi_api = Indicadores(token=os.getenv('API_INEGI'))

# The config maps output column name -> INEGI indicator id
inegi_series = config['inegi']['series']

# Download the indicators from the configured start date and label the index 'date'
df_inpc = inegi_api.obtener_df(
    indicadores=list(inegi_series.values()),
    nombres=list(inegi_series.keys()),
    inicio=config['inegi']['starting_date'].strftime('%Y-%m-%d'),
).rename_axis('date')


  df.set_index(pd.to_datetime(df.fechas),inplace=True, drop=True)


In [4]:
# Preview the INPC frame as returned by the INEGI client
df_inpc

Unnamed: 0_level_0,inpc
date,Unnamed: 1_level_1
2018-01-01,98.795000
2018-02-01,99.171374
2018-03-01,99.492157
2018-04-01,99.154847
2018-05-01,98.994080
...,...
2023-10-01,130.609000
2023-11-01,131.445000
2023-12-01,132.373000
2024-01-01,133.555000


## Banxico

In [7]:
# Banxico SIE client for the series ids listed in the config
banxico_api = SIEBanxico(
    token=os.getenv('API_BANXICO'),
    id_series=list(config['banxico']['series'].values()),
    language='en'
)

# Download FX and TIIE observations from the configured start date through today
series_data = banxico_api.get_timeseries_range(
    init_date=config['banxico']['starting_date'].strftime('%Y-%m-%d'),
    end_date=TODAY
)

# Build one single-column frame per series, keyed by series id
dict_df = {}
for serie in series_data['bmx']['series']:
    # each observation is a record with 'fecha' (dd/mm/YYYY) and 'dato'
    df = pd.DataFrame(serie['datos'])

    # parse the date once and keep it as a DatetimeIndex (no string round-trip)
    df['date'] = pd.to_datetime(df['fecha'], format='%d/%m/%Y')
    df = (
        df.set_index('date')
        .drop(columns=['fecha'])
        # name the value column after the series id; mapped to a friendly name below
        .rename(columns={'dato': serie['idSerie']})
    )
    dict_df[serie['idSerie']] = df

# Outer-join on the date index: series may not share exactly the same dates.
# Sort explicitly so row order does not depend on which series comes first.
df_banxico = (
    pd.concat(list(dict_df.values()), axis=1, join='outer')
    .sort_index()
    .rename_axis('date')
)

# Map series ids back to the friendly names used as keys in the config
cols2rename = {v: k for k, v in config['banxico']['series'].items()}
df_banxico = df_banxico.rename(columns=cols2rename)

In [8]:
# Preview the joined Banxico frame (one column per series)
df_banxico

Unnamed: 0_level_0,tiie,dollar_fx
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,7.6311,19.48990
2018-01-03,7.6275,19.37170
2018-01-04,7.6250,19.24270
2018-01-05,7.6250,19.27370
2018-01-08,7.6292,19.23230
...,...,...
2024-03-13,11.4748,16.71270
2024-03-14,11.4700,16.69200
2024-03-15,11.4750,16.71000
2024-03-19,11.4775,16.85230


---
# Transform

## INEGI

In [9]:
# Count missing values per column in the raw INPC data
df_inpc.isna().sum()

inpc    0
dtype: int64

In [10]:
# Check the column dtypes of the INPC frame
df_inpc.dtypes

inpc    float64
dtype: object

In [11]:
# Full month-start calendar from the configured start date through today
inpc_start = config['inegi']['starting_date'].strftime('%Y-%m-%d')
idx = pd.date_range(inpc_start, TODAY, freq='MS')

# Align INPC to that calendar; months with no published value become NaN
df_inpc = df_inpc.reindex(index=idx)

In [12]:
# Show the months that have no INPC value after reindexing
df_inpc[df_inpc.isna().any(axis=1)]

Unnamed: 0,inpc
2024-03-01,


In [13]:
# The new index comes from `idx`, which has no name; label it 'date' again
df_inpc = df_inpc.rename_axis('date')

In [14]:
# Final INPC frame (monthly, indexed by 'date')
df_inpc

Unnamed: 0_level_0,inpc
date,Unnamed: 1_level_1
2018-01-01,98.795000
2018-02-01,99.171374
2018-03-01,99.492157
2018-04-01,99.154847
2018-05-01,98.994080
...,...
2023-11-01,131.445000
2023-12-01,132.373000
2024-01-01,133.555000
2024-02-01,133.681000


The missing INPC value is not imputed: INPC is an observed series, so the month is left empty until a value is published.

## Banxico

In [15]:
# Count missing values per column in the joined Banxico data
df_banxico.isna().sum()


tiie         0
dollar_fx    1
dtype: int64

In [16]:
# Check the column dtypes of the Banxico frame
df_banxico.dtypes


tiie         object
dollar_fx    object
dtype: object

In [17]:
# Cast the Banxico columns to float: they are object dtype at this point,
# while df_inpc is already float64 and needs no conversion.
# NOTE(review): astype raises if the API ever returns a non-numeric
# placeholder (e.g. a "not available" marker) — confirm against the Banxico docs.
df_banxico = df_banxico.astype(float)

In [18]:
# Business-day calendar from the configured start date through today
banxico_start = config['banxico']['starting_date'].strftime('%Y-%m-%d')
idx = pd.date_range(start=banxico_start, end=TODAY, freq='B')

# Align the Banxico data to that calendar; days with no observation become NaN
df_banxico = df_banxico.reindex(index=idx)

In [19]:
# Show the business days with at least one missing value after reindexing
df_banxico[df_banxico.isna().any(axis=1)]

Unnamed: 0,tiie,dollar_fx
2018-01-01,,
2018-02-05,,
2018-03-19,,
2018-03-29,,
2018-03-30,,
2018-05-01,,
2018-11-02,,
2018-11-19,,
2018-12-12,,
2018-12-25,,


In [20]:
# Count missing values per column after reindexing
df_banxico.isna().sum()

tiie         58
dollar_fx    59
dtype: int64

In [21]:
# Fill gaps with the previous observation (ffill); bfill then covers leading
# rows that have no earlier value to carry forward (e.g. the first calendar day)
df_banxico = df_banxico.ffill().bfill()

In [22]:
# Label the index 'date' and display the final Banxico frame
df_banxico = df_banxico.rename_axis('date')
df_banxico

Unnamed: 0_level_0,tiie,dollar_fx
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,7.6311,19.48990
2018-01-02,7.6311,19.48990
2018-01-03,7.6275,19.37170
2018-01-04,7.6250,19.24270
2018-01-05,7.6250,19.27370
...,...,...
2024-03-14,11.4700,16.69200
2024-03-15,11.4750,16.71000
2024-03-18,11.4750,16.71000
2024-03-19,11.4775,16.85230


---
# Load

In [23]:
# AWS session built from a named local credentials profile, plus an S3 client from it
# NOTE(review): the profile name is hard-coded — consider reading it from config
session = boto3.Session(profile_name="arquitectura")
s3 = session.client('s3')

In [24]:
# S3 destination, read from the `aws` section of the config
aws_cfg = config['aws']
BUCKET_NAME = aws_cfg['bucket']      # target bucket
SUB_BUCKET = aws_cfg['sub-bucket']   # leading key prefix used for every upload

In [28]:
# Upload to S3 each dataframe
def write_to_s3(df, folder, filename):
    """Write `df` as a CSV to s3://BUCKET_NAME/SUB_BUCKET/<folder>/<name>/<filename>.

    `<name>` is `filename` without its trailing '.csv'. Uses the notebook-level
    `s3`, `session`, `BUCKET_NAME` and `SUB_BUCKET`.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data to upload; the index is written as the first CSV column.
        A Series is promoted to a one-column DataFrame first.
    folder : str
        Key prefix under SUB_BUCKET (e.g. 'raw').
    filename : str
        Name of the CSV object to write.
    """
    # strip only a trailing '.csv'; str.replace would also remove inner occurrences
    filename_dir = filename[:-len('.csv')] if filename.endswith('.csv') else filename
    prefix = f"{SUB_BUCKET}/{folder}/{filename_dir}"

    # create an empty object at the prefix key (acts as a folder placeholder)
    s3.put_object(Bucket=BUCKET_NAME, Key=f"{prefix}/")

    # awswrangler documents `df` as a DataFrame, so promote a Series explicitly
    if isinstance(df, pd.Series):
        df = df.to_frame()

    # write to s3
    wr.s3.to_csv(
        df=df,
        path=f"s3://{BUCKET_NAME}/{prefix}/{filename}",
        index=True,
        boto3_session=session,
    )

# Datasets to upload, keyed by their entry in config['aws']['filenames']
df_to_s3 = {
    'inpc': df_inpc,
    'dollar_fx': df_banxico['dollar_fx'],
    'tiie': df_banxico['tiie']
}

# Upload every dataset under the 'raw' prefix
for key, value in tqdm(df_to_s3.items()):
    write_to_s3(
        df=value,
        folder='raw',
        filename=config['aws']['filenames'][key]
    )

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
