# Comunicados de Política Monetaria
## Parte 2.

En este notebook analizaremos por medio de expresiones regulares los anunciones de las decisiones de política monetaria de Banxico.

In [1]:
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from unidecode import unidecode
from textract import process
from multiprocessing import pool

ModuleNotFoundError: No module named 'unidecode'

In [None]:
%config InlineBackend.figure_format = "retina"
plt.rcParams["figure.figsize"] = (12, 4)

In [None]:
comunicados = pd.read_pickle("comunicados-banxico.pkl")
comunicados.head()

In [None]:
# Creamos una nueva carpeta llamada 'comunicados' si esta no existes
wrkdir = "comunicados"
if not os.path.exists(wrkdir):
    os.makedirs(wrkdir)

Descarga cada uno de los comunicados de Banxico y guárdalos como .pdf dentro de la carpeta `comunicados`. Para cada archivo, guarda el comunicado con formato `yyyymmdd.pdf`

In [None]:
def download_statements(statements, workdir):
    for date, vals in tqdm(statements.iterrows()):
        filename = f"{date.strftime('%Y%m%d')}.pdf"
        filename = os.path.join(workdir, filename)
        url = vals["url"]
        r = requests.get(url)
        with open(filename, "wb") as f:
            f.write(r.content)

In [None]:
if len(os.listdir(wrkdir)) == 0:
    download_statements(comunicados, wrkdir)
else:
    print("Statements downloaded...")

In [None]:
files_minutas = os.listdir(wrkdir)
len(files_minutas)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import defaultdict

In [None]:
void_words = stopwords.words("spanish")
tokenizer = RegexpTokenizer("(?:\w+|[0-9]*\.[0-9]%)+")

In [None]:
def lee_minuta(path):
    encoding = "iso-8859-1"
    return process(path, encoding=encoding).decode(encoding)

def bag_words(path, stopwords):
    minuta = lee_minuta(path)
    minuta = unidecode(minuta.lower())
    words = [word for word in tokenizer.tokenize(minuta) if word not in stopwords]
    return words

def parse_minuta(path, stopwords):
    words = bag_words(path, stopwords)
    return pd.Series(words).value_counts()

In [None]:
def categorize_minutas(comunicados, working_dir=".", stop_words=[], elements=None):
    elements = slice(None) if elements is None else elements
    comunicados = comunicados.loc[elements]
    minutas_info = defaultdict(list)
    category_count = defaultdict(int)
    for release_date, info in comunicados.iterrows():
        release_date_file = release_date.strftime("%Y%m%d.pdf")
        path = os.path.join(working_dir, release_date_file)
        words = bag_words(path, stop_words)
        category = info["category"]
        minutas_info[category].extend(words)
        category_count[category] += 1
        
    return minutas_info, category_count

In [2]:
train = comunicados.assign(category=comunicados.category.apply(map_key))
train, test = train_test_split(train, test_size=0.2, random_state=314)

NameError: name 'comunicados' is not defined

In [195]:
train = train.sort_index()

In [196]:
npartitions = 6
delta = (comunicados.index[-1] - comunicados.index[0]) / (npartitions - 1)
delta = delta.days

init_date = comunicados.index[0]
time_slices = [slice(init_date + pd.Timedelta(days=i * delta), init_date + pd.Timedelta(days=(i + 1) * delta)) for i in range(npartitions)]
time_slices

[slice(Timestamp('2008-02-15 00:00:00'), Timestamp('2010-06-22 00:00:00'), None),
 slice(Timestamp('2010-06-22 00:00:00'), Timestamp('2012-10-27 00:00:00'), None),
 slice(Timestamp('2012-10-27 00:00:00'), Timestamp('2015-03-04 00:00:00'), None),
 slice(Timestamp('2015-03-04 00:00:00'), Timestamp('2017-07-09 00:00:00'), None),
 slice(Timestamp('2017-07-09 00:00:00'), Timestamp('2019-11-14 00:00:00'), None),
 slice(Timestamp('2019-11-14 00:00:00'), Timestamp('2022-03-21 00:00:00'), None)]

In [197]:
def part(slices):
    return categorize_minutas(train, wrkdir, void_words, slices)

pool = Pool(processes=npartitions)
res = []
for element in tqdm(pool.imap_unordered(part, time_slices), total=npartitions):
    res.append(element)
pool.close()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [54]:
def map_key(decision):
    if decision == "disminuye" or decision == "incrementa":
        return "cambio"
    else:
        return decision

In [199]:
minutas_info = defaultdict(list)
category_count = defaultdict(int)
for info, cat in res:
    for key, vals in cat.items():
        category_count[key] += vals
    for key, vals in info.items():
        minutas_info[key].extend(vals)

In [200]:
categories = category_count.keys()
categories

dict_keys(['cambio', 'mantiene'])

In [201]:
priors = pd.DataFrame.from_dict(category_count, orient="index").T
priors = priors / priors.values.sum()
priors

Unnamed: 0,cambio,mantiene
0,0.261905,0.738095


In [202]:
log_priors = np.log(priors)

In [203]:
minutas_df = pd.concat([pd.Series(minutas_info[cat]).value_counts() for cat in categories],
                       axis=1, sort=True, keys=categories)

In [204]:
minutas_df["mantiene"].dropna().sort_values(ascending=False)

inflacion       826.0
precios         379.0
monetaria       340.0
ciento          314.0
economia        291.0
                ...  
corroboro         1.0
manifestaria      1.0
tomadas           1.0
manifestarse      1.0
implicado         1.0
Name: mantiene, Length: 2742, dtype: float64

In [264]:
# Removing most repeated word among all clases
topw = minutas_df.assign(total_count=minutas_df.sum(axis=1)).sort_values("total_count", ascending=False)
topw = topw.head(20).index

In [265]:
minutas_df.loc[topw] = np.nan

In [266]:
nb = minutas_df.fillna(0) + 1
nb = nb / nb.sum(axis=0)

In [267]:
log_nb = np.log(nb)

## Testing the Model

In [268]:
log_priors.filter([0], axis=0)

Unnamed: 0,cambio,mantiene
0,-1.339774,-0.303682


In [269]:
def naive_estimate(log_priors, log_probs, bag_words):
    Ck = log_priors + log_probs.filter(bag_words, axis=0).sum(axis=0)
    return Ck

In [270]:
y = test["category"].values

In [271]:
yhat = np.empty_like(y)

for ix, (release_date, _) in enumerate(test.iterrows()):
    release_date_file = release_date.strftime("%Y%m%d.pdf")
    path = os.path.join(wrkdir, release_date_file)
    print(path, end="\r")

    trgt_minuta = parse_minuta(path, void_words)
    Ck = naive_estimate(log_priors.loc[0], log_nb, trgt_minuta.index)
    yhat[ix] = Ck.idxmax()

comunicados/20120120.pdf

In [272]:
from sklearn.metrics import recall_score, precision_score

In [273]:
confusion_matrix(y, yhat)

array([[7, 4],
       [1, 9]])

In [274]:
precision_score(y, yhat, pos_label="cambio")

0.875

In [275]:
recall_score(y, yhat, pos_label="cambio")

0.6363636363636364