# Time Series Analysis 1

## References:

- **TSA**: [Time Series Analysis and Its Applications](https://www.springer.com/gp/book/9783319524511).
- **AFTS**: [Analysis of Financial Time Series](https://www.amazon.com/Analysis-Financial-Time-Ruey-Tsay/dp/0470414359)

## Notation

- $\{X_t\}_{t\in T}$ denotes a collection of random variables with indexes in some set $T$ (a *time series* - ts).
- $\{x_t\}_{t\in T}$ denotes a collection of constants with indexes in some set $T$ (a realization of a ts).

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.display import display, HTML, Markdown
from datetime import date
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from glob import glob
from statsmodels.tsa.stattools import acovf
from statsmodels.tsa.stattools import acf, ccf

# Locate the input CSVs by filename pattern and keep the first match of each.
# An IndexError on these lines means no file matched the pattern.
path_ibov =  glob('/kaggle/input/ibovespa-stocks/b3*.csv')[0]
path_usd =  glob('/kaggle/input/ibovespa-stocks/usd*.csv')[0]

## Loading Ibov data

In [None]:
# Read the stock table and index it by (ticker, datetime), sorted.
df = pd.read_csv(path_ibov)
# Replace the column instead of writing through `.loc[:, ...]`: on recent
# pandas the `.loc` form sets values in place and keeps the column's object
# dtype, so the index would not be a real datetime64 index.
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.set_index(["ticker", "datetime"]).sort_index()

### Plot Functions

In [None]:
def plot_acf(x, lag_range, reverse=True, figsize=(12, 5),
             title_fontsize=15, xlabel_fontsize=16, ylabel_fontsize=16):
    """
    plot the sample autocorrelation of series x for lags 0, ..., lag_range
    together with the +/- 2/sqrt(n) reference lines

    :param x: series whose autocorrelation is computed
    :type x: pd.Series
    :param lag_range: largest lag to plot
    :type lag_range: int
    :param reverse: not used by this function; kept so that existing
                    calls passing it keep working
    :type reverse: boolean
    :param figsize: figure size
    :type figsize: tuple
    :param title_fontsize: title font size
    :type title_fontsize: int
    :param xlabel_fontsize: x axis label size
    :type xlabel_fontsize: int
    :param ylabel_fontsize: y axis label size
    :type ylabel_fontsize: int
    """

    title = "{}".format(x.name)
    ac = acf(x, fft=False, nlags=lag_range)
    # acf also returns lag 0, so it yields one more value than `lag_range`;
    # build the lag axis from the result so both always have equal length
    lags = np.arange(len(ac))
    sigma = 1 / np.sqrt(x.shape[0])
    band = np.full(len(lags), 2 * sigma)

    fig, ax = plt.subplots(figsize=figsize)
    ax.vlines(lags, 0, ac)
    ax.plot(lags, np.zeros(len(lags)), c="black", linewidth=1.0)
    ax.plot(lags, band, '-.', c="blue", linewidth=0.6)
    ax.plot(lags, -band, '-.', c="blue", linewidth=0.6)
    ax.set_xlabel('Lag', fontsize=xlabel_fontsize)
    ax.set_ylabel('autocorrelation', fontsize=ylabel_fontsize)
    fig.suptitle(title, fontsize=title_fontsize, fontweight='bold', y=0.93)
    


def plot_ccf(x, y, lag_range,
             figsize=(12, 5),
             title_fontsize=15, xlabel_fontsize=16, ylabel_fontsize=16):
    """
    plot the sample cross-correlation between series x and y for lags
    -lag_range, ..., lag_range together with the +/- 2/sqrt(n) reference
    lines

    Negative lags (left half) come from ccf(y, x) and non-negative lags
    (right half, including lag 0) come from ccf(x, y).

    :param x: first series
    :type x: pd.Series
    :param y: second series
    :type y: pd.Series
    :param lag_range: largest absolute lag to plot
    :type lag_range: int
    :param figsize: figure size
    :type figsize: tuple
    :param title_fontsize: title font size
    :type title_fontsize: int
    :param xlabel_fontsize: x axis label size
    :type xlabel_fontsize: int
    :param ylabel_fontsize: y axis label size
    :type ylabel_fontsize: int
    """

    title = "{} & {}".format(x.name, y.name)

    # lags -lag_range, ..., -1: drop lag 0 (it belongs to the right half)
    # and reverse so the most negative lag comes first
    left = ccf(y, x)[1:lag_range + 1][::-1]
    # lags 0, ..., lag_range: lag_range + 1 values, lag 0 included
    right = ccf(x, y)[:lag_range + 1]
    cc = np.concatenate([left, right])
    # derive the lag axis from the pieces so it always matches len(cc)
    lags = np.arange(-len(left), len(right))

    sigma = 1 / np.sqrt(x.shape[0])
    band = np.full(len(lags), 2 * sigma)

    fig, ax = plt.subplots(figsize=figsize)
    ax.vlines(lags, 0, cc)
    ax.plot(lags, np.zeros(len(lags)), c="black", linewidth=1.0)
    ax.plot(lags, band, '-.', c="blue", linewidth=0.6)
    ax.plot(lags, -band, '-.', c="blue", linewidth=0.6)
    ax.set_xlabel('Lag', fontsize=xlabel_fontsize)
    ax.set_ylabel('cross-correlation', fontsize=ylabel_fontsize)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    fig.suptitle(title, fontsize=title_fontsize, fontweight='bold', y=0.93)
    
def get_lead_matrix(lead_series, fixed_series, lag_range):
    """
    get the ccf vector for lags 0, ..., lag_range of each ts in
    'lead_series' in relation with 'fixed_series'. All the ccf results
    are arranged in matrix format (one row per ts, one column per lag).

    :param lead_series: list of named series to be lagged.
                        All series are indexed by time.
    :type lead_series: [pd.Series]
    :param fixed_series: named series indexed by time.
    :type fixed_series: pd.Series
    :param lag_range: largest lag to compute
    :type lag_range: int
    :return: matrix of ccf information
    :rtype: pd.DataFrame
    """

    ccf_rows = []
    for ts in lead_series:
        # align each observation of ts with the fixed series on the time index
        merged = pd.merge_asof(ts, fixed_series,
                               left_index=True, right_index=True)
        # rows without a matching fixed observation are NaN; a single NaN
        # would turn the whole ccf row into NaN, so drop them first
        merged = merged.dropna()

        lagged_ts = merged[ts.name]
        fixed_ts = merged[fixed_series.name]
        ccf_rows.append(ccf(fixed_ts, lagged_ts)[:lag_range + 1])

    ccf_matrix = pd.DataFrame(
        np.array(ccf_rows),
        columns=["lag_{}".format(i) for i in range(lag_range + 1)],
        index=[ts.name for ts in lead_series])
    return ccf_matrix

## Time Series Examples
### 1) Returns from stock price

In [None]:
ticker_name = "BOVA11"
# ticker_name = "ITUB4"
# ticker_name = "VVAR3"

# closing prices of the selected ticker, labelled with its symbol
ticker_ts = df.xs(ticker_name).close
ticker_ts.name = ticker_name

# the three return definitions derived from the same price series
simple_net_return = ticker_ts.pct_change().dropna()
simple_gross_return = simple_net_return + 1
log_return = np.log(simple_gross_return)

fig, ax = plt.subplots(1, 3, figsize=(26, 6))
panels = [
    ("Simple Net Return\n", simple_net_return),
    ("Simple Gross Return\n", simple_gross_return),
    ("Log Return\n", log_return),
]
# one panel per return definition, left to right
for panel_ax, (panel_title, panel_series) in zip(ax, panels):
    panel_series.plot(ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=18)
plt.suptitle(ticker_name, fontsize=20, y=1.1);

### 2)  White Noise

In [None]:
# 500 draws from N(0, 1), shown raw and smoothed by a 30-step moving average
wnoise = pd.Series(np.random.normal(0, 1, 500), name="white noise")
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
raw_ax, smooth_ax = ax
wnoise.plot(ax=raw_ax)
raw_ax.set_title("White Noise\n", fontsize=18)
wnoise.rolling(30).mean().plot(ax=smooth_ax)
smooth_ax.set_title("White Noise (Moving Average)\n", fontsize=18);


### 3)  Autoregressions

**Example**

\begin{equation}
X_t = X_{t-1}  - 0.9X_{t-2} + W_t \,,
\end{equation}

where $W_t \thicksim N(0,1)$.


In [None]:
steps = 500
w = np.random.normal(0, 1, steps)
init1, init2 = np.random.normal(0, 1, 2)

# X_t = X_{t-1} - 0.9 X_{t-2} + W_t, started from two random initial values;
# the first two noise draws are not used by the recursion
xs = [init1, init2]
for shock in w[2:]:
    xs.append(xs[-1] - 0.9 * xs[-2] + shock)

auto_ts = pd.Series(xs)
w = pd.Series(w)
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
ar_ax, noise_ax = ax
auto_ts.plot(ax=ar_ax)
w.plot(ax=noise_ax)
ar_ax.set_title("Autoregressive series\n", fontsize=18)
noise_ax.set_title("White Noise\n", fontsize=18);

### 4) Random walk

**Example**

\begin{equation}
X_t = \delta + X_{t-1} + W_t \,,
\end{equation}

where $\delta \in \mathbb{R}$ is a constant called the *drift* and $W_t \thicksim N(0,1)$. When $\delta=0$, $\{X_t\}_{t\in T}$ is called a *random walk*.


In [None]:
steps = 500
w = np.random.normal(0, 1, steps)
drift1 = 0.2
drift2 = 0.4
drift3 = - 0.1
init = 0


def _random_walk(drift, noise, start):
    """Return X_t = drift + X_{t-1} + W_t as a Series with X_0 = start.

    noise[0] is not used: the first step consumes noise[1].
    The Series is named after the drift so it shows up in the legend.
    """
    path = [start]
    for shock in noise[1:]:
        path.append(drift + path[-1] + shock)
    return pd.Series(path, name=r"$\delta = {}$".format(drift))


# the three walks share the same noise and differ only in the drift
rw1 = _random_walk(drift1, w, init)
rw2 = _random_walk(drift2, w, init)
rw3 = _random_walk(drift3, w, init)

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for walk in (rw1, rw2, rw3):
    walk.plot(ax=ax)
ax.set_title("Random walk\n", fontsize=18)
ax.legend(loc="best");

### 5) Linear Process

A **linear process** $X_t$ is defined to be a linear combination of white noise variates $W_t$, and is given by

\begin{equation}
X_t = \mu + \sum_{j = -\infty}^{\infty}\beta_j W_{t-j} \,, \quad \sum_{-\infty}^{\infty}|\beta_j| < \infty.
\end{equation}

**Example**

In [None]:
# two independent white-noise samples
wnoise1 = pd.Series(np.random.normal(0, 1, 500))
wnoise2 = pd.Series(np.random.normal(0, 1, 500))
# each process is the constant 100 plus a 30-term rolling sum of the noise
lp1 = 100 + wnoise1.rolling(30).sum()
lp2 = 100 + wnoise2.rolling(30).sum()
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for process in (lp1, lp2):
    process.plot(ax=ax)
ax.set_title("Linear Process\n", fontsize=18);


## Measures of Dependence

### Autocovariance

Let $\{X_t\}_{t\in T}$ be a ts such that each $X_t$ has a finite variance. The **mean value function** is the function $\mu_X: T \rightarrow \mathbb{R}$ such that

\begin{equation}
\mu_X(t) = \mathbb{E}[X_t].
\end{equation}

The **autocovariance function** is the function $\gamma_X: T\times T \rightarrow \mathbb{R}$ such that

\begin{equation}
\gamma_X(s,t) =  \mathbb{E}[(X_s - \mu_X(s))(X_t - \mu_X(t))].
\end{equation}

Clearly, for every $t\in T$:

\begin{equation}
\gamma_X(t,t) =  \mathbb{E}[(X_t - \mu_X(t))(X_t - \mu_X(t))] = \mathbb{E}[(X_t - \mu_X(t))^2] = \mathbb{V}[X_t].
\end{equation}

### Autocorrelation

The **autocorrelation function** is defined as

\begin{equation}
\rho_X(s,t) = \frac{\gamma_X(s,t)}{\sqrt{\gamma_X(s,s) \gamma_X(t,t)}}.
\end{equation}

Using the [Cauchy-Schwarz inequality](https://www.probabilitycourse.com/chapter6/6_2_4_cauchy_schwarz.php) we can prove that for every $s, t \in T$, $\rho_X(s,t) \in [-1, 1]$. Moreover, $|\rho_X(s,t)| =1$ if and only if $X_t = \beta_0 + \beta_1 X_s$ for $\beta_0, \beta_1 \in \mathbb{R}$. **Hence, we have a rough measure of the ability to forecast the series at time $t$ from the value at time $s$.**

When it is clear from the context which ts we are referring to, we use $\mu_t, \gamma(s,t), \rho(s,t)$ to denote $\mu_X(t), \gamma_X(s,t), \rho_X(s,t)$, respectively.

### Cross-Covariance and Cross-Correlation 

The **cross-covariance function** between two ts $\{X_t\}_{t\in T}$  and $\{Y_t\}_{t\in T}$ (both with finite variance) is defined as:

\begin{equation}
\gamma_{XY}(s,t) = \mathbb{E}[(X_s - \mu_X(s))(Y_t - \mu_Y(t))].
\end{equation}

The **cross-correlation function** is given by 


\begin{equation}
\rho_{XY}(s,t) = \frac{\gamma_{XY}(s,t)}{\sqrt{\gamma_X(s,s) \gamma_Y(t,t)}}.
\end{equation}


## Stationary Time Series

Let $\{X_t\}_{t\in T}$ be a ts such that each $X_t$ has a finite variance. For $t_1, \dots, t_n \in T$ let $F_{t_1, \dots, t_n}$ be the joint distribution function, i.e., for each sample point $(x_1, \dots, x_n)\in \mathbb{R}^n$

\begin{equation}
F_{t_1, \dots, t_n}(x_1, \dots, x_n) = \mathbb{P}(X_{t_1}\leq x_1, \dots, X_{t_n}\leq x_n).
\end{equation}

We say that $\{X_t\}_{t\in T}$ is a **strictly stationary** time series if for every set of indices  $\{t_1, \dots, t_n\} \subseteq T$  and every time shift $h \in \mathbb{N}$:

\begin{equation}
F_{t_1, \dots, t_n} = F_{t_1+h, \dots, t_n+h}.
\end{equation}

When $\{X_t\}_{t\in T}$ is a strictly stationary ts, we can deduce two facts:

- **i)** For every $t$, $\mu_t = \mu$, where $\mu \in \mathbb{R}$.

- **ii)** For every $t,s \in T$ and shift $h$ we have:

\begin{equation}
\gamma(s,t) =  \mathbb{E}_{F_{t,s}}[(X_s - \mu)(X_t - \mu)] = \mathbb{E}_{F_{t+h,s+h}}[(X_{s+h} - \mu)(X_{t+h} - \mu)] = \gamma(s+h,t+h).
\end{equation}

We say that $\{X_t\}_{t\in T}$ is a **weakly stationary** ts if it satisfies the properties **i)** and **ii)**. We say that a series is **stationary** if it is **weakly stationary**.


The **autocovariance function of a stationary time series** can be defined as a function that takes only the argument $h$:

\begin{equation}
\gamma(h) = \gamma(h,0) = \gamma(t+h,t),
\end{equation}

where $t$ is an arbitrary index in $T$. One of the properties of this autocovariance function is that it is symmetric around the origin:

\begin{align}
\gamma(h) &= \gamma(h,0)\\
 &= \gamma(t+h,t)\\
 &= \gamma(t,t+h)\\
 &= \gamma((t+h)-h,(t+h))\\
 &= \gamma(-h,0)\\
 &= \gamma(-h)
\end{align}


Similarly, the **autocorrelation function (ACF) of a stationary time series** is defined as:

\begin{equation}
\rho(h) = \frac{\gamma(t+h,t)}{\sqrt{\gamma(t+h,t+h) \gamma(t,t)}} = \frac{\gamma(h)}{\gamma(0)}.
\end{equation}

We say that two time series $\{X_t\}_{t\in T}$ and $\{Y_t\}_{t\in T}$ are **jointly stationary** if they are each stationary, and the cross-covariance function

\begin{equation}
\gamma_{XY}(h) =   \mathbb{E}[(X_{t+h} - \mu_X)(Y_{t} - \mu_Y)]
\end{equation}

is a function only of lag $h$.

In this same line, the **cross-correlation function (CCF)** of jointly stationary time series $\{X_t\}_{t\in T}$ and $\{Y_t\}_{t\in T}$ is defined as 

\begin{equation}
\rho_{XY}(h) = \frac{\gamma_{XY}(h)}{\sqrt{\gamma_X(0) \gamma_Y(0)}}.
\end{equation}

### Example of Stationary and non-Stationary Time Series


In [None]:
window = 30

# rolling means of a random walk and of white noise, labelled for the legend
ex1 = rw1.rolling(window).mean().rename("random walk (non-stationary)")
ex2 = wnoise.rolling(window).mean().rename("white noise (stationary)")

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for example in (ex1, ex2):
    example.plot(ax=ax)
ax.set_title("Rolling Mean\n", fontsize=18)
ax.legend(loc="best");

### Stationarity in Finance

> In the finance literature, it is common to assume that an asset return series is weakly stationary. This assumption can be checked empirically provided that a sufficient number of historical returns are available. For example, one can divide the data into subsamples and check the consistency of the results obtained across the subsamples. (AFTS, p.30)

In [None]:
window = 60

ticker_name1 = "ITUB4"
ticker_name2 = "PETR3"
ticker_name3 = "VALE3"


def _prices_and_returns(ticker):
    """Return (close prices named after the ticker, their simple net returns)."""
    prices = df.xs(ticker).close
    prices.name = ticker
    return prices, prices.pct_change().dropna()


ticker_ts1, simple_net_return1 = _prices_and_returns(ticker_name1)
ticker_ts2, simple_net_return2 = _prices_and_returns(ticker_name2)
ticker_ts3, simple_net_return3 = _prices_and_returns(ticker_name3)

# rolling mean of the returns of each ticker on a shared axis
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for net_return in (simple_net_return1, simple_net_return2, simple_net_return3):
    net_return.rolling(window).mean().plot(ax=ax)
ax.set_title("Rolling Mean Ticker Returns \n", fontsize=18)
ax.legend(loc="best");

## Estimation of Correlation

> Although the theoretical autocorrelation and cross-correlation functions are useful for describing the properties of certain hypothesized models, most of the analyses must be performed using sampled data. This limitation means the sampled points $x_1, x_2 , \dots, x_n$ only are available for estimating the mean, autocovariance, and autocorrelation functions. From the point of view of classical statistics, this poses a problem because we will typically not have iid copies of $X_t$ that are available for estimating the covariance and correlation functions. In the usual situation with only one realization, however, the assumption of stationarity becomes critical. Somehow, we must use averages over this single realization to estimate the population means and covariance functions. (TSA, p.26)


When $\{X_t\}_{t\in T}$ is stationary ts, we have:

- The sample mean 

\begin{equation}
\bar{X}_n = \frac{1}{n}\sum_{t=1}^n X_t ,
\end{equation}

is an unbiased estimator of the constant function $\mu_X$.

- The **sample autocovariance function** is a biased estimator of $\gamma(h)$

\begin{equation}
\hat{\gamma}(h) =  \frac{1}{n}\sum_{t=1}^{n-h}(X_{t+h} - \bar{X}_n)(X_t - \bar{X}_n)
\end{equation}

where $h \in \{0, \dots, n-1\}$. It is easy to show that $\hat{\gamma}(h) = \hat{\gamma}(-h)$ (note: for negative values we need to change the order in the summation).

- The **sample autocorrelation function** is an estimator of $\rho(h)$

\begin{equation}
\hat{\rho}(h) =  \frac{\hat{\gamma}(h)}{\hat{\gamma}(0)}
\end{equation}

where $h \in \{0, \dots, n-1\}$. 

### Large-Sample distribution of the ACF

Under some general conditions, $\hat{\rho}(h)$ is a consistent estimate of $\rho(h)$. For example, if
$X_1, \dots, X_n$ is an independent and identically distributed (iid) sequence and $X_t$ has finite variance, then $\hat{\rho}(h)$ is asymptotically normal with mean zero and standard deviation $1/\sqrt{n}$.

> Based on the previous result, we obtain a rough method of assessing whether peaks in $\hat{\rho}(h)$ are significant by determining whether the observed peak is outside the interval $\pm 2/\sqrt{n}$ (or plus/minus two standard errors); for a white noise sequence, approximately $95\%$ of the sample ACFs should be within these limits. (TSA p.29).


In [None]:
# sanity check for the statsmodels functions: compare acovf/acf against the
# sample autocovariance/autocorrelation formulas computed by hand
ts = wnoise
n = ts.shape[0]
test_size = 40
ts_mean = ts.mean()
ts_var = ts.var(ddof=0)
check1_cov = acovf(ts,fft=False)[:test_size]
# ask for the lags explicitly: when nlags is omitted statsmodels picks a
# default that can be smaller than test_size, and indexing below would fail
check1_corr = acf(ts,fft=False,nlags=test_size - 1)
for i in range(test_size):
    # NaNs introduced by shift(i) are skipped by sum(), which leaves the
    # n - i overlapping terms of the sample autocovariance (divided by n)
    raw_cov = ((ts - ts_mean)*(ts.shift(i) - ts_mean)).sum() / n
    raw_corr = raw_cov / ts_var
    stats_cov = check1_cov[i]
    stats_corr = check1_corr[i]

    test_cov = (raw_cov - stats_cov)**2
    test_corr = (raw_corr - stats_corr)**2
    assert test_cov < 1e-4
    assert test_corr < 1e-4

### ACF Plot Examples

In [None]:
# ACF of the simulated white noise, then of the three ticker return series
for acf_input in (wnoise, simple_net_return1,
                  simple_net_return2, simple_net_return3):
    plot_acf(acf_input, lag_range=100)

### Cross-Correlation, Lead and Lag

For two samples $\{X_t\}_{t\leq n}$ and $\{Y_t\}_{t\leq n}$ the **sample cross-covariance function** is given by

\begin{equation}
\hat{\gamma}_{XY}(h) =  \frac{1}{n}\sum_{t=1}^{n-h}(X_{t+h} - \bar{X}_n)(Y_t - \bar{Y}_n)
\end{equation}

where $h \in \{0, \dots, n-1\}$ (note that $\hat{\gamma}_{XY}(h) = \hat{\gamma}_{YX}(-h)$). 

Similarly, the **sample cross-correlation function** is defined as


\begin{equation}
\hat{\rho}_{XY}(h) = \frac{\hat{\gamma}_{XY}(h)}{\sqrt{\hat{\gamma}_{X}(0) \, \hat{\gamma}_{Y}(0)} }
\end{equation}

where $h \in \{0, \dots, n-1\}$. 


We can use the cross-correlation function to understand the relation between series. Given two series $\{X_t\}_{t\in T}$ and $\{Y_t\}_{t\in T}$, $l\geq 0$ and $W_t$ uncorrelated with $X_t$, if the model

\begin{equation}
Y_t = \beta X_{t-l} + W_t,
\end{equation}

holds, we say that $X_t$ **leads** $Y_t$.

Similarly, if the relationship

\begin{equation}
Y_t = \beta X_{t+l} + W_t,
\end{equation}

holds, we say that $X_t$ **lags** $Y_t$.

For two samples $\{X_t\}_{t\leq n}$ and $\{Y_t\}_{t\leq n}$, the cross-correlation function **can help us guess at the linear dependence relationship between $X_t$ and $Y_t$**.

For two independent linear processes $\{X_t\}_{t\leq n}$ and $\{Y_t\}_{t\leq n}$, the large sample distribution of $\hat{\rho}_{XY}(h)$ is normal with mean zero and standard deviation $1/\sqrt{n}$, if at least one of the processes is independent white noise (TSA p.31).

In [None]:
# sanity check for the statsmodels function
# ccf([FIXED SERIES], [LAGGED SERIES])


# y is x delayed by 9 steps; the 9 leading gaps are filled with 1
x = pd.Series(np.random.normal(0,1,100))
y = x.shift(9)
y = y.fillna(1)


x_mean =  x.mean()
y_mean =  y.mean()
std_x = x.std()
std_y = y.std()
stats_corr = ccf(y, x)
my_corr = []

for h in range(90):
    # hand-made estimate: average product of y and x lagged by h, over the
    # rows where both are available, scaled by the two standard deviations
    cov = (x.shift(h) - x_mean)*(y - y_mean)
    cov = cov.dropna().mean()
    corr = cov/(std_x*std_y)
    my_corr.append(corr)
    test = (corr - stats_corr[h])**2
    # pass the message itself: print(...) returns None, which would leave
    # the AssertionError without any message
    assert test < 1e-3, "error: h= {} | my_corr = {:.4f} | stats_corr = {:.4f}".format(h, corr, stats_corr[h])

my_corr = np.array(my_corr)
print(my_corr.max(), stats_corr.max())
print(my_corr.min(), stats_corr.min())
plot_ccf(x, y, 12)

### CCF Plot Examples

### Remember: Pandas Notation

Given a ts $x_t$:

- $x_{t+h} \rightarrow$ `x.shift(-h)`
- $x_{t-h} \rightarrow$ `x.shift(h)`




In [None]:
# b is a delayed by 2 steps (gaps filled with 1)
a = pd.Series(np.random.normal(0, 1, 100), name="a")
b = a.shift(2).fillna(1).rename("a.shift(2)")
plot_ccf(a, b, 8)

# b is a advanced by 4 steps
a = pd.Series(np.random.normal(0, 1, 100), name="a")
b = a.shift(-4).fillna(1).rename("a.shift(-4)")
plot_ccf(a, b, 8)

# b is a delayed by 50 steps, viewed over a wider lag window
a = pd.Series(np.random.normal(0, 1, 100), name="a")
b = a.shift(50).fillna(1).rename("a.shift(50)")
plot_ccf(a, b, 60)

# b is an independent noise series
a = pd.Series(np.random.normal(0, 1, 100), name="a")
w = pd.Series(np.random.normal(0, 0.1, 100), name="noise")
b = w
plot_ccf(a, b, 60)

## Application Example


### Can we use the performance of a ticker on the day $t-h$ to predict the percentage change of the BRL to USD ratio for the day $t$?

With the ccf function, it is possible to guess at the linear dependence of these two instruments.


In [None]:
# Build the BRL/USD percentage-change series by inverting the usd_brl column.
usd_brl = pd.read_csv(path_usd)
# Replace the column instead of writing through `.loc[:, ...]`: on recent
# pandas the `.loc` form sets values in place and keeps the column's object
# dtype, so the index would not be a real datetime64 index for merge_asof.
usd_brl["datetime"] = pd.to_datetime(usd_brl["datetime"])
usd_brl = usd_brl.set_index("datetime")["usd_brl"]
brl_usd = (1/usd_brl)
brl_usd = brl_usd.pct_change().dropna()
brl_usd.name = "brl_usd"


fig, ax = plt.subplots(1,1,figsize=(10,5))
brl_usd.plot(ax=ax);
ax.set_title("BRL/USD pct_change \n", fontsize=18);
ax.legend(loc="best");

ibov = ["ABEV3", "AZUL4", "B3SA3", "BBAS3", "BBDC3", "BBDC4", "BBSE3", "BPAC11", "BRAP4",
        "BRDT3", "BRFS3", "BRKM5", "BRML3", "BTOW3", "CCRO3", "CIEL3", "CMIG4", "COGN3", "CRFB3",
        "CSAN3", "CSNA3", "CVCB3", "CYRE3", "ECOR3", "EGIE3", "ELET3", "ELET6", "EMBR3", "ENBR3",
        "EQTL3", "FLRY3", "GGBR4", "GNDI3", "GOAU4", "GOLL4", "HAPV3", "HGTX3", "HYPE3", "IGTA3",
        "IRBR3", "ITSA4", "ITUB4", "JBSS3", "KLBN11", "LAME4", "LREN3", "MRFG3","MGLU3",
        "MRVE3", "MULT3", "NTCO3", "PCAR4", "PETR3", "PETR4", "QUAL3", "RADL3",
        "RAIL3", "RENT3", "SANB11", "SBSP3", "SMLS3", "SULA11", "SUZB3", "TAEE11",
        "TIMP3", "TOTS3", "UGPA3", "USIM5", "VALE3", "VIVT4", "VVAR3", "WEGE3", "YDUQ3"]

# simple net return of every ticker, kept both as a list (for the matrix)
# and as a dict keyed by ticker (for later lookups)
lead_series = []
lead_series_dict = {}

for ticker_name in ibov:
    ticker_ts = df.xs(ticker_name).close
    ticker_ts.name = ticker_name
    simple_net_return = ticker_ts.pct_change().dropna()
    lead_series.append(simple_net_return)
    lead_series_dict[ticker_name] = simple_net_return

# ccf of each ticker against BRL/USD for lags 0..5, ordered by the lag-1 value
lead_m = get_lead_matrix(lead_series, brl_usd, 5)
lead_m = lead_m.sort_values("lag_1", ascending=False)
fig, ax = plt.subplots(figsize=(8,20))
ax.set_title("Tickers cross-correlation on BRL/USD pct change \n", fontsize=18)
sns.heatmap(lead_m, center=0,cmap='PuOr', linewidths=1, annot=True, fmt=".3f", ax=ax, cbar=False);
plt.xticks(rotation=45);
plt.yticks(rotation=0);

### Check plots

In [None]:
ticker_1 = "ITSA4"
ticker_2 = "BBAS3"
lag = 1

ts1 = lead_series_dict[ticker_1]
ts2 = lead_series_dict[ticker_2]

# For each ticker: align with BRL/USD, draw the ccf on the aligned data,
# then lag the ticker column so the scatter pairs day t-lag with day t.
m1 = pd.merge_asof(ts1, brl_usd, left_index=True, right_index=True)
plot_ccf(m1[ticker_1], m1["brl_usd"], 5)
m1[ticker_1] = m1[ticker_1].shift(lag)

m2 = pd.merge_asof(ts2, brl_usd, left_index=True, right_index=True)
plot_ccf(m2[ticker_2], m2["brl_usd"], 5)
m2[ticker_2] = m2[ticker_2].shift(lag)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for panel_ax, frame, ticker in zip(ax, (m1, m2), (ticker_1, ticker_2)):
    panel_ax.scatter(frame[ticker], frame["brl_usd"])
    panel_ax.set_ylabel("BRL/USD pct_change", fontsize=14)
    panel_ax.set_xlabel("{} simple net return (lag = {})".format(ticker, lag), fontsize=14)
    panel_ax.set_title("corr = {:.3f}\nsample size = {}".format(frame.corr().iloc[0,1], frame.shape[0]), fontsize=18)
plt.subplots_adjust(wspace=0.3)
fig.suptitle("Naive correlation analysis", fontsize=20, y=1.1);

