## Key Causal Signals Appear in Clusters 2 and 3

each row represents the caused variable and each column represents the causing variable

This function should return a pd.DataFrame of the shape (5, 5), representing the  𝑝
 -value matrix for all pairwise Granger Causality tests.

 

In [3]:
import pycaret
from pycaret.clustering import *
import pandas as pd
import datetime
from pycaret.regression import *
from pycaret.regression import RegressionExperiment

import seaborn as sns

#get most recent clustering element, or specify by name
fn = 'efficiency_metric/2023-09-02 16:24:56.csv'
ec = pd.read_csv(fn)
ec.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,group,time,s_MP,change,type,length,sum_change,area,...,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,p_totalBidVol,p_totalAskVol,length.1,sum_change.1,area.1,Cluster,efficiency
0,0,0,1,1660222000000.0,30.0,0.505364,surge,1,0.505364,0.505364,...,29.98,-0.000618,-1.7e-05,-0.00021,-0.002384,6.0,-0.005009518,-0.030057,Cluster 2,3.10%
1,1,1,3,1660222000000.0,29.86,0.00067,surge,1,0.00067,0.00067,...,29.89,-0.002358,-2e-06,-0.000818,-0.000333,1.0,0.000134564,0.000135,Cluster 2,3.10%
2,2,2,5,1660222000000.0,29.88,0.001273,surge,2,0.001808,0.003615,...,29.94,0.00475,-3.6e-05,0.001573,-0.004835,3.0,-0.003410602,-0.010232,Cluster 2,3.10%
3,3,3,7,1660222000000.0,29.8,0.000873,surge,2,0.002114,0.004229,...,29.87,-0.000987,2e-06,-0.000398,6.2e-05,2.0,8.855895e-07,2e-06,Cluster 2,3.10%
4,4,4,9,1660223000000.0,29.9,0.001305,surge,1,0.001305,0.001305,...,29.95,0.001531,1e-05,0.000334,0.001094,7.0,-0.01377036,-0.096393,Cluster 2,3.10%


In [4]:
ec.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'group', 'time', 's_MP', 'change', 'type',
       'length', 'sum_change', 'area', 'surge_area', 'group.1', 'time.1',
       'change.1', 'type.1', 'p_MP', 'precursor_buy_cap_pct_change',
       'precursor_ask_cap_pct_change', 'p_totalBidVol', 'p_totalAskVol',
       'length.1', 'sum_change.1', 'area.1', 'Cluster', 'efficiency'],
      dtype='object')

In [5]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.var_model import VARResults, VARResultsWrapper

def test_granger(df, p):
    """
    Fits a VAR(p) model on the input df and performs pairwise Granger Causality tests
    """
     # Fit VAR model on first-order differences
    model = VAR(df.diff().dropna())
    results = model.fit(p)
    
    # Initialize p-value matrix
    p_matrix = pd.DataFrame(index=df.columns, columns=df.columns)
    
    # Perform pairwise Granger Causality tests
    for caused in df.columns:
        for causing in df.columns:
            if caused != causing:
                test_result = results.test_causality(caused, causing)
                p_value = test_result.pvalue
                p_matrix.loc[caused, causing] = p_value
    # Ensure all columns have float dtype
    p_matrix = p_matrix.astype(float)
    return p_matrix

# Load data
p=7
setx=['p_MP', 'p_change', 'p_buyCap', 'p_askCap', 'p_totalBidVol','p_totalAskVol',]

## Cluster 0 GC

In [6]:
sendit0 = ec[ec['Cluster']=='Cluster 0'][setx]
p_matrix0 = test_granger(sendit0, p)
# p_matrix0
caul_mtrx = p_matrix0.rename(index={item: f"{item} caused by" for item in p_matrix0.index})
caul_mtrx.where(caul_mtrx.isna(), caul_mtrx <= 0.01)

KeyError: "['p_change', 'p_buyCap', 'p_askCap'] not in index"

## Cluster 1

In [None]:
sendit1 = ec[ec['Cluster']=='Cluster 1'][setx]
p_matrix1 = test_granger(sendit1, p)
# p_matrix0
caul_mtrx1 = p_matrix1.rename(index={item: f"{item} caused by" for item in p_matrix1.index})
caul_mtrx1.where(caul_mtrx1.isna(), caul_mtrx1 <= 0.01)

## Cluster 2

In [None]:
sendit2 = ec[ec['Cluster']=='Cluster 2'][setx]
p_matrix2 = test_granger(sendit2, p)
# p_matrix0
caul_mtrx2 = p_matrix2.rename(index={item: f"{item} caused by" for item in p_matrix2.index})
caul_mtrx2.where(caul_mtrx2.isna(), caul_mtrx2 <= 0.01)

In [None]:
sns.heatmap(p_matrix2.corr())

## Cluster 3: largest space for profitable trades 
(length = duration, height = positive variance)

In [None]:
sendit3 = ec[ec['Cluster']=='Cluster 3'][setx]
p_matrix3 = test_granger(sendit3, p)
# p_matrix0
caul_mtrx3 = p_matrix3.rename(index={item: f"{item} caused by" for item in p_matrix3.index})
caul_mtrx3.where(caul_mtrx3.isna(), caul_mtrx3 <= 0.01)

## Cluster 3 regression study
highest performing cluster for length x height

In [None]:
## correlation matrix
p_matrix3.corr()

**positively correlated**: p_change / p_MP

**negatively**: p_askCap / p_totalAskVol
            p_askCap / p_totalAskVol

In [None]:
sns.heatmap(p_matrix3.corr())

## multi-model regression on surge area

[from](https://github.com/pycaret/pycaret/blob/master/tutorials/Tutorial%20-%20Regression.ipynb)

In [None]:
s = setup(ec, target = 's_MP', session_id = 42)

exp = RegressionExperiment()

exp.setup(ec, target = 's_MP', session_id = 42)



In [None]:
best =compare_models()

In [None]:
# plot residuals
plot_model(best, plot = 'residuals')

In [None]:
# plot error
plot_model(best, plot = 'error')

In [None]:
# plot feature importance
plot_model(best, plot = 'feature')

In [None]:
evaluate_model(best)