In [1]:
import pandas as pd
from stats import get_entropy, entropy_metric

## **Definições**

**Entropia:**

$$
H(X) = - \sum_{x \in X} p(x) \log p(x)
$$

**Informação mútua:**

$$I(X, T) = H(X) + H(T) - H(X,T)$$

# **Valores Teóricos**

In [2]:
# Text files holding the theoretical (reference) tables for each case.
path_case_1, path_case_2 = "data/valor_teorico.txt", "data/valor_teorico_2T.txt"

# Both files are read as space-separated text without a header row, so the
# resulting frames carry integer row and column labels.
df1_true, df2_true = (
    pd.read_csv(path, sep=" ", header=None)
    for path in (path_case_1, path_case_2)
)

In [3]:
def parse_proba_true(df, index, col):
    """Apply ``entropy_metric`` to a row slice of one column of ``df``.

    Parameters
    ----------
    df : DataFrame to slice.
    index : pair ``(first, stop)`` of row labels; the rows whose labels are
        in ``range(first, stop)`` are kept (``stop`` itself is excluded).
    col : label of the single column to keep.

    Returns
    -------
    The first element of ``.values`` of whatever ``entropy_metric`` returns
    for the selected, float-cast, one-column frame.

    NOTE(review): ``entropy_metric`` comes from the local ``stats`` module;
    presumably it computes the Shannon entropy of a probability column --
    confirm against that module.
    """
    first, stop = index
    selected_rows = df.filter(range(first, stop), axis="index")
    probabilities = selected_rows.filter([col]).astype(float)
    return entropy_metric(probabilities).values[0]

In [4]:
# Case 1: theoretical value for the pair (X, T).
# The row ranges are hard-coded label ranges inside the loaded table;
# presumably rows 1-9 hold the joint table and rows 11-13 / 15-17 hold the
# marginals of X and T -- TODO confirm against the file layout.
joint1_proba = parse_proba_true(df1_true, index=(1, 10), col=2)
marginal1_proba_x = parse_proba_true(df1_true, index=(11, 14), col=1)
marginal1_proba_t = parse_proba_true(df1_true, index=(15, 18), col=1)

# Sum of the marginal terms minus the joint term.
i1_true = (marginal1_proba_x + marginal1_proba_t) - joint1_proba

# Case 2: same computation for the triple (X, T1, T2).
# Presumably rows 1-27 hold the joint table and the three following ranges
# hold the marginals of X, T1 and T2 -- TODO confirm against the file layout.
joint2_proba = parse_proba_true(df2_true, index=(1, 28), col=3)
marginal2_proba_x = parse_proba_true(df2_true, index=(29, 32), col=1)
marginal2_proba_t1 = parse_proba_true(df2_true, index=(33, 36), col=1)
marginal2_proba_t2 = parse_proba_true(df2_true, index=(37, 40), col=1)

i2_true = (
    marginal2_proba_x + marginal2_proba_t1 + marginal2_proba_t2
) - joint2_proba

# **Valores Estimados Histograma** 

In [5]:
# Sample files for each case (these names are re-pointed from the
# theoretical tables to the sample files from here on).
path_case_1, path_case_2 = "data/amostras.txt", "data/amostras_2T.txt"

# Space-separated, no header row: one column per variable, named explicitly.
df1 = pd.read_csv(path_case_1, sep=" ", names=["X", "T"], header=None)
df2 = pd.read_csv(path_case_2, sep=" ", names=["X", "T1", "T2"], header=None)

# Report how many samples (rows) each file contributed.
print(f"Quantidade de amostras em {path_case_1}: {len(df1)}")
print(f"Quantidade de amostras em {path_case_2}: {len(df2)}")

Quantidade de amostras em data/amostras.txt: 1000
Quantidade de amostras em data/amostras_2T.txt: 10000


<font color = "orange">**Supondo que cada desfecho da variável aleatória é um bin.**</font>

In [6]:
def get_mutual_information(df:pd.DataFrame):
    """Estimate the mutual information between the columns of ``df``.

    Every distinct row of ``df`` is treated as one outcome of the joint
    variable: its empirical probability is the group size divided by
    ``len(df)``. The result is the sum of the per-column values returned by
    ``get_entropy`` minus ``entropy_metric`` of that joint distribution,
    i.e. ``H(X) + H(T) - H(X, T)`` for two columns. With more than two
    columns the same "sum of marginals minus joint" expression is used.

    NOTE(review): ``get_entropy`` and ``entropy_metric`` come from the local
    ``stats`` module; presumably they return Shannon entropies -- confirm.

    Parameters
    ----------
    df : one column per random variable, one row per sample. Columns may be
        raw values or categoricals (e.g. the output of ``pd.cut``).

    Returns
    -------
    The estimated mutual information as returned by the subtraction below.
    """
    marginal_entropies = df.apply(get_entropy)

    # observed=True keeps only the value combinations that actually occur.
    # For categorical columns the pandas default has changed across versions;
    # with observed=False every empty bin combination would become a
    # zero-probability row (and newer pandas emits a FutureWarning when the
    # argument is omitted). For non-categorical columns it has no effect.
    joint_distribution = (
        df.groupby(df.columns.tolist(), observed=True)
          .size()
          .div(len(df))
    )
    joint_entropy = entropy_metric(joint_distribution)

    return marginal_entropies.sum() - joint_entropy

# Estimate the mutual information directly from the samples of each case,
# treating every distinct value as its own outcome.
i1_sample = get_mutual_information(df1)
i2_sample = get_mutual_information(df2)

# Show the theoretical value next to the sample estimate for each case.
print(f"I(X,T) real: {i1_true}\nI(X,T) estimada: {i1_sample}\n")
print(f"I(X,T1,T2) real: {i2_true}\nI(X,T1,T2) estimada: {i2_sample}")

I(X,T) real: 0.06788493864012324
I(X,T) estimada: 0.06780761097990462

I(X,T1,T2) real: 0.3463584014408658
I(X,T1,T2) estimada: 0.3461748898695589


<font color = "orange">**Vou discretizar as colunas em bins de mesma largura antes de calcular a informação mútua.**</font>

In [9]:
def create_bins(x, n_bin):
    """Discretize a Series with ``pd.cut`` and print the resulting intervals.

    Parameters
    ----------
    x : Series to discretize; its ``name`` is used in the printed message.
    n_bin : passed to ``pd.cut`` as ``bins`` (an integer asks for that many
        equal-width bins over the range of ``x``).

    Returns
    -------
    The categorical Series produced by ``pd.cut``.

    Side effect: prints one line listing the bin intervals as
    ``(left, right)`` tuples (the message labels them "bin widths").
    """
    binned = pd.cut(x, bins=n_bin)
    intervals = binned.cat.categories.values.to_tuples()
    print(f"Column {x.name} - bin widths: {intervals}")
    return binned

# Reference (theoretical) value for case 1, printed once before the sweep.
print(f"I(X,T) real: {i1_true:.8f}\n")

# Re-estimate I(X,T) after cutting every column into 1, 2, ..., 5 bins.
for bins in range(1, 6):
    # create_bins is applied column by column and prints each column's bins.
    i1 = get_mutual_information(df1.apply(create_bins, args=(bins,)))
    # The ANSI escape codes render the summary line in bold.
    print(f"\033[1mbins:{bins} - I(X,T) estimada: {i1:.8f}\n\033[0m")

I(X,T) real: 0.06788494

Column X - bin widths: [(-2.003, 1.5)]
Column T - bin widths: [(-1.002, 1.0)]
[1mbins:1 - I(X,T) estimada: 0.00000000
[0m
Column X - bin widths: [(-2.003, -0.25) (-0.25, 1.5)]
Column T - bin widths: [(-1.002, 0.0) (0.0, 1.0)]
[1mbins:2 - I(X,T) estimada: 0.00545163
[0m
Column X - bin widths: [(-2.003, -0.833) (-0.833, 0.333) (0.333, 1.5)]
Column T - bin widths: [(-1.002, -0.333) (-0.333, 0.333) (0.333, 1.0)]
[1mbins:3 - I(X,T) estimada: 0.01044592
[0m
Column X - bin widths: [(-2.003, -1.125) (-1.125, -0.25) (-0.25, 0.625) (0.625, 1.5)]
Column T - bin widths: [(-1.002, -0.5) (-0.5, 0.0) (0.0, 0.5) (0.5, 1.0)]
[1mbins:4 - I(X,T) estimada: 0.06780761
[0m
Column X - bin widths: [(-2.003, -1.3) (-1.3, -0.6) (-0.6, 0.1) (0.1, 0.8) (0.8, 1.5)]
Column T - bin widths: [(-1.002, -0.6) (-0.6, -0.2) (-0.2, 0.2) (0.2, 0.6) (0.6, 1.0)]
[1mbins:5 - I(X,T) estimada: 0.06780761
[0m


In [10]:
# Reference (theoretical) value for case 2, printed once before the sweep.
print(f"I(X,T1, T2) real: {i2_true:.8f}\n")

# Re-estimate the case-2 quantity after cutting every column into 1..5 bins.
for bins in range(1, 6):
    # create_bins is applied column by column and prints each column's bins.
    i2 = get_mutual_information(df2.apply(create_bins, args=(bins,)))
    # The ANSI escape codes render the summary line in bold.
    print(f"\033[1mbins:{bins} - I(X,T1,T2) estimada: {i2:.8f}\n\033[0m")

I(X,T1, T2) real: 0.34635840

Column X - bin widths: [(-2.003, 1.5)]
Column T1 - bin widths: [(-1.002, 1.0)]
Column T2 - bin widths: [(-1.002, 1.0)]
[1mbins:1 - I(X,T1,T2) estimada: 0.00000000
[0m
Column X - bin widths: [(-2.003, -0.25) (-0.25, 1.5)]
Column T1 - bin widths: [(-1.002, 0.0) (0.0, 1.0)]
Column T2 - bin widths: [(-1.002, 0.0) (0.0, 1.0)]
[1mbins:2 - I(X,T1,T2) estimada: 0.04498194
[0m
Column X - bin widths: [(-2.003, -0.833) (-0.833, 0.333) (0.333, 1.5)]
Column T1 - bin widths: [(-1.002, -0.333) (-0.333, 0.333) (0.333, 1.0)]
Column T2 - bin widths: [(-1.002, -0.333) (-0.333, 0.333) (0.333, 1.0)]
[1mbins:3 - I(X,T1,T2) estimada: 0.21634533
[0m
Column X - bin widths: [(-2.003, -1.125) (-1.125, -0.25) (-0.25, 0.625) (0.625, 1.5)]
Column T1 - bin widths: [(-1.002, -0.5) (-0.5, 0.0) (0.0, 0.5) (0.5, 1.0)]
Column T2 - bin widths: [(-1.002, -0.5) (-0.5, 0.0) (0.0, 0.5) (0.5, 1.0)]
[1mbins:4 - I(X,T1,T2) estimada: 0.34617489
[0m
Column X - bin widths: [(-2.003, -1.3) (-1.3