In [10]:
from matplotlib import pyplot as plt
import time

import numpy as np
import pandas as pd
from scipy.stats import norm

from qolmat.analysis.holes_characterization import PKLMTest
from qolmat.benchmark.missing_patterns import UniformHoleGenerator

# Set the base matplotlib font size to 12 for this notebook session.
plt.rcParams["font.size"] = 12

In [2]:
# Synthetic data: a (5000, 4) standard-normal matrix in which 20% of the
# entries, drawn without replacement over the flattened array, are set to NaN.
rng = np.random.default_rng(42)
n_rows, n_cols = 5000, 4
matrix = rng.normal(size=(n_rows, n_cols))

num_nan = int(matrix.size * 0.20)
nan_indices = rng.choice(matrix.size, size=num_nan, replace=False)
# Flat (row-major) indices are written in place.
np.put(matrix, nan_indices, np.nan)

### First test with 'exact_p_value' = True

In [3]:
# Run the PKLM test on `matrix` with exact_p_value=True, then print the
# returned value followed by the elapsed wall-clock time.
pklm_test = PKLMTest(
    random_state=42,
    exact_p_value=True,
    nb_trees_per_proj=200,
    nb_permutation=30,
    nb_projections=100,
)

start_time = time.time()
p_v = pklm_test.test(matrix)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.7096774193548387
--- 11.930958271026611 seconds ---


### Second test with 'exact_p_value' = False

In [4]:
# Same configuration as the previous run except exact_p_value=False; prints
# the returned value followed by the elapsed wall-clock time.
pklm_test = PKLMTest(
    random_state=42,
    exact_p_value=False,
    nb_trees_per_proj=200,
    nb_permutation=30,
    nb_projections=100,
)

start_time = time.time()
p_v = pklm_test.test(matrix)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.7096774193548387
--- 10.284496068954468 seconds ---


__Notes__ :

- First, it is surprising to get exactly the same p-value with `exact_p_value=True` and with `exact_p_value=False`.
- Second, it is also surprising that the second execution is only slightly faster (about 1.6 seconds).

**TODO**: this needs to be reviewed and fixed.

### Test with pandas dataframe and mixed datatypes

In [5]:
# Build a small toy DataFrame mixing float, integer, boolean and string
# (categorical) columns. numpy and pandas are already imported in the first
# cell of the notebook, so they are not re-imported here.

# Number of rows
n_rows = 100

# Dedicated seeded generator: the legacy global np.random.* functions were
# unseeded, so the frame (and every result derived from it) changed on each
# "Restart Kernel -> Run All". A separate name avoids clobbering `rng`.
rng_mixed = np.random.default_rng(42)

col1 = rng_mixed.random(n_rows) * 100      # floats in [0, 100)
col2 = rng_mixed.integers(1, 100, n_rows)  # integers in [1, 99] (upper bound excluded)
col3 = rng_mixed.choice([True, False], n_rows)
modalities = ['A', 'B', 'C', 'D']
col4 = rng_mixed.choice(modalities, n_rows)

df = pd.DataFrame({
    'Numeric1': col1,
    'Numeric2': col2,
    'Boolean': col3,
    'Category': col4
})

df.head()

Unnamed: 0,Numeric1,Numeric2,Boolean,Category
0,24.845528,92,False,B
1,45.993513,69,False,D
2,86.659194,2,False,B
3,41.945953,34,False,C
4,88.457495,71,True,C


__Holes creation__ : according to the Qolmat tool

In [8]:
# Ask Qolmat's UniformHoleGenerator for a boolean mask over the four columns
# (ratio_masked=0.2), then replace every masked cell of `df` with NaN.
hole_gen = UniformHoleGenerator(
    subset=['Numeric1', 'Numeric2', 'Boolean', 'Category'],
    ratio_masked=0.2,
    n_splits=1,
    random_state=42,
)
df_mask = hole_gen.generate_mask(df)
# DataFrame.mask(cond, other) is the inverse of DataFrame.where(~cond, other).
df_nan = df.mask(df_mask, np.nan)
df_nan.head()

Unnamed: 0,Numeric1,Numeric2,Boolean,Category
0,,92.0,False,B
1,45.993513,69.0,False,D
2,86.659194,2.0,False,B
3,41.945953,34.0,False,C
4,,71.0,True,


In [9]:
# Run the PKLM test on the mixed-dtype frame with holes (`df_nan`) and print
# the returned value followed by the elapsed wall-clock time.
pklm_test = PKLMTest(
    random_state=42,
    nb_trees_per_proj=200,
    nb_permutation=30,
    nb_projections=100,
)

start_time = time.time()
p_v = pklm_test.test(df_nan)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.6129032258064516
--- 1.973733901977539 seconds ---


### Back to the previous examples

In [11]:
# 200 draws from a bivariate normal with zero mean and identity covariance,
# wrapped in a two-column DataFrame.
rng = np.random.RandomState(42)
data = rng.multivariate_normal(mean=np.zeros(2), cov=np.eye(2), size=200)
df = pd.DataFrame(data, columns=["Column 1", "Column 2"])

# 97.5% quantile of the standard normal distribution.
q975 = norm.ppf(0.975)

In [21]:
# PKLMTest instance assigned here, before the Case 1-3 cells that call
# `pklm_test.test`; this configuration uses 100 permutations.
pklm_test = PKLMTest(
    random_state=42,
    nb_trees_per_proj=200,
    nb_permutation=100,
    nb_projections=100,
)

### Case 1: MCAR holes (True negative)

In [22]:
# Generate a mask restricted to "Column 2" (ratio_masked=0.2, seeded through
# `rng`), set the masked cells to NaN, then run and time the test.
hole_gen = UniformHoleGenerator(
    subset=["Column 2"],
    ratio_masked=0.2,
    n_splits=1,
    random_state=rng,
)
df_mask = hole_gen.generate_mask(df)
# DataFrame.mask(cond, other) is the inverse of DataFrame.where(~cond, other).
df_nan = df.mask(df_mask, np.nan)

start_time = time.time()
p_v = pklm_test.test(df_nan)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.9207920792079208
--- 2.251690149307251 seconds ---


### Case 2: MAR holes with mean bias (True positive)

In [23]:
# "Column 2" is hidden on the rows where "Column 1" exceeds q975;
# "Column 1" itself is never masked.
df_mask = pd.DataFrame(
    {
        "Column 1": False,
        "Column 2": df["Column 1"].gt(q975),
    },
    index=df.index,
)

# DataFrame.mask(cond, other) is the inverse of DataFrame.where(~cond, other).
df_nan = df.mask(df_mask, np.nan)

start_time = time.time()
p_v = pklm_test.test(df_nan)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.009900990099009901
--- 2.0787599086761475 seconds ---


### Case 3: MAR holes without any mean bias (False negative)

In [24]:
# "Column 2" is hidden on the rows where |"Column 1"| exceeds q975, i.e. in
# both tails symmetrically; "Column 1" itself is never masked.
df_mask = pd.DataFrame(
    {
        "Column 1": False,
        "Column 2": df["Column 1"].abs().gt(q975),
    },
    index=df.index,
)

# DataFrame.mask(cond, other) is the inverse of DataFrame.where(~cond, other).
df_nan = df.mask(df_mask, np.nan)

start_time = time.time()
p_v = pklm_test.test(df_nan)
print(p_v)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0.009900990099009901
--- 2.1425230503082275 seconds ---
