In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.special import expit, logit
from scipy.stats import bernoulli, norm, uniform

#### Example 1: 2D

$X \sim \text{Uniform}(0,1)$

$Y = X$

In [3]:
# Generate Example 1 data: X ~ Uniform(0, 1) and Y = X (fully deterministic in X).
#
# Seed NumPy's global RNG first. scipy's `.rvs` (with the default
# random_state) and the `np.random.*` calls in the following cells all draw
# from that global stream, so a fresh Restart & Run All reproduces the same
# samples and figures.
np.random.seed(0)
n = 10000
X = uniform.rvs(size=n)
Y = X  # Y is the very same array as X, not a copy
data = {"X": X, "Y": Y}
df = pd.DataFrame(data)

In [3]:
# Scatter the observed (X, Y) pairs on a square 600x600 canvas.
fig = px.scatter(
    data_frame=df,
    x="X",
    y="Y",
    width=600,
    height=600,
)
fig.show()

In [4]:
# Negative samples: keep X as is and redraw every Y independently, with
# replacement, from the observed Y values.
Y_neg = np.random.choice(Y, size=n, replace=True)
data_neg = dict(X=X, Y_neg=Y_neg)
df_neg = pd.DataFrame(data=data_neg)
fig = px.scatter(
    data_frame=df_neg,
    x="X",
    y="Y_neg",
    width=600,
    height=600,
)
fig.show()

In [4]:
# Negative samples drawn without replacement: Y_neg is a random shuffle of Y,
# so every observed Y value appears exactly once, in random order against X.
Y_neg = np.random.permutation(Y)
data_neg = dict(X=X, Y_neg=Y_neg)
df_neg = pd.DataFrame(data=data_neg)
fig = px.scatter(
    data_frame=df_neg,
    x="X",
    y="Y_neg",
    width=600,
    height=600,
)
fig.show()

#### Example 2: 2D

$X \sim \text{Uniform}(0,1)$

$Y = \sigma(\sigma^{-1}(X) + 0.1 \cdot \mathcal{N}(0, 1))$, where $\sigma$ is the sigmoid function.

In this case, $Y$ is not strictly a deterministic function of $X$, because we add a sample from a Normal random variable when calculating $Y$.

In [39]:
# Generate Example 2 data: X ~ Uniform(0, 1) and Y is X perturbed by
# Gaussian noise (standard deviation 0.1) in logit space, then mapped back
# through the sigmoid, so Y stays inside [0, 1].
#
# Seed NumPy's global RNG first. scipy's `.rvs` (with the default
# random_state) and the `np.random.*` calls in the following cells all draw
# from that global stream, so a fresh Restart & Run All reproduces the same
# samples and figures.
np.random.seed(0)
n = 10000
X = uniform.rvs(size=n)
weighted_normals = norm.rvs(size=n) * 0.1  # N(0, 1) draws scaled by 0.1
Y = expit(logit(X) + weighted_normals)  # expit is the sigmoid, logit its inverse
data = {"X": X, "Y": Y}
df = pd.DataFrame(data)

In [40]:
# Scatter the observed (X, Y) pairs on a square 600x600 canvas.
fig = px.scatter(
    data_frame=df,
    x="X",
    y="Y",
    width=600,
    height=600,
)
fig.show()

In [41]:
# Negative samples: keep X as is and redraw every Y independently, with
# replacement, from the observed Y values.
Y_neg = np.random.choice(Y, size=n, replace=True)
data_neg = dict(X=X, Y_neg=Y_neg)
df_neg = pd.DataFrame(data=data_neg)
fig = px.scatter(
    data_frame=df_neg,
    x="X",
    y="Y_neg",
    width=600,
    height=600,
)
fig.show()

#### Example 3: 3D

$X \sim \text{Uniform}(0,1)$

$Y = X$

$Z = X$

In [6]:
# Generate Example 3 data: X ~ Uniform(0, 1) with Y = X and Z = X, so all
# three coordinates are identical.
#
# Seed NumPy's global RNG first. scipy's `.rvs` (with the default
# random_state) and the `np.random.*` calls in the following cells all draw
# from that global stream, so a fresh Restart & Run All reproduces the same
# samples and figures.
np.random.seed(0)
n = 10000
X = uniform.rvs(size=n)
Y = X  # same array as X, not a copy
Z = X  # same array as X, not a copy
data = {"X": X, "Y": Y, "Z": Z}
df = pd.DataFrame(data)

In [7]:
# 3D scatter of the observed (X, Y, Z) triples on a 600x600 canvas.
fig = px.scatter_3d(
    data_frame=df,
    x="X",
    y="Y",
    z="Z",
    width=600,
    height=600,
)
fig.show()

In [8]:
# Negative samples in 3D: keep X as is and redraw Y and Z independently of
# each other, each with replacement from its own observed values.
Y_neg = np.random.choice(Y, size=n, replace=True)
Z_neg = np.random.choice(Z, size=n, replace=True)
data_neg = dict(X=X, Y_neg=Y_neg, Z_neg=Z_neg)
df_neg = pd.DataFrame(data=data_neg)
fig = px.scatter_3d(
    data_frame=df_neg,
    x="X",
    y="Y_neg",
    z="Z_neg",
    width=600,
    height=600,
)
fig.show()

In [11]:
# Z "always missing": X and Y keep their observed values, while every Z is
# replaced by a draw (with replacement) from the observed Z values.
Z_neg = np.random.choice(Z, size=n, replace=True)
data_neg = dict(X=X, Y=Y, Z_neg=Z_neg)
df_neg = pd.DataFrame(data=data_neg)
fig = px.scatter_3d(
    data_frame=df_neg,
    x="X",
    y="Y",
    z="Z_neg",
    width=600,
    height=600,
)
fig.show()

In [20]:
# Z "sometimes missing", with probability 0.1 per point: m is a 0/1
# Bernoulli mask, and wherever m == 1 the observed Z is swapped for the
# resampled value in Z_neg (reused from an earlier cell).
m = bernoulli.rvs(p=0.1, size=n)
Z_missing = np.where(m == 1, Z_neg, Z)
data_missing = dict(X=X, Y=Y, Z_missing=Z_missing)
df_missing = pd.DataFrame(data=data_missing)
fig = px.scatter_3d(
    data_frame=df_missing,
    x="X",
    y="Y",
    z="Z_missing",
    width=600,
    height=600,
)
fig.show()

In [21]:
# Z "sometimes missing", with probability 0.05 per point: m is a 0/1
# Bernoulli mask, and wherever m == 1 the observed Z is swapped for the
# resampled value in Z_neg (reused from an earlier cell).
m = bernoulli.rvs(p=0.05, size=n)
Z_missing = np.where(m == 1, Z_neg, Z)
data_missing = dict(X=X, Y=Y, Z_missing=Z_missing)
df_missing = pd.DataFrame(data=data_missing)
fig = px.scatter_3d(
    data_frame=df_missing,
    x="X",
    y="Y",
    z="Z_missing",
    width=600,
    height=600,
)
fig.show()

In [22]:
# Z "sometimes missing", with probability 0.01 per point: m is a 0/1
# Bernoulli mask, and wherever m == 1 the observed Z is swapped for the
# resampled value in Z_neg (reused from an earlier cell).
m = bernoulli.rvs(p=0.01, size=n)
Z_missing = np.where(m == 1, Z_neg, Z)
data_missing = dict(X=X, Y=Y, Z_missing=Z_missing)
df_missing = pd.DataFrame(data=data_missing)
fig = px.scatter_3d(
    data_frame=df_missing,
    x="X",
    y="Y",
    z="Z_missing",
    width=600,
    height=600,
)
fig.show()

In [23]:
# Y and Z each "sometimes missing", independently, with probability 0.01 per
# point. m masks Z and l masks Y; wherever a mask is 1 the observed value is
# swapped for the resampled one (Z_neg / Y_neg, reused from earlier cells).
m = bernoulli.rvs(p=0.01, size=n)
l = bernoulli.rvs(p=0.01, size=n)
Z_missing = np.where(m == 1, Z_neg, Z)
Y_missing = np.where(l == 1, Y_neg, Y)
data_missing = dict(X=X, Y_missing=Y_missing, Z_missing=Z_missing)
df_missing = pd.DataFrame(data=data_missing)
fig = px.scatter_3d(
    data_frame=df_missing,
    x="X",
    y="Y_missing",
    z="Z_missing",
    width=600,
    height=600,
)
fig.show()