In [None]:
import numpy as np
import pandas as pd
import torch.random
from matplotlib import pyplot as plt
import sys 
sys.path.append("../..") 
from models.vae import *
from cal_mcc import mean_corr_coef

In [None]:
# Seed both NumPy's and PyTorch's global RNGs so the simulated data is reproducible.
# Every later random call (including display-only cells) draws from these streams,
# so the cells must be executed top to bottom to reproduce the same data.
np.random.seed(1234)
torch.manual_seed(1234)

In [None]:
# Simulation configuration. Everything a reader might tune lives in this cell.
n_labels = 5                      # number of user groups (label_size in gen_confounder)
sample_size = 2000                # number of users (rows of every generated matrix)
z_dim = 2                         # dimension of the latent confounder
x_dim = 300                       # number of items (columns of exposure / rating matrices)
embedding_dim = 16                # dimension of the user / item embeddings
random_item_size = 15             # items drawn per user for the uniformly-random set
sparse_ratio = 0.1                # upper bound on the exposure probability (scales the sigmoid)
rating_noise_ratio = 10.          # std of the Gaussian noise added to the ratings
treatment_noise_ratio = 0.        # std of the Gaussian noise added to the exposure logits
confounding_effect_rating = 2.    # multiplier on the confounder's contribution to ratings

# File-name suffix encoding the settings above. The noise ratio is formatted from
# `rating_noise_ratio` (previously hard-coded as "10.0") so that changing the
# setting can never produce files labelled with a stale value.
name_suffix = "_sr_{}_cr_{}_nr_{}_tr_{}".format(
    sparse_ratio, confounding_effect_rating, rating_noise_ratio, treatment_noise_ratio)

In [None]:
def gen_confounder(sample_size, label_size, confounder_dim):
    """Sample group labels and label-conditional Gaussian confounders.

    Each of the ``label_size`` groups gets its own mean (uniform in [-10, 10))
    and variance (uniform in [1, 5)) per confounder dimension. Users are
    assigned to groups round-robin (0, 1, ..., label_size - 1, 0, 1, ...), and
    each user's confounder is drawn from their group's Gaussian.

    Args:
        sample_size: number of users to generate.
        label_size: number of groups; must be positive.
        confounder_dim: dimension of the confounder vector.

    Returns:
        Tuple ``(w_true, z_true, mu_true, var_true)``:
        ``w_true`` is an int array of shape ``(sample_size,)`` with group labels,
        ``z_true`` a float array of shape ``(sample_size, confounder_dim)``,
        ``mu_true`` / ``var_true`` arrays of shape ``(confounder_dim, label_size)``.

    Raises:
        ValueError: if ``label_size`` is not positive.
    """
    if label_size <= 0:
        raise ValueError("label_size must be positive, got {}".format(label_size))

    mu_true = np.random.uniform(-10, 10, [confounder_dim, label_size])
    var_true = np.random.uniform(1, 5, [confounder_dim, label_size])
    # Round-robin labels via modulo. Tiling arange(label_size) an integer number
    # of times would silently return fewer than sample_size users whenever
    # sample_size is not a multiple of label_size.
    w_true = np.arange(sample_size, dtype="int") % label_size
    z_true = np.vstack([
        np.random.normal(mu_true[i][w_true], np.sqrt(var_true[i][w_true])) for i in range(confounder_dim)
    ]).T
    return w_true, z_true, mu_true, var_true

In [None]:
def gen_treatment(treatment_dim, confounder, emb_z, sparse_ratio, treatment_noise_ratio = 0):
    """Simulate exposure probabilities and one binary exposure draw.

    The exposure logit is ``LeakyReLU_0.2(confounder @ W @ emb_z.T)`` with a
    random mixing matrix ``W`` (uniform in [0, 0.3)), plus Gaussian noise scaled
    by ``treatment_noise_ratio``. The logit is squashed by a sigmoid and
    multiplied by ``sparse_ratio``.

    Args:
        treatment_dim: number of treatments; only used to size the unused MLP.
        confounder: 2-D tensor whose second dimension is the confounder size.
        emb_z: tensor whose transpose is right-multiplied onto the mixed confounder.
        sparse_ratio: multiplier applied to the sigmoid output.
        treatment_noise_ratio: scale of the additive Gaussian noise on the logit.

    Returns:
        Tuple ``(x_prob, x_obs)``: the exposure probabilities and a Bernoulli
        sample drawn from them.
    """
    latent_dim = confounder.shape[1]

    # NOTE(review): this model is never used below. It is kept because building
    # it may draw from the torch RNG (weight init), which would shift every
    # later random draw — confirm before removing.
    _unused_exposure_model = MLP(input_dim=latent_dim, hidden_dims=[treatment_dim, treatment_dim],
                                 activations=["relu", "sigmoid"])

    mixing = 0.3 * torch.rand((latent_dim, latent_dim))
    leaky_relu = nn.LeakyReLU(0.2)
    logits = leaky_relu(confounder @ mixing @ emb_z.T)

    jitter = torch.randn_like(logits) * treatment_noise_ratio
    # Report signal vs. noise magnitude so the ratio can be checked by eye.
    print(logits.abs().mean(), jitter.abs().mean())

    x_prob = torch.sigmoid(logits + jitter) * sparse_ratio
    return x_prob, torch.bernoulli(x_prob)

In [None]:
def gen_gaussian_embedding(size, embedding_dim, add_bias=False):
    """Return a ``(size, embedding_dim)`` tensor of N(0, 1) samples scaled by 5.

    When ``add_bias`` is true, one random integer offset in {0, 1, 2, 3} is drawn
    per row and added to every entry of that row.
    """
    scaled_normal = torch.randn((size, embedding_dim)) * 5
    if not add_bias:
        return scaled_normal
    # One integer offset per row, broadcast across the embedding dimension.
    row_offsets = torch.randint(4, (size, 1))
    return scaled_normal + row_offsets


def gen_uniform_embedding(size, embedding_dim):
    """Return a ``(size, embedding_dim)`` tensor of samples uniform in [0, 1)."""
    shape = (size, embedding_dim)
    return torch.rand(shape)

In [None]:
# Draw group labels (w_true) and the latent confounders (z_true) together with the
# per-group means / variances they were sampled from.
w_true, z_true, mu_true, var_true = gen_confounder(sample_size, n_labels, z_dim)
# From here on z_true is a float32 torch tensor (it was a NumPy array above).
z_true = torch.tensor(z_true, dtype=torch.float)

In [None]:
# User embeddings: uniform in [0, 1), one row per user.
emb_u = gen_uniform_embedding(sample_size, embedding_dim)
# Item embeddings: scaled Gaussian plus a random integer offset per item.
emb_i = gen_gaussian_embedding(x_dim, embedding_dim, add_bias=True)
# Per-item loadings on the confounder; used for both exposure and rating effects.
emb_z = gen_uniform_embedding(x_dim, z_dim)

In [None]:
# x_prob: exposure probabilities; x_obs: one Bernoulli draw from them (0/1 matrix).
x_prob, x_obs = gen_treatment(x_dim, z_true, emb_z, sparse_ratio, treatment_noise_ratio)

In [None]:
# Sanity check: emb_z was built with size (x_dim, z_dim).
emb_z.shape

In [None]:
# Inspect the exposure probabilities returned by gen_treatment.
x_prob

In [None]:
# Distribution of the mean exposure probability per row (one value per user).
plt.hist(x_prob.mean(1))

In [None]:
# Display-only: draws a fresh Bernoulli sample that is NOT stored (x_obs is unchanged).
# NOTE(review): this still consumes the global torch RNG, so running or skipping
# this cell changes the noise and random-item draws generated further down.
torch.bernoulli(x_prob)

In [None]:
# Distribution of the number of exposed items per row (per user).
plt.hist(x_obs.sum(1))

In [None]:

# Overall exposure density: fraction of (user, item) cells that are exposed.
# Uses the configured sizes instead of the literals 300 / 2000 so the figure
# stays correct when x_dim or sample_size is changed.
x_obs.sum() / x_dim / sample_size

In [None]:
# Average number of exposed items per row (per user).
x_obs.sum(1).mean()

In [None]:
# Smallest per-row exposure count; 0 would mean some user has no training rows.
x_obs.sum(1).min()

In [None]:
# Smallest per-column exposure count; 0 would mean some item is never exposed.
x_obs.sum(0).min()

In [None]:
# Raw score = user-item affinity + scaled confounder effect + Gaussian noise.
exp_effect = emb_u @ emb_i.T
confounder_effect = z_true @ emb_z.T * confounding_effect_rating
noise = torch.randn((sample_size, x_dim)) * rating_noise_ratio
mf_res = exp_effect + confounder_effect + noise

# Rescale so the 5% / 95% quantiles of the raw scores map to 0 / 1. Values
# outside that range fall outside [0, 1] and are clipped after binning.
q_low = torch.quantile(mf_res, 0.05)
q_high = torch.quantile(mf_res, 0.95)
# Exponent 1 leaves the rescaled scores unchanged; it is a knob for skewing them.
soft_mf_res = torch.pow((mf_res - q_low) / (q_high - q_low), 1)

# Bin into integer ratings 1..5 and clip the tails.
rating_matrix = torch.clamp(torch.ceil(soft_mf_res * 5), min=1, max=5)

In [None]:
# Per-user mean magnitude of the embedding (user-item affinity) component.
plt.hist(exp_effect.abs().mean(1))

In [None]:
# Per-user mean magnitude of the confounder component, for comparison with the above.
plt.hist(confounder_effect.abs().mean(1))

In [None]:
# Per-user mean magnitude of the additive rating noise.
plt.hist(noise.abs().mean(1))

In [None]:
# Inspect the sampled 0/1 exposure matrix.
x_obs

In [None]:
# Inspect the full rating matrix (values clipped to 1..5).
rating_matrix

In [None]:
# Mean rating per column (per item), averaged over all users.
rating_matrix.mean(0)

In [None]:
# Row (user) and column (item) indices of the non-zero entries of x_obs, i.e. the
# exposed pairs; only these ratings go into the training set.
uids, iids = x_obs.nonzero(as_tuple=True)
ratings = rating_matrix[uids, iids]

In [None]:
# Number of distinct users that have at least one exposed item.
uids.unique().size()

In [None]:
# Uniformly-random set: for every user take the first `random_item_size` entries
# of a fresh permutation of the items (distinct items, independent of exposure).
random_iids_list = [torch.randperm(x_dim)[:random_item_size] for _ in range(sample_size)]
random_iids = torch.cat(random_iids_list)
# User ids aligned with random_iids: each id repeated random_item_size times in a row.
random_uids = torch.arange(sample_size).repeat_interleave(random_item_size)
random_ratings = rating_matrix[random_uids, random_iids]


In [None]:
def save_csv(uids, iids, ratings, name):
    """Write (user, item, rating) triples to a CSV file and return them as a DataFrame.

    Args:
        uids: 1-D array-like of user ids (NumPy array, list or CPU torch tensor).
        iids: 1-D array-like of item ids, same length as ``uids``.
        ratings: 1-D array-like of ratings, same length as ``uids``.
        name: path of the CSV file to write.

    Returns:
        The ``pandas.DataFrame`` that was written, with columns
        ``user_id``, ``item_id`` and ``rating``.
    """
    # Convert to NumPy first: depending on the pandas version, tensors handed
    # straight to DataFrame can end up as object columns of 0-d tensors, which
    # serialise as "tensor(3)" instead of "3".
    df = pd.DataFrame(
        data={
            "user_id": np.asarray(uids),
            "item_id": np.asarray(iids),
            "rating": np.asarray(ratings),
        }
    )
    # `index` is a boolean flag; False omits the row index from the file.
    df.to_csv(name, sep=",", index=False)
    return df

In [None]:
# Training set: ratings observed under the simulated exposure (x_obs).
df_train = save_csv(uids, iids, ratings, "train{}.csv".format(name_suffix))

In [None]:
# Uniformly-random set: ratings of randomly chosen items, independent of exposure.
df_random = save_csv(random_uids, random_iids, random_ratings, "random{}.csv".format(name_suffix))

In [None]:
# One indicator column per group label, one row per user.
user_feat_onehot = pd.get_dummies(w_true)

In [None]:
# Persist the integer group labels (row index omitted).
# NOTE(review): unlike the rating files, this file name does not carry name_suffix.
pd.Series(w_true).to_csv("user_feat_label.csv", index=None)

In [None]:
# Persist the one-hot group features (row index omitted).
user_feat_onehot.to_csv("user_feat_onehot.csv", index=None)

In [None]:
# Inspect the group label assigned to each user.
w_true

In [None]:
# plt.hist(mf_res.view(-1))

In [None]:
# plt.hist(soft_mf_res.view(-1))

In [None]:
# Summary statistics of the ratings in the exposure-biased training set.
df_train["rating"].describe()

In [None]:
# Rating distribution in the training set (compare with the random set below).
df_train["rating"].hist()

In [None]:
# Rating distribution in the uniformly-random set.
df_random["rating"].hist()

In [None]:
# Exact count of each rating value in the uniformly-random set.
df_random["rating"].value_counts()

In [None]:
# Posterior means saved by separately trained VAE / iVAE runs.
# NOTE: torch.load unpickles its input, so only point these paths at trusted files.
vae_z_mean = torch.load("../sim_vae/sr_0.1_tr_0.0/mean.pt")
ivae_z_mean = torch.load("../sim_ivae/sr_0.1_tr_0.0/mean.pt")

# One scatter panel per latent space (first two dimensions), coloured by the
# true group label so cluster structure can be compared by eye.
plt.figure(figsize=(12, 12))
panels = [
    ("True 2-dim latent", z_true.T),
    ("VAE", vae_z_mean.T.detach().numpy()),
    ("iVAE", ivae_z_mean.T.detach().numpy()),
]
for slot, (title, coords) in enumerate(panels, start=1):
    ax = plt.subplot(2, 2, slot)
    ax.set_title(title)
    plt.scatter(coords[0], coords[1], c=w_true, s=1)

# Score each model's latents against the true ones with mean_corr_coef
# (imported from cal_mcc): VAE first, then iVAE.
for estimate in (vae_z_mean, ivae_z_mean):
    print(mean_corr_coef(z_true, estimate).item())