In [2]:
import numpy as np
from scipy.linalg import sqrtm, inv
from sklearn.linear_model import LogisticRegression
# Seed the global NumPy RNG so that every np.random draw in the notebook is
# reproducible under "Restart Kernel -> Run All".
SEED = 0
np.random.seed(SEED)

# d: number of coordinates; r: common off-diagonal entry of sigma.
d, r = 4, 0.1
# d x d matrix with 1 on the diagonal and r everywhere else.
sigma = np.identity(d) * (1-r) + r
# Matrix square root of sigma (M @ M == sigma); right-multiplying
# standard-normal rows by M gives rows whose covariance is sigma.
M = sqrtm(sigma)
# Column vector of ones with shape (d, 1); it is added to the features of
# every sample whose label is 1.
mu = np.repeat(1, d).reshape((-1,1))

### Source data

In [3]:
# Source sample: labels are Bernoulli(pi_source) and all d coordinates are kept.
n_source = 200
pi_source = 0.5
# Labels are drawn before the features, as a column of shape (n_source, 1).
y_source = np.random.binomial(n=1, p=pi_source, size=(n_source, 1))
# Correlated Gaussian noise plus a shift of mu for the samples with label 1.
x_source = (
    np.random.normal(loc=0, scale=1, size=(n_source, d)) @ M
    + y_source @ mu.T
)

### Target data

In [4]:
# Target sample: labels are Bernoulli(pi_target); only the first three
# coordinates are retained.
n_target = 200
pi_target = 0.75
# Labels are drawn before the features, as a column of shape (n_target, 1).
y_target = np.random.binomial(n=1, p=pi_target, size=(n_target, 1))
# Generate all d coordinates exactly as for the source sample, then keep
# columns 0-2.
x_target = (
    np.random.normal(loc=0, scale=1, size=(n_target, d)) @ M
    + y_target @ mu.T
)[:, :3]

### Test data set

In [5]:
# Test sample: same label prior as the target sample; only the first three
# coordinates are retained.
n_test = 200
pi_test = pi_target
# Labels are drawn before the features, as a column of shape (n_test, 1).
y_test = np.random.binomial(n=1, p=pi_test, size=(n_test, 1))
# Generate all d coordinates, then keep columns 0-2.
x_test = (
    np.random.normal(loc=0, scale=1, size=(n_test, d)) @ M
    + y_test @ mu.T
)[:, :3]

### Logistic classifier 

We fit a logistic regression on the full source data and another on the target data with its last coordinate imputed, and we measure performance on the test data, whose last coordinate is imputed in the same way.

#### Source classifier

In [6]:
# Logistic regression fitted on the source sample with all d coordinates.
scl = LogisticRegression(max_iter=1000)
# y_source is a column of shape (n_source, 1); scikit-learn expects a 1-D
# label vector and emits a DataConversionWarning otherwise, so flatten it.
scl.fit(x_source, y_source.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Mean and covariance matrix estimation

In [11]:
# Coordinates 0-2 are available in the source, target and test samples, so
# their moments are estimated from the three samples pooled together.
x_present_pooled = np.concatenate([x_source[:, :3], x_target, x_test], 0)

# Mean estimation
mu_present = np.mean(x_present_pooled, axis=0)
# Coordinate 3 is only available in the source sample.
mu_missing = np.mean(x_source[:, 3])

# Covariance estimation
cov_present = np.cov(x_present_pooled, rowvar=False)
cov_missing = np.cov(x_source[:, 3], rowvar=False)
# Cross-covariance between coordinates 0-2 and coordinate 3, shape (3,).
# It is read off the joint sample covariance of the source data so that it
# is mean-centred and normalised like the np.cov estimates above; the
# uncentred product x_present.T @ x_missing / 200 is a raw second moment,
# not a covariance, and hard-codes the sample size.
cov_present_vs_missing = np.cov(x_source, rowvar=False)[:3, 3]

### Imputation of the missing covariates


$\hat x_{\text{missing}} = \Sigma_{\text{missing}, \text{present}}\Sigma_{\text{present}}^{-1}(x_{\text{present}} - \mu_{\text{present}})$

In [None]:
def covariate_imputation(x, y, mu=None, sigma_present=None,
                         sigma_missing=1, cov=None, mu_missing=0.0):
    """Impute a missing coordinate by linear regression on the present ones.

    Computes, for every row of ``x``::

        mu_missing + cov @ inv(sigma_present) @ (x_row - mu)

    which with the default ``mu_missing = 0`` is
    Sigma_{missing,present} Sigma_{present}^{-1} (x_present - mu_present).

    Parameters
    ----------
    x : array-like, shape (n, p) or (p,)
        Present coordinates; a 1-D input is treated as a single row.
    y : any
        Accepted for call compatibility and not used.
        NOTE(review): the previous body added ``y * mu[3]``, which cannot be
        evaluated together with ``x - mu`` for a length-3 ``mu``; the formula
        above has no label term, so it is left out here -- confirm intent.
    mu : array-like, shape (p,), optional
        Mean of the present coordinates. Defaults to zeros.
    sigma_present : array-like, shape (p, p), optional
        Covariance of the present coordinates. Defaults to the identity.
    sigma_missing : float, optional
        Accepted for call compatibility; the point estimate does not use it.
    cov : array-like, shape (p,), optional
        Covariance between each present coordinate and the missing one.
        Defaults to zeros.
    mu_missing : float, optional
        Constant added to every imputed value. Defaults to 0.

    Returns
    -------
    numpy.ndarray, shape (n, 1)
        Imputed values as a column vector.

    Raises
    ------
    ValueError
        If ``mu``, ``sigma_present`` or ``cov`` do not match the number of
        columns of ``x``.
    """
    x = np.atleast_2d(np.asarray(x, dtype=float))
    n_present = x.shape[1]

    # Defaults are built from the width of x rather than a fixed size.
    if mu is None:
        mu = np.zeros(n_present)
    else:
        mu = np.asarray(mu, dtype=float).reshape(-1)
    if cov is None:
        cov = np.zeros(n_present)
    else:
        cov = np.asarray(cov, dtype=float).reshape(-1)
    if sigma_present is None:
        sigma_present = np.identity(n_present)
    else:
        sigma_present = np.asarray(sigma_present, dtype=float)

    if mu.shape[0] != n_present or cov.shape[0] != n_present:
        raise ValueError(
            "mu and cov must each have one entry per column of x")
    if sigma_present.size != n_present * n_present:
        raise ValueError(
            "sigma_present must be square with one row per column of x")
    sigma_present = sigma_present.reshape(n_present, n_present)

    # Solve sigma_present @ w = cov instead of forming the explicit inverse.
    weights = np.linalg.solve(sigma_present, cov)
    return (mu_missing + (x - mu) @ weights).reshape((-1, 1))
