In [909]:
import pandas as pd
import numpy as np
import scipy
from sklearn.preprocessing import StandardScaler
from bed_reader import open_bed, sample_file

class Data:
    """Holder for the genotype columns that belong to one annotation bin.

    Both attributes start out empty and are assigned after construction
    (``create_allgen`` stores the genotype sub-matrix in ``gen`` and the
    number of SNPs of the bin in ``index``).
    """

    def __init__(self):
        # Placeholders only; the caller fills them in.
        self.index = 0
        self.gen = None

In [910]:
def simulator(allgen, sigma):
    '''
    Simulate a phenotype from a single genotype block.

    Only ``allgen[0].gen`` (N individuals x M SNPs) is used. SNP effects are
    drawn as N(0, sigma), the genetic value ``X @ betas`` is scaled by
    1/sqrt(M), and N(0, 1 - sigma) noise is added on top.

    Returns the (N, 1) phenotype and the (M, 1) effect-size vector.
    '''
    genotypes = allgen[0].gen
    n_indv, n_snp = genotypes.shape[0], genotypes.shape[1]
    noise_var = 1 - sigma  # environmental (residual) variance

    # Effects are drawn before the noise, so the global random stream is
    # consumed in a fixed order.
    betas = np.sqrt(sigma) * np.random.randn(n_snp, 1)
    noise = np.sqrt(noise_var) * np.random.randn(n_indv, 1)

    y = np.zeros((n_indv, 1))
    y += genotypes @ betas / np.sqrt(n_snp)  # genetic component
    y += noise                               # environmental noise
    return y, betas

def simulator_multi_bin(allgen, sigma_list):
    '''
    Simulate phenotype y from per-bin genotype blocks and variance components.

    Parameters
    ----------
    allgen : sequence of objects with a ``gen`` attribute
        ``allgen[i].gen`` is the (N, M_i) genotype matrix of bin i.
    sigma_list : sequence of float
        ``sigma_list[i]`` is the variance explained by bin i. The residual
        variance is ``1 - sum(sigma_list)``.

    Returns
    -------
    y : ndarray of shape (N, 1)
        Sum over bins of ``X_i @ beta_i`` plus N(0, residual) noise.
    betas : list of ndarray
        ``betas[i]`` has shape (M_i, 1) with entries ~ N(0, sigma_i / M_i).

    Raises
    ------
    ValueError
        If any variance component is negative or the components sum to more
        than 1 (which would give a negative residual variance).
    '''
    N_indv = allgen[0].gen.shape[0]

    if any(s < 0 for s in sigma_list):
        raise ValueError("variance components in sigma_list must be non-negative")

    h = 1 - sum(sigma_list)  # residual variance
    if h < -1e-12:
        raise ValueError(
            f"sum(sigma_list) = {sum(sigma_list)} exceeds 1; residual variance would be negative"
        )
    h = max(h, 0.0)  # absorb floating-point round-off when the components sum to 1

    # Independent normals are drawn by scaling standard-normal samples.
    # Routing them through multivariate_normal with a diagonal covariance
    # would allocate and factorise an N x N (or M x M) matrix for no benefit.
    noise = np.random.randn(N_indv, 1) * np.sqrt(h)

    y = np.zeros((N_indv, 1))
    betas = []

    for i, data in enumerate(allgen):
        X = data.gen
        M = X.shape[1]
        beta = np.random.randn(M, 1) * np.sqrt(sigma_list[i] / M)
        betas.append(beta)
        y += X @ beta

    y += noise

    return y, betas

def solve_linear_equation(X, y):
    '''
    Return the least-squares solution of ``X @ sigma = y``.

    Thin wrapper around ``numpy.linalg.lstsq`` (with ``rcond=None``); only the
    solution is returned, the residuals, rank and singular values are dropped.
    '''
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return solution


def solve_linear_qr(X, y):
    '''
    Solve least square using QR decomposition.

    Factorises ``X = Q R`` and back-substitutes ``R sigma = Q.T y``.

    The economic (reduced) factorisation is used so that R is square
    (n x n) for any m x n input with m >= n. With the full factorisation R is
    m x n and the triangular solve rejects every overdetermined system.
    For square X the two modes give the same result.

    Parameters
    ----------
    X : array_like of shape (m, n), m >= n, full column rank
    y : array_like of shape (m,) or (m, k)

    Returns
    -------
    sigma : ndarray of shape (n,) or (n, k)
    '''
    Q, R = scipy.linalg.qr(X, mode='economic')
    sigma = scipy.linalg.solve_triangular(R, np.dot(Q.T, y))
    return sigma

def RHE_multi_bin(allgen,y,num_random_vect=10,seed=1,verbose=False):
    '''
    RHE (randomized Haseman-Elston) estimation for multi bins.

    With K_i = X_i X_i^T / M_i for each bin i, this builds and solves the
    (K+1) x (K+1) normal equations ``T sigma = q`` where

        T[i, j] ~ tr(K_i K_j)   (stochastic estimate from random probes)
        T[i, K] = T[K, i] = tr(K_i)
        T[K, K] = N
        q[i]    = y^T K_i y
        q[K]    = y^T y

    Parameters
    ----------
    allgen : sequence of objects with a ``gen`` attribute
        ``allgen[i].gen`` is the (N, M_i) genotype matrix of bin i; every bin
        must have the same number of rows.
    y : array_like of shape (N, 1) or (N,)
        Phenotype.
    num_random_vect : int
        Number of standard-normal probe vectors used for the trace estimates.
    seed : int or None
        Seed of the local random generator that draws the probe vectors; the
        same seed gives the same estimate. The global NumPy random state is
        not touched.
    verbose : bool
        If true, print the matrix T before solving.

    Returns
    -------
    sigma_est : ndarray of shape (K+1, 1)
        One variance component per bin followed by the component of the
        identity (residual) term.

    Raises
    ------
    ValueError
        If ``num_random_vect`` is smaller than 1.
    AssertionError
        If the bins do not share the same number of individuals.
    '''
    if num_random_vect < 1:
        raise ValueError("num_random_vect must be at least 1")

    y = np.asarray(y)
    K = len(allgen)
    N = allgen[0].gen.shape[0]

    # One shared set of probe vectors for every bin pair. Each T[i, j] stays
    # an unbiased trace estimate, T comes out symmetric, and K_i Z is computed
    # once per bin instead of once per pair.
    rng = np.random.default_rng(seed)
    Z = rng.standard_normal((N, num_random_vect))

    T = np.zeros((K+1, K+1))
    q = np.zeros((K+1, 1))

    KZ = []  # KZ[i] = K_i @ Z, shape (N, num_random_vect)
    for i, data in enumerate(allgen):
        X = data.gen
        assert X.shape[0] == N
        M = X.shape[1]  # each bin is scaled by its OWN SNP count
        Xi = X / np.sqrt(M)

        KZ.append(Xi @ (Xi.T @ Z))

        # tr(Xi Xi^T) is the squared Frobenius norm of Xi; there is no need
        # to materialise the N x N matrix.
        T[i, K] = T[K, i] = np.sum(Xi * Xi)

        # y^T K_i y = ||Xi^T y||^2
        Xty = Xi.T @ y
        q[i, 0] = np.sum(Xty * Xty)

    for i in range(K):
        for j in range(i, K):
            # z^T K_i K_j z = (K_i z)^T (K_j z), averaged over the probes.
            T[i, j] = T[j, i] = np.sum(KZ[i] * KZ[j]) / num_random_vect

    T[K, K] = N
    q[K, 0] = np.sum(y * y)

    if verbose:
        print(T)
    sigma_est = solve_linear_qr(T,q)
    return sigma_est

In [911]:
def simulate_geno_from_random(p_j):
    """Draw one genotype (0, 1 or 2) for allele frequency ``p_j``.

    The genotype probabilities are (1-p)^2, 2p(1-p) and p^2 for 0, 1 and 2
    respectively; a single uniform draw from the global NumPy random state
    selects among them.
    """
    u = np.random.random()

    q_j = 1 - p_j
    prob_zero = q_j * q_j
    prob_one = 2 * p_j * q_j

    if u < prob_zero:
        return 0
    if u < prob_zero + prob_one:
        return 1
    return 2


def impute_geno(X):
    """Impute missing genotypes column by column, then standardize.

    For every SNP (column) the allele frequency is estimated as half the mean
    of the observed (non-NaN) genotypes, and each missing entry is replaced by
    a genotype drawn with ``simulate_geno_from_random`` at that frequency.
    Missing entries are visited column by column, top to bottom. Afterwards
    each column is centred and divided by its standard deviation.

    Parameters
    ----------
    X : ndarray of shape (N, M)
        Genotype matrix with NaN marking missing calls. Not modified.

    Returns
    -------
    X_imp : ndarray of shape (N, M)
        Imputed, standardized genotypes. A column with zero variance after
        imputation is returned as all zeros instead of NaN, so it cannot
        poison downstream matrix products.

    Raises
    ------
    ValueError
        If a column has no observed genotype at all, so no allele frequency
        can be estimated for it.
    """
    missing = np.isnan(X)
    X_imp = X.copy()

    if missing.any():
        observed_ct = (~missing).sum(axis=0)
        if np.any(observed_ct == 0):
            bad = np.flatnonzero(observed_ct == 0).tolist()
            raise ValueError(
                f"cannot impute SNP column(s) {bad}: every genotype is missing"
            )

        # allele frequency = mean observed genotype / 2
        allele_freq = np.nansum(X, axis=0) / observed_ct * 0.5

        # Transposing makes nonzero() enumerate the gaps column-major.
        for m, n in zip(*np.nonzero(missing.T)):
            X_imp[n, m] = simulate_geno_from_random(allele_freq[m])

    # standardize; a constant column keeps its centred value (0) rather than
    # being divided by a zero standard deviation
    col_mean = np.mean(X_imp, axis=0)
    col_std = np.std(X_imp, axis=0)
    col_std = np.where(col_std > 0, col_std, 1.0)

    return (X_imp - col_mean) / col_std

In [912]:
# Load the genotype matrix from a .bed file with bed_reader.
# NOTE(review): absolute, user-specific path; this cell only runs on the
# original author's machine. Consider a configurable data directory.
geno_path="/Users/nijiayi/RHE_project/data/test2/actual_geno_1.bed"
bed = open_bed(geno_path)
X = bed.read()  # presumably individuals x SNPs with NaN for missing calls; confirm against bed_reader

In [913]:
# Fill in missing genotypes and standardize every SNP column (see impute_geno).
X_imp = impute_geno(X)
Nindv = X_imp.shape[0]  # number of rows of the genotype matrix
Nsnp = X_imp.shape[1]   # number of columns of the genotype matrix

In [914]:
# Simulate an annotation file: number of annotation bins (columns) to generate.
Nbin = 8

import random

def create_annot_file(Nsnp, Nbin, filename):
    """Write a random one-hot annotation file.

    The file gets ``Nsnp`` lines. Each line holds ``Nbin`` space-separated
    0/1 flags with exactly one 1, placed in a column chosen uniformly at
    random with ``random.randint``. An existing file is overwritten.
    """
    with open(filename, 'w') as out:
        for _ in range(Nsnp):
            hot_col = random.randint(0, Nbin - 1)
            flags = ('1' if col == hot_col else '0' for col in range(Nbin))
            out.write(' '.join(flags) + '\n')

# Write the simulated annotation to disk: one line per SNP, one flag per bin.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
filename = '/Users/nijiayi/RHE_project/data/test2/annot.txt'
create_annot_file(Nsnp, Nbin, filename)

In [915]:
from RHE.util.file_processing import read_annot

# Read the annotation file back in with the project's reader.
# NOTE(review): the meaning of the second argument (1) is not visible here;
# presumably the number of jackknife blocks. Confirm against read_annot.
filename = '/Users/nijiayi/RHE_project/data/test2/annot.txt'
annot_matrix, jack_bin = read_annot(filename, 1)

Number of SNPs per block: 996
134 SNPs in 0 -th bin
120 SNPs in 1 -th bin
119 SNPs in 2 -th bin
127 SNPs in 3 -th bin
123 SNPs in 4 -th bin
134 SNPs in 5 -th bin
105 SNPs in 6 -th bin
134 SNPs in 7 -th bin


In [916]:
def bin_to_snp(annot_matrix_chunk):
    """Map every bin (column) to the list of SNP rows flagged in it.

    Returns a list with one entry per column of ``annot_matrix_chunk``; entry
    ``b`` is the ascending list of row indices whose value in column ``b`` is
    non-zero.
    """
    n_bins = annot_matrix_chunk.shape[1]
    return [
        np.nonzero(annot_matrix_chunk[:, b])[0].tolist()
        for b in range(n_bins)
    ]

# Per-bin lists of SNP indices derived from the annotation matrix.
bin_to_snp_map = bin_to_snp(annot_matrix)

def create_allgen(bin_to_snp_map, X):
    """Split the genotype matrix into one ``Data`` object per bin.

    For bin ``i`` the returned object's ``gen`` is ``X[:, bin_to_snp_map[i]]``
    and its ``index`` is the number of SNPs listed for that bin.
    """
    allgen = []
    for i in range(len(bin_to_snp_map)):
        snp_indices = bin_to_snp_map[i]
        block = Data()
        block.index = len(snp_indices)
        block.gen = X[:, snp_indices]
        allgen.append(block)
    return allgen

In [917]:
# True variance component of each bin used for the simulation; whatever is
# left, 1 - sum(sigma_list), is the residual variance.
sigma_list = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

# Split the imputed, standardized genotypes into one Data block per bin.
bin_to_snp_map = bin_to_snp(annot_matrix)
allgen = create_allgen(bin_to_snp_map, X_imp)

# Simulate a phenotype with the chosen per-bin variances.
y, beta_list = simulator_multi_bin(allgen, sigma_list)

print(f'Actual sigmas are: {sigma_list}')

# sigma_est has one row per bin plus a final row for the identity (residual) term.
sigma_est=RHE_multi_bin(allgen, y, num_random_vect=10, seed=42) # run py-RHE
print('RHE estimated sigmas are: {}'.format(", ".join(str(s) for s in sigma_est)))

Actual sigmas are: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
RHE estimated sigmas are: [0.12918466], [0.13577179], [0.05269134], [0.10714339], [0.12205465], [0.13589909], [0.0659057], [0.14074607], [0.14654497]
