In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.api as sm
from limix_lmm import LMM
from pandas_plink import read_plink
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

In [2]:
import scipy
import scipy.stats as st

# Re-expose a handful of NumPy functions under the top-level ``scipy`` namespace
# when the installed SciPy does not provide them. Attributes that already exist
# are left untouched.
# NOTE(review): presumably needed because a dependency (limix_lmm?) still calls
# e.g. ``scipy.dot`` — confirm against the installed versions.
for _alias in ("dot", "einsum", "log", "sign", "sqrt"):
    if not hasattr(scipy, _alias):
        setattr(scipy, _alias, getattr(np, _alias))
del _alias

In [3]:
def simulate_pheno(X, idx_caus, var_expl, rng, direction=None):
    """Simulate a standardized phenotype driven by a set of causal variants.

    The phenotype is the sum of a genetic component (standardized causal
    genotypes times their effect sizes) and Gaussian noise scaled by
    ``sqrt(1 - sum(var_expl))``; the sum is then standardized.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_variants)
        Genotype matrix; only the causal columns are read.
    idx_caus : sequence of int
        Column indices of the causal variants.
    var_expl : array-like of float
        Variance attributed to each causal variant; must sum to less than 1.
        Lists and tuples are accepted as well as arrays.
    rng : numpy.random.Generator
        Source of randomness (``random`` and ``standard_normal`` are used).
    direction : array-like of float, optional
        Sign multiplier of each causal effect. When omitted, +1/-1 is drawn
        at random for every causal variant.

    Returns
    -------
    y : ndarray of shape (n_samples, 1)
        Phenotype with mean 0 and standard deviation 1.
    beta_real : ndarray of shape (n_variants,)
        Effect sizes rescaled to the standardized phenotype; zero for
        non-causal variants.

    Raises
    ------
    AssertionError
        If the input lengths disagree, ``sum(var_expl) >= 1``, or a causal
        variant has no variance (it could not be standardized).
    """
    # Coerce so that plain Python sequences work (a list has no ``.sum()``).
    var_expl = np.asarray(var_expl, dtype=float)
    assert len(idx_caus) == len(var_expl)

    # Random signs are drawn before the noise so the rng stream order is stable.
    if direction is None:
        direction = 2. * (rng.random(len(idx_caus)) > 0.5) - 1.
    else:
        direction = np.asarray(direction, dtype=float)
    assert len(idx_caus) == len(direction)

    # Variance left for the noise term.
    ve = 1 - var_expl.sum()
    assert ve > 0, 'sum(var_expl) should be < 1'

    # Effect size magnitude is sqrt(variance explained); the sign comes from `direction`.
    beta = np.sqrt(var_expl) * direction

    # Standardize the causal genotypes column-wise.
    Xc = X[:, idx_caus]
    sd_caus = Xc.std(0)
    # A constant (or NaN) column would turn the whole phenotype into NaN.
    assert np.all(sd_caus > 0), 'causal variants must have non-zero variance'
    Xc = (Xc - Xc.mean(0)) / sd_caus

    # Genetic and noise components, both as column vectors.
    yg = Xc.dot(beta)[:, None]
    yn = np.sqrt(ve) * rng.standard_normal((X.shape[0], 1))
    y = yg + yn

    # Effect sizes for every variant in X (zero unless causal).
    beta_real = np.zeros(X.shape[1])
    beta_real[idx_caus] = beta

    # Standardize y and rescale the effects by the same factor so they stay
    # on the scale of the returned phenotype.
    ystd = y.std()
    y = (y - y.mean()) / ystd
    beta_real = beta_real / ystd

    return y, beta_real

In [4]:
def qq_plot(p_values, title, ax=None):
    """
    Draw a QQ plot of observed versus expected -log10 p-values.

    Parameters:
    - p_values: array-like of p-values; any shape is accepted (the values are
      flattened before sorting) and NaN entries are ignored
    - title: title for the plot
    - ax: optional object exposing the matplotlib Axes plotting methods to
      draw on; defaults to the current pyplot axes

    Raises:
    - ValueError: if there is no non-NaN p-value to plot
    """
    # Flatten first: np.sort works along the last axis, so a column-shaped
    # (n, 1) input would otherwise be left unsorted.
    pvals = np.asarray(p_values, dtype=float).ravel()
    pvals = pvals[~np.isnan(pvals)]
    n_tests = pvals.size
    if n_tests == 0:
        raise ValueError("qq_plot requires at least one non-NaN p-value")

    observed = -np.log10(np.sort(pvals))
    # Under the null, the i-th smallest of n uniform p-values has expectation i / (n + 1).
    expected = -np.log10(np.arange(1, n_tests + 1) / (n_tests + 1))

    if ax is None:
        ax = plt.gca()

    # Create the QQ plot
    ax.scatter(expected, observed, marker='.')
    # Identity line up to the largest expected value (rank 1 comes first).
    upper = expected[0]
    ax.plot([0, upper], [0, upper], color='red', linestyle='--')
    ax.set_xlabel('Expected -log10(P-value)')
    ax.set_ylabel('Observed -log10(P-value)')
    ax.set_title(title)

In [None]:
# Path prefix of the PLINK fileset that is handed to read_plink in the next cell.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
bfile = "/Users/oykusuoglu/gobi/gobi_gwas/universal_data/preprocessing/chr22_preprocessed"

In [6]:
# Load the fileset: `bim` has one row per variant and `fam` one row per sample
# (their row counts are used as n_snps / n_samples below); `G` is the genotype array.
# NOTE(review): G is materialized later via .compute(), so it is presumably lazy (dask) — confirm.
bim,fam, G = read_plink(bfile)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 50.23it/s]
top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



In [7]:
# Dataset dimensions: one row per variant in `bim`, one row per sample in `fam`.
n_snps, n_samples = len(bim), len(fam)
(n_snps, n_samples)

(59743, 2504)

In [8]:
# Materialize the genotype array and transpose it to (samples, variants): the
# cells below reduce over axis 0 per variant and match columns to rows of `bim`.
# NOTE(review): .compute() suggests G is a lazy (dask) array — confirm.
X_real = G.compute().T

In [None]:
# Column-wise (per-variant) mean and population standard deviation over all samples.
mu_full = np.mean(X_real, axis=0)
sd_full = np.std(X_real, axis=0, ddof=0)

# Variants with (numerically) zero spread cannot be z-scored and are dropped.
keep_full = sd_full > 1e-12
keep_idx = np.flatnonzero(keep_full)

# Z-score the retained variants only.
standardized_X = (X_real[:, keep_idx] - mu_full[keep_idx]) / sd_full[keep_idx]

In [10]:
# Variant metadata restricted to the retained columns, renumbered from 0 so that
# row i describes column i of `standardized_X`; the original row position in
# `bim` is preserved in `orig_bim_idx`.
bim_kept = (
    bim.iloc[keep_idx]
    .reset_index(drop=True)
    .assign(orig_bim_idx=keep_idx)
)

In [11]:
# Simulation grid: every value in `h2` is combined with every value in `n_causal`.
# h2       — total variance explained by the causal variants (split equally among them in the loop).
# n_causal — number of causal variants: 10, 20, ..., 100.
h2 = [0.1,0.2,0.3,0.4,0.5,0.6]
n_causal = np.arange(10, 101, 10)

In [12]:
# Zero-initialised result grids with one row per h2 value and one column per
# n_causal value.
# NOTE(review): neither grid nor the two counters is used in the visible
# cells — presumably they are filled later; confirm.
r2_matrix = np.zeros((len(h2), len(n_causal)))
spearman_matrix = np.zeros_like(r2_matrix)
count_v = count_h = 0

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Row positions of all samples in `fam`; splitting positions (rather than IDs)
# lets the same indices slice the genotype matrix below.
idx = np.arange(len(fam))

# Fixed 80/20 split (random_state=42) so the partition is reproducible.
idx_train, idx_test = train_test_split(idx, test_size=0.2, random_state=42, shuffle=True)

# idx_* are row positions, not index labels, so select positionally with .iloc;
# .loc would only give the same rows while `fam` carries a default 0..n-1 index.
train_ids = fam.iloc[idx_train][["fid","iid"]].copy()
test_ids  = fam.iloc[idx_test][["fid","iid"]].copy()

# Persist the sample lists as tab-separated FID/IID files without header or index.
# NOTE(review): presumably consumed by PLINK --keep — confirm.
# The directory is created first so the cell also works on a fresh checkout.
split_dir = Path("/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/splits")
split_dir.mkdir(parents=True, exist_ok=True)
train_ids.to_csv(split_dir / "train.keep", sep="\t", index=False, header=False)
test_ids.to_csv(split_dir / "test.keep", sep="\t", index=False, header=False)

# Matching genotype partitions (rows = samples).
X_train = standardized_X[idx_train]
X_test  = standardized_X[idx_test]

In [15]:
from pathlib import Path
# Root directory for the figures; the simulation loop creates one sub-folder per
# (h2, n_causal) configuration underneath it.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
base_dir = Path("/Users/oykusuoglu/gobi/gobi_gwas/oyku/plots/plots_w_new_pcs")

In [16]:
# The covariate table does not depend on the simulated phenotype, so it is read
# from disk once instead of once per (h, n) configuration.
pc_path = "/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_pca10.eigenvec.zscore"
pcs = pd.read_csv(pc_path, sep=r"\s+", header=0, engine="python")
# The file's header row is replaced; assumes the first two columns are the
# family / individual IDs and all remaining columns are PCs — TODO confirm.
pcs.columns = ["FID","IID"] + [f"PC{i}" for i in range(1, pcs.shape[1]-1)]

# Number of leading PCs used as covariates next to the intercept.
k = 10

# Variant positions, shared x-axis of the effect-size / association panels.
x = bim_kept['pos'].values

for h in h2:
    for n in n_causal:
        # The generator is re-seeded for every configuration, so all (h, n)
        # combinations start from the same random stream.
        rng = np.random.default_rng(42)
        idx_caus_loop = rng.choice(standardized_X.shape[1], size=n, replace=False)
        # Total variance h is split equally among the n causal variants.
        var_expl_loop = np.repeat(h/n, n)

        # Phenotype is simulated on all samples (train and test together).
        y_loop, beta_real_loop = simulate_pheno(standardized_X, idx_caus_loop, var_expl_loop, rng)

        pheno = fam[["fid","iid"]].copy()
        pheno.columns = ["FID","IID"]
        pheno["y"] = y_loop.reshape(-1)

        df = pheno.merge(pcs, on=["FID","IID"], how="inner", validate="one_to_one")
        # idx_train / idx_test are row positions in `fam`. If the inner merge
        # dropped any sample, the covariate rows would silently shift relative
        # to y_loop and X_train, so fail loudly instead.
        if len(df) != len(pheno):
            raise ValueError(
                f"PC file matches {len(df)} of {len(pheno)} samples; "
                "covariate rows would not line up with the train/test indices"
            )

        # Design matrix of covariates: intercept column followed by the first k PCs.
        F = np.column_stack([np.ones((df.shape[0], 1)),
                            df[[f"PC{i}" for i in range(1, k+1)]].to_numpy()])

        # y_test / F_test are not used inside this loop.
        y_train = y_loop[idx_train]
        y_test  = y_loop[idx_test]
        F_train = F[idx_train]
        F_test  = F[idx_test]

        # Per-variant association scan on the training samples with F as covariates.
        lmm = LMM(y_train, F_train)
        lmm.process(X_train)
        pv = lmm.getPv()
        # Effect estimates and standard errors are not used inside this loop.
        beta = lmm.getBetaSNP()
        beta_ste = lmm.getBetaSNPste()

        outdir = base_dir / f"h2_{h}/causal_vars{n}"
        outdir.mkdir(parents=True, exist_ok=True)

        # QQ plot of the association p-values; qq_plot draws on the current figure.
        fig = plt.figure()
        qq_plot(pv, f'QQ Plot for heritability: {h} number of causal variants: {n}')
        fig.tight_layout()
        fig.savefig(outdir / f"qq_h2_{h}_causal_vars{n}.png", dpi=300, bbox_inches="tight")
        # Close explicitly: many figures are created across the grid.
        plt.close(fig)

        # Two stacked panels over genomic position: simulated effects (top)
        # and -log10 p-values from the scan (bottom).
        fig2 = plt.figure(figsize=(10, 6))
        ax1 = fig2.add_subplot(2, 1, 1)
        ax2 = fig2.add_subplot(2, 1, 2, sharex=ax1)

        ax1.set_title(f"Real effect size for heritability: {h} number of causal variants: {n}")
        ax1.plot(x, beta_real_loop, ".k")
        ax1.set_ylabel("eff size")

        ax2.set_title(f"GWAS results for heritability: {h} number of causal variants: {n}")
        ax2.plot(x, -np.log10(pv), ".k")
        ax2.set_ylabel(r"-log$_{10}$ P")
        ax2.set_xlabel("position")

        fig2.tight_layout()
        fig2.savefig(outdir / f"effects_gwas_h2{h}_causal_vars{n}.png",
                    dpi=300, bbox_inches="tight")
        plt.close(fig2)

        
        

  ste = beta / z
