In [1]:
!pip install pandas-plink limix-lmm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
from pandas_plink import read_plink
from limix_lmm import LMM
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
import scipy
import scipy.stats as st

# Shim: give the top-level `scipy` namespace NumPy-backed `dot`, `einsum`,
# `log`, `sign` and `sqrt` attributes when the installed SciPy does not
# provide them (presumably needed by an imported library that still calls
# them through `scipy.` -- confirm which one). Existing attributes are kept.
for _alias in ('dot', 'einsum', 'log', 'sign', 'sqrt'):
    if not hasattr(scipy, _alias):
        setattr(scipy, _alias, getattr(np, _alias))

### Preprocessing pipeline

BFILE=/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/ALL.chr22_GRCh38.genotypes.20170504

OUTDIR=/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/qc

mkdir -p "$OUTDIR"


plink --bfile "$BFILE" --freq --missing --out "$OUTDIR"/initial_qc


Preprocessing 

We kept only SNPs (single-base substitutions) and dropped variants with a missing-call rate > 0.02 or a minor allele frequency < 0.01 (so that only common variants remain), as well as variants with an extreme deviation from Hardy–Weinberg equilibrium (`--hwe 1e-6`), since such deviations are more likely caused by noise than by real signal.

(filtered data set written as a new bed/bim/fam)


plink --bfile "$BFILE" \
  --snps-only \
  --geno 0.02 \
  --maf 0.01 \
  --hwe 1e-6 \
  --make-bed \
  --out "$OUTDIR"/chr22_step1_common


Check whether there are duplicate variant IDs:

plink --bfile "$OUTDIR"/chr22_step1_common \
  --list-duplicate-vars ids-only suppress-first \
  --out "$OUTDIR"/dupcheck

wc -l "$OUTDIR"/dupcheck.dupvar

There were no duplicates (`dupcheck.dupvar` has 0 lines), so the exclusion step below does not remove any variants.

plink --bfile "$OUTDIR"/chr22_step1_common \
  --exclude "$OUTDIR"/dupcheck.dupvar \
  --make-bed \
  --out "$OUTDIR"/chr22_step2_nodup

Sample-level missingness filter (`--mind 0.02`): no samples were removed (chr22_step2_nodup.fam and chr22_step3_sampleqc.fam both contain 2504 samples).

plink --bfile "$OUTDIR"/chr22_step2_nodup \
  --mind 0.02 \
  --make-bed \
  --out "$OUTDIR"/chr22_step3_sampleqc




INITIAL:

Number of variants: 109827 

Number of samples: 2504

AFTER QC:

Number of variants: 59743

Number of samples: 2504


In [4]:
# Prefix of the PLINK fileset produced by the last QC step above
# (chr22_step3_sampleqc). The path is relative, so it is resolved against the
# notebook's current working directory.
bfile = 'data/qc/chr22_step3_sampleqc'
# bim: variant annotation table, fam: sample table, G: genotype matrix
# (G is only materialized later via `.compute()`).
bim, fam, G = read_plink(bfile)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 47.76it/s]
top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



In [5]:
# Materialize the genotype matrix in memory and transpose it so that rows are
# samples and columns are variants (presumably G is variants x samples -- confirm).
X_real = G.compute().T

Standardize the genotype matrix so that all SNPs are on the same scale. Three variants were dropped because their standard deviation across all individuals was (numerically) zero, so they cannot be scaled. The bim annotation table must be updated accordingly.

In [6]:
# Whole chromosome 22: z-score every variant (column) of the genotype matrix.

# Per-variant mean and population standard deviation (ddof=0).
mu_full = X_real.mean(axis=0)
sd_full = X_real.std(axis=0, ddof=0)

# Columns whose spread is numerically zero cannot be scaled; keep the others.
# `keep_full` is the boolean mask, `keep_idx` the matching integer positions.
keep_full = sd_full > 1e-12
keep_idx = np.flatnonzero(keep_full)

# Centre and scale the retained columns only.
standardized_X = (
    (X_real[:, keep_full] - mu_full[keep_full])
    / sd_full[keep_full]
)

In [7]:
# Annotation rows for the variants that were not dropped during
# standardization, re-indexed from 0. `orig_bim_idx` stores each variant's
# row position in the full `bim` table.
bim_kept = (
    bim.iloc[keep_idx]
    .reset_index(drop=True)
    .assign(orig_bim_idx=keep_idx)
)

In [8]:
# Sanity check: expect (n_samples, n_variants_kept).
standardized_X.shape

(2504, 59740)

In [9]:
def simulate_pheno(X, idx_caus, var_expl, rng, direction=None):
    """Simulate a standardized quantitative phenotype from causal variants.

    The phenotype is the sum of a genetic component (standardized causal
    columns of ``X`` times their effect sizes) and Gaussian noise whose
    variance is ``1 - sum(var_expl)``; the sum is then re-standardized.

    Parameters
    ----------
    X : 2-D array (rows = individuals, columns = variants)
        Genotype matrix; only the columns in ``idx_caus`` are read.
    idx_caus : sequence of int
        Column indices of the causal variants.
    var_expl : array-like of float
        Variance explained by each causal variant (same length as
        ``idx_caus``); values must be non-negative and sum to less than 1.
    rng : random generator
        Object providing ``random(n)`` and ``standard_normal(shape)``,
        e.g. ``numpy.random.default_rng(seed)``.
    direction : array-like of float, optional
        Sign (+1 / -1) of each causal effect. Drawn at random from ``rng``
        when omitted.

    Returns
    -------
    y : ndarray, shape (X.shape[0], 1)
        Phenotype with mean 0 and standard deviation 1.
    beta_real : ndarray, shape (X.shape[1],)
        True effect sizes on the scale of the standardized phenotype; zero
        for every non-causal variant.

    Raises
    ------
    AssertionError
        If the input lengths disagree or ``sum(var_expl) >= 1``.
    ValueError
        If ``var_expl`` has a negative entry or a causal column of ``X`` has
        no variance (either would silently produce an all-NaN phenotype).
    """
    # Accept plain lists/tuples as well as arrays.
    var_expl = np.asarray(var_expl, dtype=float)

    # One variance-explained value per causal variant.
    assert len(idx_caus) == len(var_expl)
    if np.any(var_expl < 0):
        raise ValueError('var_expl entries must be non-negative')

    # Random +1/-1 directions unless the caller fixed them. This draw happens
    # before the noise draw below, so the order of rng consumption matters for
    # reproducibility.
    if direction is None:
        direction = 2. * (rng.random(len(idx_caus)) > 0.5) - 1.
    else:
        direction = np.asarray(direction, dtype=float)
    assert len(idx_caus) == len(direction)

    # Variance left for the noise component.
    ve = 1 - var_expl.sum()
    assert ve > 0, 'sum(var_expl) should be < 1'

    # Effect sizes on the standardized-genotype scale.
    beta = np.sqrt(var_expl) * direction

    # Standardize the causal columns; a constant column would divide by zero.
    Xc = X[:, idx_caus]
    sd_caus = Xc.std(0)
    if not np.all(sd_caus > 0):
        raise ValueError('causal variants must have non-zero variance in X')
    Xc = (Xc - Xc.mean(0)) / sd_caus

    # Genetic component plus noise scaled to the remaining variance.
    yg = Xc.dot(beta)[:, None]
    yn = np.sqrt(ve) * rng.standard_normal((X.shape[0], 1))
    y = yg + yn

    # True effects for all variants: zero except at the causal indices.
    beta_real = np.zeros(X.shape[1])
    beta_real[idx_caus] = beta

    # Standardize the phenotype and rescale the true effects to match.
    ystd = y.std()
    y = (y - y.mean()) / ystd
    beta_real = beta_real / ystd

    return y, beta_real

In [10]:
def qq_plot(p_values, title, ax=None):
    """
    Draw a QQ plot of observed vs. expected -log10 p-values.

    Parameters:
    - p_values: array-like of p-values (any shape; it is flattened).
      Non-finite entries (NaN/inf) are ignored so that they neither occupy
      ranks nor inflate the number of tests used for the expected quantiles.
    - title: title for the plot
    - ax: optional matplotlib Axes to draw on; defaults to the current axes

    Raises:
    - ValueError: if there is no finite p-value to plot
    """
    # Flatten first: np.sort works along the last axis, so a column vector
    # would otherwise be returned unsorted.
    pvals = np.asarray(p_values, dtype=float).ravel()
    pvals = pvals[np.isfinite(pvals)]
    n_tests = pvals.size
    if n_tests == 0:
        raise ValueError("qq_plot needs at least one finite p-value")

    if ax is None:
        ax = plt.gca()

    # Smallest p-value first, so both arrays are in decreasing -log10 order.
    observed = -np.log10(np.sort(pvals))
    # NOTE(review): the denominator n + 2 is kept from the original code; the
    # usual plotting position is i / (n + 1) -- confirm which is intended.
    expected = -np.log10(np.arange(1, n_tests + 1) / (n_tests + 2))

    # Points plus the y = x reference line expected under the null.
    ax.scatter(expected, observed, marker='.')
    top = expected.max()
    ax.plot([0, top], [0, top], color='red', linestyle='--')
    ax.set_xlabel('Expected -log10(P-value)')
    ax.set_ylabel('Observed -log10(P-value)')
    ax.set_title(title)

In [11]:
# Simulation grid.
# Numbers of causal variants: 10, 20, ..., 100.
m = np.arange(10, 101, 10)
# Target heritabilities 0.1, 0.2, ..., 0.6. Rounded to one decimal because
# linspace returns 0.30000000000000004 for the third value, which would show
# up verbatim in any title or file name formatted from it.
heritability_vals = np.round(np.linspace(0.1, 0.6, 6), 1)

In [12]:
from pathlib import Path
# Root directory under which the per-run plot folders are created.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
base_dir = Path("/Users/oykusuoglu/gobi/gobi_gwas/oyku/plots")

In [13]:
# Simulation sweep: for every (heritability, number of causal variants)
# combination, simulate a phenotype, run a GWAS on a 70% training split with
# an intercept plus the first k PCs as covariates, and save a QQ plot and an
# effect-size / -log10(P) plot into a per-combination folder under base_dir.

# Loop-invariant inputs, hoisted so the eigenvec file is parsed once instead
# of once per combination. The file is read as a header-less,
# whitespace-delimited table: first two columns FID/IID, the rest PC1..PCn.
pcs = pd.read_csv("data/pca/chr22_pca.eigenvec", sep=r"\s+", header=None, engine="python")
pcs.columns = ["FID","IID"] + [f"PC{i}" for i in range(1, pcs.shape[1]-1)]
k = 10  # number of leading PCs used as covariates
x = bim_kept['pos'].values  # x-axis of the per-variant plots

for heritability in heritability_vals:
    for count_causal in m:
        # Phenotype simulation. The generator is re-seeded for every
        # combination, so each run starts from the same random stream.
        rng = np.random.default_rng(42)
        idx_caus_loop = rng.choice(standardized_X.shape[1], size=count_causal, replace=False)
        # Split the heritability evenly across the causal variants.
        var_expl_loop = np.repeat(heritability/count_causal, count_causal)

        y_loop, beta_real_loop = simulate_pheno(standardized_X, idx_caus_loop, var_expl_loop, rng)

        # Merging PCs with phenotype
        pheno = fam[["fid","iid"]].copy()
        pheno.columns = ["FID","IID"]
        pheno["y"] = y_loop.reshape(-1)

        df = pheno.merge(pcs, on=["FID","IID"], how="inner", validate="one_to_one")

        # The covariates come from `df`, while genotypes and phenotype keep
        # the order of `fam`. Fail loudly if the merge dropped or reordered
        # samples, because the rows would no longer line up.
        if (df.shape[0] != pheno.shape[0]
                or not (df["IID"].to_numpy() == pheno["IID"].to_numpy()).all()):
            raise ValueError(
                "PCA table does not cover the genotype samples in the same order; "
                "covariates would be misaligned with genotypes/phenotype"
            )

        # Covariate matrix: intercept column followed by the first k PCs.
        F = np.column_stack([np.ones((df.shape[0], 1)), df[[f"PC{i}" for i in range(1, k+1)]].to_numpy()])

        # Training split (same split for every combination: fixed random_state)
        X_train, X_test, y_train, y_test, F_train, F_test = train_test_split(
            standardized_X, y_loop, F,
            test_size=0.3,
            random_state=42,
            shuffle=True
        )

        # GWAS on the training samples only
        lmm = LMM(y_train, F_train)
        lmm.process(X_train)
        pv = lmm.getPv()
        beta = lmm.getBetaSNP()
        beta_ste = lmm.getBetaSNPste()

        outdir = base_dir / f"h2_{heritability}_causal_vars{count_causal}"
        outdir.mkdir(parents=True, exist_ok=True)

        # plotting: QQ plot (qq_plot draws on the current figure)
        fig = plt.figure()
        qq_plot(pv, f'QQ Plot for heritability: {heritability} number of causal variants: {count_causal}')
        fig.tight_layout()
        fig.savefig(outdir / f"qq_h2_{heritability}_causal_vars{count_causal}.png", dpi=300, bbox_inches="tight")
        plt.close(fig)  # free the figure; many are created in this loop

        # plotting: true effect sizes (top) and GWAS -log10(P) (bottom) by position
        fig2 = plt.figure(figsize=(10, 6))
        ax1 = fig2.add_subplot(2, 1, 1)
        ax2 = fig2.add_subplot(2, 1, 2, sharex=ax1)

        ax1.set_title(f"Real effect size for heritability: {heritability} number of causal variants: {count_causal}")
        ax1.plot(x, beta_real_loop, ".k")
        ax1.set_ylabel("eff size")

        ax2.set_title(f"GWAS results for heritability: {heritability} number of causal variants: {count_causal}")
        ax2.plot(x, -np.log10(pv), ".k")
        ax2.set_ylabel(r"-log$_{10}$ P")
        ax2.set_xlabel("position")

        fig2.tight_layout()
        fig2.savefig(outdir / f"effects_gwas_h2{heritability}_causal_vars{count_causal}.png",
                    dpi=300, bbox_inches="tight")
        plt.close(fig2)


  ste = beta / z


### Phase 2: GWAS / Feature Selection

#### 3. Clumping using PLINK 

First, create a file with the SNP IDs and their corresponding p-values.

In [14]:
# Input table for PLINK clumping: one row per retained variant with its ID
# (column "SNP") and GWAS p-value (column "P").
# NOTE(review): `pv` is whatever the most recent GWAS run left in the kernel
# (the last combination of the simulation loop when cells are run in order)
# -- confirm this is the run that should be clumped.
gwas_for_clump = pd.DataFrame({
    "SNP": bim_kept["snp"].astype(str),
    "P": np.asarray(pv, dtype=float)
})
# Write undefined p-values as the literal "NA": the pandas default is an empty
# field, which disappears when the file is later split on whitespace.
gwas_for_clump.to_csv("gwas_for_clump.txt", sep="\t", index=False, na_rep="NA")