In [1]:
import numpy as np
from pandas_plink import read_plink
from limix_lmm import LMM
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
import seaborn as sns

In [2]:
import scipy
import scipy.stats as st

# Compatibility shim: expose a handful of NumPy functions under the top-level
# `scipy` namespace, but only where SciPy does not already provide the name.
# NOTE(review): presumably needed by a dependency (limix_lmm?) that still calls
# scipy.dot / scipy.sqrt etc. -- confirm which import actually requires this.
for _alias in ("dot", "einsum", "log", "sign", "sqrt"):
    if not hasattr(scipy, _alias):
        setattr(scipy, _alias, getattr(np, _alias))
del _alias

In [3]:
%%bash
# Abort on the first failing command, unset variable, or broken pipeline stage.
set -euo pipefail

RAW_BFILE="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/raw/ALL.chr22_GRCh38.genotypes.20170504"
OUTDIR_PC="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc"

# Output prefixes shared by the two PLINK runs below.
PRUNE_PREFIX="$OUTDIR_PC/chr22_raw_prune"
PCA_PREFIX="$OUTDIR_PC/chr22_raw_pca10"

mkdir -p "$OUTDIR_PC"

#### LD pruning on RAW variants (writes: .prune.in, .prune.out)
#### Window of 200 variants, shifted 50 at a time, r^2 threshold 0.2.
plink --bfile "$RAW_BFILE" \
  --indep-pairwise 200 50 0.2 \
  --out "$PRUNE_PREFIX"

#### PCA (top 10 components) restricted to the pruned-in variants
#### (writes: .eigenvec, .eigenval)
plink --bfile "$RAW_BFILE" \
  --extract "${PRUNE_PREFIX}.prune.in" \
  --pca 10 \
  --out "$PCA_PREFIX"

PLINK v1.9.0-b.7.11 64-bit (19 Aug 2025)           cog-genomics.org/plink/1.9/
(C) 2005-2025 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_prune.log.
Options in effect:
  --bfile /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/raw/ALL.chr22_GRCh38.genotypes.20170504
  --indep-pairwise 200 50 0.2
  --out /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_prune

16384 MB RAM detected; reserving 8192 MB for main workspace.
109827 variants loaded from .bim file.
2504 people (0 males, 0 females, 2504 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_prune.nosex
.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2504 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455



Using up to 9 threads (change this with --threads).
Before main variant filters, 2504 founders and 0 nonfounders present.
Calculating allele frequencies... 111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999756.
7552 variants and 2504 people pass filters and QC.
Note: No phenotypes present.
Relationship matrix calculation complete.
--pca: Results saved to and eigenvectors]
/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_pca10.eigenval
and
/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_pca10.eigenvec
.


In [6]:
%%bash
# Abort on the first failing command, unset variable, or broken pipeline stage.
set -euo pipefail

RAW_BFILE="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/raw/ALL.chr22_GRCh38.genotypes.20170504"
OUTDIR_QC="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc"

# One output prefix per pipeline stage; each stage reads the previous stage's fileset.
STEP1="$OUTDIR_QC/chr22_step1_common"
STEP2="$OUTDIR_QC/chr22_step2_nodup"
STEP3="$OUTDIR_QC/chr22_step3_sampleqc"
DUP_PREFIX="$OUTDIR_QC/dupcheck"

mkdir -p "$OUTDIR_QC"

#### Stage 0: allele-frequency and missingness reports on the RAW data (no filtering)
plink --bfile "$RAW_BFILE" \
  --freq --missing \
  --out "$OUTDIR_QC/initial_qc"

#### Stage 1: variant-level filters -> new BED/BIM/FAM
####   --snps-only : drop non-SNP variants
####   --geno 0.02 : drop variants with > 2% missing calls
####   --maf 0.01  : drop variants with minor allele frequency < 1%
####   --hwe 1e-6  : drop variants failing the Hardy-Weinberg test at p < 1e-6
plink --bfile "$RAW_BFILE" \
  --snps-only \
  --geno 0.02 \
  --maf 0.01 \
  --hwe 1e-6 \
  --make-bed \
  --out "$STEP1"

#### Stage 2a: list duplicate variants. PLINK groups variants that share the same
#### position and allele codes; 'suppress-first' leaves the first member of each
#### group out of the list, 'ids-only' writes bare IDs usable with --exclude.
plink --bfile "$STEP1" \
  --list-duplicate-vars ids-only suppress-first \
  --out "$DUP_PREFIX"

# Number of variant IDs that will be dropped in the next stage.
wc -l "${DUP_PREFIX}.dupvar"

#### Stage 2b: remove the listed duplicates (safe even if the list is empty;
#### plink will just exclude none)
plink --bfile "$STEP1" \
  --exclude "${DUP_PREFIX}.dupvar" \
  --make-bed \
  --out "$STEP2"

#### Stage 3: sample-level missingness -- drop samples with > 2% missing calls
plink --bfile "$STEP2" \
  --mind 0.02 \
  --make-bed \
  --out "$STEP3"

PLINK v1.9.0-b.7.11 64-bit (19 Aug 2025)           cog-genomics.org/plink/1.9/
(C) 2005-2025 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc/initial_qc.log.
Options in effect:
  --bfile /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/raw/ALL.chr22_GRCh38.genotypes.20170504
  --freq
  --missing
  --out /Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc/initial_qc

16384 MB RAM detected; reserving 8192 MB for main workspace.
109827 variants loaded from .bim file.
2504 people (0 males, 0 females, 2504 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc/initial_qc.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2504 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626

In [None]:
%%bash
# Abort on the first failing command, unset variable, or broken pipeline stage.
set -euo pipefail

OUTDIR_QC="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc"
OUTDIR_PC="/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc"

# Both ID lists are extracted with the same whitespace-agnostic awk program so
# that they are formatted identically ("FID IID") whether the input file is
# space- or tab-delimited. `cut -d' '` would pass tab-delimited lines through
# whole, making every sample look like it is missing from the PCA.

# IDs in final QC dataset
awk '{print $1, $2}' "$OUTDIR_QC/chr22_step3_sampleqc.fam" | sort > "$OUTDIR_QC/final_ids.txt"

# IDs in PCA eigenvec
awk '{print $1, $2}' "$OUTDIR_PC/chr22_raw_pca10.eigenvec" | sort > "$OUTDIR_PC/pca_ids.txt"

# Show any samples present in the final QC set but missing from the PCA
# (empty output means every QC-passing sample has PCs).
comm -23 "$OUTDIR_QC/final_ids.txt" "$OUTDIR_PC/pca_ids.txt"

Standardize PCs

In [None]:
import pandas as pd
import numpy as np

# PLINK --pca output: no header row, whitespace-delimited, columns are
# FID, IID, then one column per principal component.
eigenvec = "/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_pca10.eigenvec"

# Read the two ID columns as strings: letting pandas infer a numeric dtype would
# rewrite IDs such as "0012" as 12 and break later joins against the .fam file.
df = pd.read_csv(eigenvec, sep=r"\s+", header=None, dtype={0: str, 1: str})
ncols = df.shape[1]
pc_cols = list(range(2, ncols))  # columns 2..end are PCs
if not pc_cols:
    raise ValueError(f"No PC columns found in {eigenvec}")

# z-score each PC across individuals (population SD, ddof=0)
pcs = df.iloc[:, pc_cols]
pc_sd = pcs.std(axis=0, ddof=0)
if not (pc_sd > 0).all():
    # A constant (or all-NaN) PC would otherwise silently turn into NaN/inf.
    raise ValueError("Cannot z-score: at least one PC has zero or undefined variance")
pcs_z = (pcs - pcs.mean(axis=0)) / pc_sd

# Reassemble ID columns + standardized PCs, then give the columns readable names.
df_z = pd.concat([df.iloc[:, :2], pcs_z], axis=1)
df_z.columns = ["FID", "IID"] + [f"PC{i}" for i in range(1, len(pc_cols)+1)]

# Append the suffix rather than str.replace(), which would also rewrite any
# ".eigenvec" occurring earlier in the path.
out = eigenvec + ".zscore"
df_z.to_csv(out, sep="\t", index=False)

out

'/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/chr22_raw_pca10.eigenvec.zscore'

In [11]:
# Sanity check on the standardized PCs: every column should show mean 0 and
# (population, ddof=0) standard deviation 1 after rounding.
pc_names = [f"PC{i}" for i in range(1, 11)]
pc_zscores = df_z[pc_names]
pc_zscores.mean().round(6), pc_zscores.std(ddof=0).round(6)


(PC1     0.0
 PC2    -0.0
 PC3     0.0
 PC4     0.0
 PC5     0.0
 PC6    -0.0
 PC7    -0.0
 PC8     0.0
 PC9     0.0
 PC10    0.0
 dtype: float64,
 PC1     1.0
 PC2     1.0
 PC3     1.0
 PC4     1.0
 PC5     1.0
 PC6     1.0
 PC7     1.0
 PC8     1.0
 PC9     1.0
 PC10    1.0
 dtype: float64)

Check what the data looks like

In [13]:
# Prefix (no extension) of the PLINK fileset written by the final sample-QC
# step (`--mind 0.02`) of the QC cell.
bfile = '/Users/oykusuoglu/gobi/gobi_gwas/oyku/data/pca/before_qc/qc/chr22_step3_sampleqc'
# read_plink returns three objects, unpacked here as variant table, sample
# table and genotypes.
# NOTE(review): per the pandas-plink docs, bim/fam are DataFrames and G is a
# lazy dask array -- confirm against the installed version.
bim, fam, G = read_plink(bfile)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 39.29it/s]
top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



In [15]:
# Materialize the genotype array in memory and transpose it.
# NOTE(review): presumably G is (variants, samples), making X_real
# (samples, variants) -- confirm with X_real.shape.
# NOTE(review): calls that are still missing after QC presumably come back as
# NaN; verify they are imputed or dropped before any model fitting.
X_real = G.compute().T
# Display the array (NumPy abbreviates the repr for large arrays).
X_real

array([[2., 2., 2., ..., 2., 1., 2.],
       [1., 2., 2., ..., 2., 1., 2.],
       [2., 2., 2., ..., 1., 2., 2.],
       ...,
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 1., ..., 1., 2., 2.],
       [1., 2., 1., ..., 1., 2., 2.]], dtype=float32)