# Extracting GBA carriers

Done at the request of Jeff Kim, and as practice in working with UK Biobank (UKBB) data.

## 0) When starting the Jupyter instance, choose one with 1 TB of storage and 8 cores
## 1) Download plink exome sequencing data for chr1, chr14, chr19

In [None]:
# plink1 files
#!dx ls Bulk/Genotype\ Results/Genotype\ calls  # you don't want imputed genotypes - you want exome files instead
!dx ls Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ PLINK\ format\ -\ final\ release/

In [None]:
!dx download Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ PLINK\ format\ -\ final\ release/ukb23158_c1_b0_v1*
!dx download Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ PLINK\ format\ -\ final\ release/ukb23158_c14_b0_v1*
!dx download Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ PLINK\ format\ -\ final\ release/ukb23158_c19_b0_v1*

## 2) Use script to extract E326K, T369M, and N370S carriers

In [None]:
import pandas as pd
import numpy as np
import os

!echo -e '1:155235843:T:C\n1:155236246:G:A\n1:155236376:C:T' > GBA1_risk_variants.snplist
#!echo -e 'rs76763715\nrs75548401\nrs2230288' > GBA1_risk_variants.snplist

In [None]:
# Download plink2
!wget https://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_latest.zip
!unzip plink2_linux_x86_64_latest.zip
!chmod +x plink2

!dx upload plink2 --destination ./

In [None]:
# ONLY NEEDED WHEN USING GENOTYPE DATA
# Generate a dummy .fam file with placeholder sample metadata (plink needs a .fam next to the
# .bed/.bim to run the variant-extraction command).
# NOTE(review): `struct` and `bed_file` are not used anywhere in this cell.
import struct

# Paths
bed_file = "ukb22418_c1_b0_v2.bed"
fam_file = "ukb22418_c1_b0_v2.fam"

# The .bim file is only used to count lines; this cell never reads an existing .fam.
# WARNING: fam_file is opened for writing further down, so an existing .fam is overwritten.
bim_file = "ukb22418_c1_b0_v2.bim"

# Count number of lines in .bim (this is the number of VARIANTS, not the number of samples)
with open(bim_file) as f:
    n_variants = sum(1 for _ in f)

# The number of samples has to be supplied by you; it is not derived from the .bed here.
# If you have a .fam and do not know the sample count, you can use
# plink --bfile ... --write-samples to get it.
#!./plink2 --bfile ukb22418_c1_b0_v2 --write-samples

# NOTE(review): N is set to the variant count purely as a placeholder. Unless this is replaced
# with the true sample count, the dummy .fam will presumably not match the .bed - confirm
# before running plink on it.
N = n_variants  # <-- replace with your number of samples

# Generate the dummy .fam: one line per sample, IDs numbered from 1 to N
with open(fam_file, "w") as f:
    for i in range(1, N + 1):
        # FID IID PAT MAT SEX PHENOTYPE
        line = f"FAM{i} IND{i} 0 0 0 -9\n"
        f.write(line)

print(f"Dummy .fam file created with {N} samples.")


In [None]:
# Checking file integrity
!ls -lh ukb22418_c1_b0_v2.bed
!md5sum ukb22418_c1_b0_v2.bed  # if a checksum is provided

# Hash codes are available here: https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=998

In [None]:
!./plink2 --bfile ukb23158_c1_b0_v1 \
         --extract GBA1_risk_variants.snplist \
         --export A \
         --out GBA1_risk_variants_raw \
         --no-categorical

In [None]:
# Alternative below, using bgen files:
# !./plink2 --bfile ukb22418_c1_b0_v2 --export bgen-1.2 --out ukb22418_c1_b0_v2

# !./plink2 \
#   --bgen ukb22418_c1_b0_v2.bgen ref-first \
#   --sample ukb22418_c1_b0_v2.sample \
#   --extract GBA1_risk_variants.snplist \
#   --export A \
#   --out GBA1_risk_variants_raw

In [None]:
# If both attempts above failed, use plink1 (1.9) instead
# !wget https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20250819.zip
# !unzip plink_linux_x86_64_20250819.zip
# !chmod +x plink

In [None]:
# IMPORTANT - Don't forget to save your result into permanent storage!
!dx upload GBA1_risk_variants_raw.* --destination ./

In [None]:
# Visualize head of .bed and .bim files, and of result
#!head ukb23158_c1_b0_v1.bed # lines are too long
!head ukb23158_c1_b0_v1.bim
!head GBA1_risk_variants_raw.raw

In [None]:
# Defining a function to extract information (by Jeff Kim)
def extract_alternate_carriers_vectorized(raw_file, include_count=False):
    """
    Extract the samples that carry at least one alternate allele from a PLINK2
    additive-coded .raw file (``--export A`` / ``--recode A``).

    The function assumes the genotype columns hold the count of the REFERENCE
    allele (column headers of the form ``<variant ID>_<REF>``), so any value
    below 2 means the sample carries at least one alternate allele. Missing
    genotypes (NA) never compare below 2 and are therefore not reported.

    Parameters:
    -----------
    raw_file : str or file-like
        Path to (or open handle on) the whitespace-delimited .raw file. The
        first six columns are expected to be sample metadata
        (FID IID PAT MAT SEX PHENOTYPE); every later column is a variant.
    include_count : bool, default=False
        If True, output will include a COUNT column with allele counts

    Returns:
    --------
    pd.DataFrame
        One row per carrier, with columns:
        - IID: Sample identifier
        - VARID: Comma-separated list of variants with alternate alleles
        - COUNT: (Only if include_count=True) Comma-separated list of the
                 genotype values for each variant in VARID, in the same order
                 0 = Homozygous alternate, 1 = Heterozygous
        Samples without any alternate allele are omitted. If an IID occurs on
        several rows, the last carrier row for that IID is the one reported.
    """
    df = pd.read_csv(raw_file, sep=r'\s+')

    # Everything after the six metadata columns is a genotype column
    variant_cols = df.columns[6:]

    # Strip only the trailing "_<allele>" suffix. Splitting from the right keeps
    # variant IDs that themselves contain underscores (e.g. chr1_100_A_G) intact.
    clean_variants = np.array([var.rsplit('_', 1)[0] for var in variant_cols])

    genotype_array = df[variant_cols].values
    iids = df['IID'].values

    # True wherever fewer than two counted (reference) alleles are present
    alt_allele_mask = genotype_array < 2

    # Only visit rows with at least one carrier call instead of every sample
    carrier_rows = np.where(alt_allele_mask.any(axis=1))[0]

    # Keyed by IID so a repeated IID collapses to a single output row
    variants_by_iid = {}
    counts_by_iid = {}
    for row in carrier_rows:
        alt_indices = np.where(alt_allele_mask[row])[0]
        iid = iids[row]
        variants_by_iid[iid] = ','.join(clean_variants[alt_indices])
        if include_count:
            counts_by_iid[iid] = ','.join(
                _format_allele_count(value)
                for value in genotype_array[row, alt_indices]
            )

    if include_count:
        return pd.DataFrame({
            'IID': list(variants_by_iid),
            'VARID': list(variants_by_iid.values()),
            'COUNT': [counts_by_iid[iid] for iid in variants_by_iid],
        })

    return pd.DataFrame(list(variants_by_iid.items()), columns=['IID', 'VARID'])


def _format_allele_count(value):
    """Render one genotype value as text, writing whole numbers without '.0'.

    pandas stores a column as float as soon as it contains a missing call, which
    would otherwise turn a count of 1 into the string "1.0".
    """
    number = float(value)
    return str(int(number)) if number.is_integer() else str(value)

In [None]:
# All carriers of at least one of the three GBA1 variants, with per-variant genotype values
GBA1_carriers = extract_alternate_carriers_vectorized("GBA1_risk_variants_raw.raw", include_count=True)

# VARID is a comma-separated list, so a subject can carry E326K together with another
# variant. Use one substring mask and its negation so the two groups never overlap:
# an exact `!=` comparison would also put "other variant + E326K" carriers in the
# noE326K group. regex=False: the variant ID is a literal, not a pattern.
carries_E326K = GBA1_carriers['VARID'].str.contains('1:155236376:C:T', regex=False)
GBA1_E326K_carriers = GBA1_carriers[carries_E326K]
GBA1_noE326K_carriers = GBA1_carriers[~carries_E326K]

In [None]:
GBA1_carriers

In [None]:
GBA1_E326K_carriers

In [None]:
GBA1_noE326K_carriers

## 3) Use cohort browser to establish EUR PD cases

In [None]:
# First, see what is the information in the .fam files
!head ukb23158_c1_b0_v1.fam # most probably just sex information

In [None]:
# In cohort browser:
# Option 1: Health related outcomes -> First occurrences -> Nervous system disorders -> Date G20 first reported (parkinson's disease)
# Option 2 (better): Health related outcomes -> First occurrences -> Nervous system disorders -> Source of report of G20 (parkinson's disease)

# Include everything EXCEPT "Self-report only" and missing values (there should be around 4,538 participants with PD)

# Then, download subject ids and upload here

## 4) Find overlaps between #2 and #3 to identify GBA-PD cases

In [None]:
import pandas as pd

PD_patients = pd.read_csv("./UKBB_participant_IDs_w_PD.csv")
PD_patients

In [None]:
GBA1_PD_carriers = GBA1_carriers[GBA1_carriers['IID'].isin(PD_patients['Participant ID'])]
GBA1_PD_carriers

In [None]:
GBA1_PD_E326K_carriers = GBA1_E326K_carriers[GBA1_E326K_carriers['IID'].isin(PD_patients['Participant ID'])]
GBA1_PD_E326K_carriers

In [None]:
GBA1_PD_noE326K_carriers = GBA1_noE326K_carriers[GBA1_noE326K_carriers['IID'].isin(PD_patients['Participant ID'])]
GBA1_PD_noE326K_carriers

## 5) Use plink2 --freq to get allele frequencies (with observation counts) of 14:74480701:C:T and 19:3548233:A:G (NPC2 and MFSD12 variants)
Goal: check whether the MFSD12 and NPC2 variants show different signals in E326K carriers versus carriers of other GBA1 variants.

In [None]:
!./plink2 --bfile ukb23158_c14_b0_v1 --keep GBA1_risk_variants_raw.raw --snp "14:74480701:C:T" --freq --out GBA_NPC2_carriers --no-categorical
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --keep GBA1_risk_variants_raw.raw --snp "19:3548233:A:G" --freq --out GBA_MFSD12_carriers --no-categorical

In [None]:
!head GBA_NPC2_carriers.afreq
!head GBA_MFSD12_carriers.afreq

In [None]:
# Do the same, but for PD subjects only: all GBA, E326K and no E326K

# # First, save all PD subjects with stratified GBA
# GBA1_PD_carriers.to_csv("GBA1_PD_carriers.txt", sep="\t", index=False)
# !dx upload GBA1_PD_carriers.txt

# GBA1_PD_E326K_carriers.to_csv("GBA1_PD_E326K_carriers.txt", sep="\t", index=False)
# !dx upload GBA1_PD_E326K_carriers.txt

# GBA1_PD_noE326K_carriers.to_csv("GBA1_PD_noE326K_carriers.txt", sep="\t", index=False)
# !dx upload GBA1_PD_noE326K_carriers.txt

# GBA1_PD_carriers
keep_df = pd.DataFrame({
    "FID": GBA1_PD_carriers["IID"],
    "IID": GBA1_PD_carriers["IID"]
})
keep_df.to_csv("GBA1_PD_subjects.txt", sep="\t", index=False, header=False)
!dx upload GBA1_PD_subjects.txt

# GBA1_PD_E326K_carriers
keep_df = pd.DataFrame({
    "FID": GBA1_PD_E326K_carriers["IID"],
    "IID": GBA1_PD_E326K_carriers["IID"]
})
keep_df.to_csv("GBA1_PD_E326K_subjects.txt", sep="\t", index=False, header=False)
!dx upload GBA1_PD_E326K_subjects.txt

# GBA1_PD_noE326K_carriers
keep_df = pd.DataFrame({
    "FID": GBA1_PD_noE326K_carriers["IID"],
    "IID": GBA1_PD_noE326K_carriers["IID"]
})
keep_df.to_csv("GBA1_PD_noE326K_subjects.txt", sep="\t", index=False, header=False)
!dx upload GBA1_PD_noE326K_subjects.txt

In [None]:
# Check if IDs match
!head ukb23158_c14_b0_v1.fam
!head GBA1_PD_subjects.txt

In [None]:
## PD, all GBA
!./plink2 --bfile ukb23158_c14_b0_v1 --keep GBA1_PD_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_PD_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --keep GBA1_PD_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_PD_subjects_MFSD12 --no-pheno

## PD, E326K only
!./plink2 --bfile ukb23158_c14_b0_v1 --keep GBA1_PD_E326K_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_PD_E326K_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --keep GBA1_PD_E326K_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_PD_E326K_subjects_MFSD12 --no-pheno

## PD, no E326K
!./plink2 --bfile ukb23158_c14_b0_v1 --keep GBA1_PD_noE326K_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_PD_noE326K_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --keep GBA1_PD_noE326K_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_PD_noE326K_subjects_MFSD12 --no-pheno

In [None]:
!head GBA_PD_subjects_NPC2.afreq
!head GBA_PD_subjects_MFSD12.afreq

In [None]:
!head GBA_PD_E326K_subjects_NPC2.afreq
!head GBA_PD_E326K_subjects_MFSD12.afreq

In [None]:
!head GBA_PD_noE326K_subjects_NPC2.afreq
!head GBA_PD_noE326K_subjects_MFSD12.afreq

In [None]:
!dx upload *.afreq ./

## 6) Do the same for non-PD carriers

In [None]:
## No PD-GBA
!./plink2 --bfile ukb23158_c14_b0_v1 --remove GBA1_PD_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_Ctrl_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --remove GBA1_PD_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_Ctrl_subjects_MFSD12 --no-pheno

## No PD-GBA-E326K
!./plink2 --bfile ukb23158_c14_b0_v1 --remove GBA1_PD_E326K_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_Ctrl_E326K_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --remove GBA1_PD_E326K_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_Ctrl_E326K_subjects_MFSD12 --no-pheno

## No PD-GBA-noE326K
!./plink2 --bfile ukb23158_c14_b0_v1 --remove GBA1_PD_noE326K_subjects.txt --snp "14:74480701:C:T" --freq --out GBA_Ctrl_noE326K_subjects_NPC2 --no-pheno
!echo 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
!./plink2 --bfile ukb23158_c19_b0_v1 --remove GBA1_PD_noE326K_subjects.txt --snp "19:3548233:A:G" --freq --out GBA_Ctrl_noE326K_subjects_MFSD12 --no-pheno

In [None]:
!head GBA_Ctrl_subjects_NPC2.afreq
!head GBA_Ctrl_subjects_MFSD12.afreq

In [None]:
!head GBA_Ctrl_E326K_subjects_NPC2.afreq
!head GBA_Ctrl_E326K_subjects_MFSD12.afreq

In [None]:
!head GBA_Ctrl_noE326K_subjects_NPC2.afreq
!head GBA_Ctrl_noE326K_subjects_MFSD12.afreq

In [None]:
# The approach above still included some non-GBA PD cases. Have to go back a few steps.
# Leverage the following function, to exclude all subjects with PD from complete list of UKBB subjects:
#GBA1_Ctrl_carriers = GBA1_carriers[~GBA1_carriers['IID'].isin(PD_patients['Participant ID'])]