In [1]:
import gwaslab as gl
import pandas as pd

In [2]:
# 1. Load the original UKBB summary statistics into a DataFrame.
# Neale Lab UKBB v3 coordinates are on build 19 (GRCh37/hg19).
# Compression is stated explicitly as gzip for the .bgz file.
df = pd.read_csv(
    "/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_raw.gwas.imputed_v3.both_sexes.tsv.bgz",
    compression="gzip",
    sep="\t",
)

In [3]:
# Break the colon-delimited `variant` field into four new columns:
# CHR, POS, REF and ALT.
df[["CHR", "POS", "REF", "ALT"]] = df["variant"].str.split(pat=":", expand=True)
# The split produces strings, so cast the position to an integer.
df["POS"] = df["POS"].astype(int)

In [4]:
# Rename to GWASLab standards.
# ALT is treated as the effect allele (EA) and REF as the non-effect allele (NEA).
df_standard = df.rename(columns={
    'variant': 'SNPID',
    'ALT': 'EA',
    'REF': 'NEA',
    'minor_AF': 'EAF',
    'beta': 'BETA',
    'se': 'SE',
    'pval': 'P'
})

# `minor_AF` is the frequency of `minor_allele`, which is not always the ALT/effect
# allele (per the Neale Lab file description, the minor allele is REF when the ALT
# frequency exceeds 0.5). A plain rename would therefore store the wrong frequency
# for those variants. Keep the value where the minor allele is the effect allele and
# use its complement otherwise, so EAF always refers to EA.
minor_is_effect = df_standard['minor_allele'] == df_standard['EA']
df_standard['EAF'] = df_standard['EAF'].where(minor_is_effect, 1 - df_standard['EAF'])

In [5]:
# 2. Wrap the renamed DataFrame in a GWASLab Sumstats object.
# Declaring build 19 here is essential: the input coordinates are hg19.
mysumstats = gl.Sumstats(
    df_standard,
    build="19",
)

2026/01/10 15:45:01 GWASLab v4.0.4 https://cloufield.github.io/gwaslab/
2026/01/10 15:45:01 (C) 2022-2026, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com
2026/01/10 15:45:01 Python version: 3.12.3 (main, Nov  6 2025, 13:44:16) [GCC 13.3.0]
2026/01/10 15:45:01 Start to initialize gl.Sumstats from pandas DataFrame ...
2026/01/10 15:45:02  -Reading columns          : 
2026/01/10 15:45:02  -Renaming columns to      : 
2026/01/10 15:45:02  -Current Dataframe shape : 13791467  x  15
2026/01/10 15:45:02  -Initiating a status column: STATUS ...
2026/01/10 15:45:02  -Genomic coordinates are based on GRCh37/hg19...
2026/01/10 15:45:05 Start to reorder the columns ...(v4.0.4)
2026/01/10 15:45:05  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P,minor_allele,low_confidence_variant,n_complete_samples,AC,ytx,tstat
2026/01/10 15:45:05 Finished reordering the columns.
2026/01/10 15:45:06  -Trying to convert datatype for CHR: object -> Int64...Failed...
2026/01/10 15:45

In [6]:
# 3. Step one: lift coordinates over from hg19 to hg38.
# No chain file is passed here; the recorded run log reports a built-in chain file being used.
mysumstats.liftover(
    to_build="38",
    from_build="19",
)

2026/01/10 15:45:50 Start to perform liftover ...(v4.0.4)
2026/01/10 15:45:50  -Current Dataframe shape : 13791467 x 16 ; Memory usage: 1489.96 MB
2026/01/10 15:45:50 Start to fix chromosome notation (CHR) ...(v4.0.4)
2026/01/10 15:45:50  -Checking CHR data type...
2026/01/10 15:45:52  -Variants with standardized chromosome notation: 13364303
2026/01/10 15:45:53  -Variants with fixable chromosome notations: 427164
2026/01/10 15:45:53  -No unrecognized chromosome notations...
2026/01/10 15:45:53  -Identifying non-autosomal chromosomes : X, Y, and MT ...
2026/01/10 15:45:53  -Identified 427164 variants on sex chromosomes...
2026/01/10 15:45:53  -Standardizing first sex chromosome chromosome notations: X to 23...
2026/01/10 15:45:56 Finished fixing chromosome notation (CHR).
2026/01/10 15:45:56  -Dtype fixes applied successfully for requested columns.
2026/01/10 15:45:56  -Using built-in chain file: /mnt/hdd_1/ofgeha/galaxy-gwas-tools/.venv/lib/python3.12/site-packages/gwaslab/data/chains

In [8]:
# 4. Step two: harmonize against the hg38 reference FASTA,
# which aligns alleles to the hg38 reference genome.
# Only the reference sequence is supplied; all other options stay at their defaults.
mysumstats.harmonize(
    ref_seq="/mnt/hdd_1/ofgeha/gwas-sumstats-harmoniser/gwas-ref/hg38.fa"
)

2026/01/10 15:47:03 Start to check SNPID/rsID ...(v4.0.4)
2026/01/10 15:47:03  -Checking SNPID data type...
2026/01/10 15:47:03  -Converted datatype for SNPID: object -> string
2026/01/10 15:47:04  -Checking if SNPID contains NA strings :na,NA,Na,Nan,NaN,<NA>,null,NULL,#N/A,#VALUE!,N/A,n/a,missing,...
2026/01/10 15:47:05  -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)
2026/01/10 15:47:15 Finished checking SNPID/rsID.
2026/01/10 15:47:15 Start to fix chromosome notation (CHR) ...(v4.0.4)
2026/01/10 15:47:15  -Checking CHR data type...
2026/01/10 15:47:19  -Variants with standardized chromosome notation: 13791467
2026/01/10 15:47:19  -All CHR are already fixed...
2026/01/10 15:47:21 Finished fixing chromosome notation (CHR).
2026/01/10 15:47:21 Start to fix basepair positions (POS) ...(v4.0.4)
2026/01/10 15:47:21  -Trying to convert datatype for POS: int64 -> Int64...
2026/01/10 15:47:22  -Position bound:(0 , 250,000,000)
2026/01/10 15:47:22  -No outlier variants were remov

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P,minor_allele,low_confidence_variant,n_complete_samples,AC,ytx,tstat
0,1:69487:G:A,1,69487,A,G,1960099,0.000006,-2.715450,2.360060,0.249902,A,True,359983,4.15294,103.515,-1.150590
1,1:69569:T:C,1,69569,C,T,1960099,0.000188,-0.484284,0.423462,0.252778,C,True,359983,135.27800,3644.560,-1.143630
2,1:139853:C:T,1,139853,T,C,1960099,0.000006,-2.703560,2.360130,0.251997,T,True,359983,4.09020,101.840,-1.145510
3,1:692794:CA:C,1,692794,C,CA,1960399,0.110590,-0.016436,0.019585,0.401342,C,False,359983,79621.10000,2179000.000,-0.839228
4,1:693731:A:G,1,693731,G,A,1960099,0.115767,-0.004255,0.018507,0.818155,G,False,359983,83348.00000,2281760.000,-0.229918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13779880,X:154929412:C:T,23,154929412,T,C,1960099,0.245473,-0.016260,0.010723,0.129427,T,False,359983,176732.00000,4837480.000,-1.516370
13779881,X:154929637:CT:C,23,154929637,C,CT,1960399,0.229726,-0.027098,0.011190,0.015456,C,False,359983,165395.00000,4525550.000,-2.421510
13779882,X:154929952:CAA:C,23,154929952,C,CAA,1960399,0.239430,-0.020494,0.011278,0.069202,C,False,359983,172381.00000,4717910.000,-1.817100
13779883,X:154930230:A:G,23,154930230,G,A,1960099,0.245887,-0.016347,0.010721,0.127334,G,False,359983,177030.00000,4845680.000,-1.524700


In [9]:
# 5. Step three: assign rsIDs from the reference table that GWASLab
# resolves for the key "1kg_dbsnp151_hg38_auto".
# NOTE(review): the recorded run annotated only 141189 of 13638696 variants, and the
# table shown under the harmonize cell still has POS equal to the coordinate embedded
# in the original SNPID with STATUS values starting with 19 -- confirm that the hg38
# liftover actually updated the data before trusting these matches.
# NOTE(review): the key ends in "_auto", presumably autosomes only, while the data
# contains X-chromosome variants -- verify coverage.
mysumstats.assign_rsid(ref_rsid_tsv=gl.get_path("1kg_dbsnp151_hg38_auto"))

2026/01/10 15:50:16 Start to assign rsID using reference file ...(v4.0.4)
2026/01/10 15:50:16  -Number of threads/cores to use: 1
2026/01/10 15:50:17  -13779885 rsID could be possibly fixed...
2026/01/10 15:50:17  -Setting block size:  5000000
2026/01/10 15:50:17  -Loading block: 0   1   2   3   4   5   6   7   8   9   10   11   12   13   
2026/01/10 15:57:23  -rsID annotation for 13638696 variants needed to be fixed!
2026/01/10 15:57:23  -Annotated 141189 rsID successfully!
2026/01/10 15:57:24  -Current Dataframe shape : 13779885 x 17 ; Memory usage: 1582.28 MB
2026/01/10 15:57:24 Finished assign rsID using reference file.


Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P,minor_allele,low_confidence_variant,n_complete_samples,AC,ytx,tstat,rsID
0,1:69487:G:A,1,69487,A,G,1960099,0.000006,-2.715450,2.360060,0.249902,A,True,359983,4.15294,103.515,-1.150590,
1,1:69569:T:C,1,69569,C,T,1960099,0.000188,-0.484284,0.423462,0.252778,C,True,359983,135.27800,3644.560,-1.143630,
2,1:139853:C:T,1,139853,T,C,1960099,0.000006,-2.703560,2.360130,0.251997,T,True,359983,4.09020,101.840,-1.145510,rs533633326
3,1:692794:CA:C,1,692794,C,CA,1960399,0.110590,-0.016436,0.019585,0.401342,C,False,359983,79621.10000,2179000.000,-0.839228,
4,1:693731:A:G,1,693731,G,A,1960099,0.115767,-0.004255,0.018507,0.818155,G,False,359983,83348.00000,2281760.000,-0.229918,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13779880,X:154929412:C:T,23,154929412,T,C,1960099,0.245473,-0.016260,0.010723,0.129427,T,False,359983,176732.00000,4837480.000,-1.516370,
13779881,X:154929637:CT:C,23,154929637,C,CT,1960399,0.229726,-0.027098,0.011190,0.015456,C,False,359983,165395.00000,4525550.000,-2.421510,
13779882,X:154929952:CAA:C,23,154929952,C,CAA,1960399,0.239430,-0.020494,0.011278,0.069202,C,False,359983,172381.00000,4717910.000,-1.817100,
13779883,X:154930230:A:G,23,154930230,G,A,1960099,0.245887,-0.016347,0.010721,0.127334,G,False,359983,177030.00000,4845680.000,-1.524700,


In [10]:
# 6. Step four: final liftover from hg38 to T2T (CHM13).
# The chain file is a local download, resolved relative to the working directory.
mysumstats.liftover(
    chain_path="./grch38-chm13v2.chain",
    to_build="13",
    from_build="38",
)

2026/01/10 15:58:25 Start to perform liftover ...(v4.0.4)
2026/01/10 15:58:25  -Using provided chain file: ./grch38-chm13v2.chain
2026/01/10 15:58:27  -Converting variants with status code xxx0xxx: 13,779,885
2026/01/10 15:58:27  -Target build: 13
2026/01/10 15:58:27  -Input positions are 1-based
2026/01/10 15:58:27  -Output positions will be 1-based
2026/01/10 15:59:56  -Mapped: 13640038 variants
2026/01/10 15:59:56  -Unmapped: 139847 variants
2026/01/10 15:59:56  -Examples of unmapped variants:
2026/01/10 15:59:56    SNPID=1:69487:G:A | CHR=1 | POS=69487 | STATUS=1960099
2026/01/10 15:59:56    SNPID=1:69569:T:C | CHR=1 | POS=69569 | STATUS=1960099
2026/01/10 15:59:56    SNPID=1:139853:C:T | CHR=1 | POS=139853 | STATUS=1960099
2026/01/10 15:59:56    SNPID=1:905017:T:C | CHR=1 | POS=905017 | STATUS=1960099
2026/01/10 15:59:56    SNPID=1:909221:T:C | CHR=1 | POS=909221 | STATUS=1960099
2026/01/10 16:00:07  -Removed 139847 unmapped variants
2026/01/10 16:00:07 Start to fix chromosome not