### Liftover

In [23]:
import gwaslab as gl
import pandas as pd

In [24]:
gl.show_version()

2026/01/11 13:46:52 GWASLab v4.0.4 https://cloufield.github.io/gwaslab/
2026/01/11 13:46:52 (C) 2022-2026, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com
2026/01/11 13:46:52 Python version: 3.12.3 (main, Nov  6 2025, 13:44:16) [GCC 13.3.0]


In [25]:
# Load the dataset using the confirmed headers
mysumstats = gl.Sumstats(
    "/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_raw.gwas.imputed_v3.both_sexes.tsv.bgz",
    snpid="variant",           
    ea="minor_allele",         
    neaf="minor_AF",           
    beta="beta",
    se="se",
    p="pval",
    verbose=True,
    readargs={
        "compression": "gzip",
        "sep": "\t"
    }
)

2026/01/11 13:46:52 GWASLab v4.0.4 https://cloufield.github.io/gwaslab/
2026/01/11 13:46:52 (C) 2022-2026, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com
2026/01/11 13:46:52 Python version: 3.12.3 (main, Nov  6 2025, 13:44:16) [GCC 13.3.0]
2026/01/11 13:46:52 Start to initialize gl.Sumstats from file :/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_raw.gwas.imputed_v3.both_sexes.tsv.bgz
2026/01/11 13:47:21  -Reading columns          : beta,se,minor_allele,minor_AF,variant,pval
2026/01/11 13:47:21  -Renaming columns to      : BETA,SE,EA,EAF,SNPID,P
2026/01/11 13:47:21  -Current Dataframe shape : 13791467  x  6
2026/01/11 13:47:22  -Initiating a status column: STATUS ...
2026/01/11 13:47:23  -NEAF is specified...
2026/01/11 13:47:23  -Checking if 0<= NEAF <=1 ...
2026/01/11 13:47:25  -Converted NEAF to EAF.
2026/01/11 13:47:25 Start to reorder the columns ...(v4.0.4)
2026/01/11 13:47:25  -Reordering columns to    : SNPID,EA,STATUS,EAF,BETA,SE,P
2026/01/11 13:47:25 Finished reo

In [26]:
mysumstats.data

Unnamed: 0,SNPID,EA,STATUS,EAF,BETA,SE,P
0,1:15791:C:T,T,9999999,1.000000,894.616000,1204.870000,0.457786
1,1:69487:G:A,A,9999999,0.999994,-2.715450,2.360060,0.249902
2,1:69569:T:C,C,9999999,0.999812,-0.484284,0.423462,0.252778
3,1:139853:C:T,T,9999999,0.999994,-2.703560,2.360130,0.251997
4,1:692794:CA:C,C,9999999,0.889410,-0.016436,0.019585,0.401342
...,...,...,...,...,...,...,...
13791462,X:154929412:C:T,T,9999999,0.754527,-0.016260,0.010723,0.129427
13791463,X:154929637:CT:C,C,9999999,0.770274,-0.027098,0.011190,0.015456
13791464,X:154929952:CAA:C,C,9999999,0.760570,-0.020494,0.011278,0.069202
13791465,X:154930230:A:G,G,9999999,0.754113,-0.016347,0.010721,0.127334


In [27]:
# 1. Manually split the SNPID string into separate columns
# This splits "1:69487:G:A" into 4 columns
split_cols = mysumstats.data['SNPID'].str.split(':', expand=True)

In [28]:
# 2. Assign the split values to the expected column names
mysumstats.data['CHR'] = split_cols[0]
mysumstats.data['POS'] = split_cols[1].astype(int)  # Liftover needs POS as an integer
mysumstats.data['NEA'] = split_cols[2]

In [29]:
# 3. Verify the columns are actually there now
print("Actual columns in dataframe:", mysumstats.data.columns)
print(mysumstats.data[['SNPID', 'CHR', 'POS']].head())

Actual columns in dataframe: Index(['SNPID', 'EA', 'STATUS', 'EAF', 'BETA', 'SE', 'P', 'CHR', 'POS', 'NEA'], dtype='object')
           SNPID CHR     POS
0    1:15791:C:T   1   15791
1    1:69487:G:A   1   69487
2    1:69569:T:C   1   69569
3   1:139853:C:T   1  139853
4  1:692794:CA:C   1  692794


### Liftover

In [30]:
# 4. Now run Liftover
# Now that 'CHR' and 'POS' are physically in the index, liftover will find them.
# mysumstats.liftover(from_build="19", to_build="38")
mysumstats.liftover(from_build="19", 
                    to_build="38",
                    remove=True)

2026/01/11 13:47:58 Start to perform liftover ...(v4.0.4)
2026/01/11 13:47:58  -Current Dataframe shape : 13791467 x 10 ; Memory usage: 1106.08 MB
2026/01/11 13:47:58 Start to fix chromosome notation (CHR) ...(v4.0.4)
2026/01/11 13:47:58  -Checking CHR data type...
2026/01/11 13:47:59  -Variants with standardized chromosome notation: 13364303
2026/01/11 13:48:00  -Variants with fixable chromosome notations: 427164
2026/01/11 13:48:00  -No unrecognized chromosome notations...
2026/01/11 13:48:01  -Identifying non-autosomal chromosomes : X, Y, and MT ...
2026/01/11 13:48:01  -Identified 427164 variants on sex chromosomes...
2026/01/11 13:48:01  -Standardizing first sex chromosome chromosome notations: X to 23...
2026/01/11 13:48:04 Finished fixing chromosome notation (CHR).
2026/01/11 13:48:04  -Dtype fixes applied successfully for requested columns.
2026/01/11 13:48:04  -Using built-in chain file: /mnt/hdd_1/ofgeha/galaxy-gwas-tools/.venv/lib/python3.12/site-packages/gwaslab/data/chains

In [31]:
mysumstats.data

Unnamed: 0,SNPID,EA,STATUS,EAF,BETA,SE,P,CHR,POS,NEA
0,1:15791:C:T,T,9995999,1.000000,894.616000,1204.870000,0.457786,1,15791,C
1,1:69487:G:A,A,9995999,0.999994,-2.715450,2.360060,0.249902,1,69487,G
2,1:69569:T:C,C,9995999,0.999812,-0.484284,0.423462,0.252778,1,69569,T
3,1:139853:C:T,T,9995999,0.999994,-2.703560,2.360130,0.251997,1,139853,C
4,1:692794:CA:C,C,9995999,0.889410,-0.016436,0.019585,0.401342,1,692794,CA
...,...,...,...,...,...,...,...,...,...,...
13791462,X:154929412:C:T,T,9995999,0.754527,-0.016260,0.010723,0.129427,23,154929412,C
13791463,X:154929637:CT:C,C,9995999,0.770274,-0.027098,0.011190,0.015456,23,154929637,CT
13791464,X:154929952:CAA:C,C,9995999,0.760570,-0.020494,0.011278,0.069202,23,154929952,CAA
13791465,X:154930230:A:G,G,9995999,0.754113,-0.016347,0.010721,0.127334,23,154930230,A


In [32]:
# Save to a gzipped TSV file
mysumstats.to_csv("/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_T2T_liftover.tsv.gz", 
                  sep="\t", 
                  index=False)