In [1]:
import gwaslab as gl

### Check reference and download

In [2]:
# List the reference-data keywords gwaslab knows about, with a description and
# suggested use for each; one of these keywords is passed to gl.download_ref below.
gl.check_available_ref()

2026/01/11 13:56:38 Start to check available reference files...
2026/01/11 13:56:38  - Available keywords:  1kg_eas_hg19 1kg_eur_hg19 1kg_eas_hg38 1kg_eur_hg38 1kg_sas_hg19 1kg_amr_hg19 1kg_sas_hg38 1kg_amr_hg38 1kg_afr_hg19 1kg_pan_hg19 1kg_afr_hg38 1kg_pan_hg38 dbsnp_v151_hg19 dbsnp_v151_hg38 dbsnp_v157_hg19 dbsnp_v157_hg38 ucsc_genome_hg19 ucsc_genome_hg38 1kg_dbsnp151_hg19_auto 1kg_dbsnp151_hg38_auto recombination_hg19 recombination_hg38 ensembl_hg19_gtf ensembl_hg38_gtf refseq_hg19_gtf refseq_hg38_gtf testlink 19to38 38to19 1kg_hm3_hg38_eaf 1kg_hm3_hg19_eaf


{'1kg_eas_hg19': {'description': '1000 Genomes Project East Asian (1KG EAS) VCF on the hg19 reference. Multi-allelic variants were decomposed. Variants were normalized. The INFO field includes the AF annotation, representing allele frequency in the EAS population.',
  'suggested_use': 'LD reference panel for creating region plot; infer strand for EAS population'},
 '1kg_eur_hg19': {'description': '1000 Genomes Project European (1KG EUR) VCF on the hg19 reference. Multi-allelic variants were decomposed. Variants were normalized. The INFO field includes the AF annotation, representing allele frequency in the EUR population.',
  'suggested_use': 'LD reference panel for creating region plot; infer strand for EUR population'},
 '1kg_eas_hg38': {'description': '1000 Genomes Project East Asian (1KG EAS) VCF on the hg38 reference. Multi-allelic variants were decomposed. Variants were normalized. The INFO field includes the AF annotation, representing allele frequency in the EAS population.',
 

In [3]:
# Fetch the 1KG/dbSNP151 hg38 SNPID-rsID table into the local gwaslab
# directory (~/.gwaslab per the log below) and record it in the gwaslab config.
# The log shows that an already-present file is detected.
gl.download_ref("1kg_dbsnp151_hg38_auto")

2026/01/11 13:56:43 Start to download  1kg_dbsnp151_hg38_auto  ...
2026/01/11 13:56:43  -Downloading to: /home/ofgeha/.gwaslab/1kg_dbsnp151_hg38_auto.txt.gz
2026/01/11 13:56:43  -File /home/ofgeha/.gwaslab/1kg_dbsnp151_hg38_auto.txt.gz exists.
2026/01/11 13:56:43  - Updating record in config file...
2026/01/11 13:56:43 Downloaded  1kg_dbsnp151_hg38_auto  successfully!


### Load sample data

In [4]:
# Load the tab-separated summary statistics and map the file's column names
# onto the gl.Sumstats keyword arguments.
# NOTE(review): the preview below shows EAF close to 1 (e.g. 0.999994) for
# variants that look rare — confirm that EAF really refers to EA, not NEA.
mysumstats = gl.Sumstats(
    "/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_T2T_liftover.tsv.gz",
    snpid="SNPID",
    chrom="CHR",
    pos="POS",
    ea="EA",            # effect allele column (previously named ALT in this file)
    nea="NEA",          # non-effect allele column (previously named REF in this file)
    eaf="EAF",          # effect allele frequency column (previously minor_AF/Frq)
    beta="BETA",
    se="SE",
    p="P",
    # 'Dir' and 'N' columns are not present in this file, so they are
    # deliberately left unmapped.
    # n="n_complete_samples",  # optional: enable if a sample-size column is available
    verbose=True,
    readargs={'sep': '\t'}
)

2026/01/11 13:56:58 GWASLab v4.0.4 https://cloufield.github.io/gwaslab/
2026/01/11 13:56:58 (C) 2022-2026, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com
2026/01/11 13:56:58 Python version: 3.12.3 (main, Nov  6 2025, 13:44:16) [GCC 13.3.0]
2026/01/11 13:56:58 Start to initialize gl.Sumstats from file :/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_T2T_liftover.tsv.gz
2026/01/11 13:57:22  -Reading columns          : SE,P,POS,BETA,SNPID,EAF,NEA,CHR,EA
2026/01/11 13:57:22  -Renaming columns to      : SE,P,POS,BETA,SNPID,EAF,NEA,CHR,EA
2026/01/11 13:57:22  -Current Dataframe shape : 13791467  x  9
2026/01/11 13:57:23  -Initiating a status column: STATUS ...
2026/01/11 13:57:26 Start to reorder the columns ...(v4.0.4)
2026/01/11 13:57:26  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P
2026/01/11 13:57:26 Finished reordering the columns.
2026/01/11 13:57:26  -Trying to convert datatype for CHR: string -> Int64...Success
2026/01/11 13:57:28  -Column  : SNPID

In [5]:
# Preview the freshly loaded table; STATUS still holds the initial
# placeholder code (9999999) because no checks have been run yet.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P
0,1:15791:C:T,1,15791,T,C,9999999,1.000000,894.616000,1204.870000,0.457786
1,1:69487:G:A,1,69487,A,G,9999999,0.999994,-2.715450,2.360060,0.249902
2,1:69569:T:C,1,69569,C,T,9999999,0.999812,-0.484284,0.423462,0.252778
3,1:139853:C:T,1,139853,T,C,9999999,0.999994,-2.703560,2.360130,0.251997
4,1:692794:CA:C,1,692794,C,CA,9999999,0.889410,-0.016436,0.019585,0.401342
...,...,...,...,...,...,...,...,...,...,...
13791462,X:154929412:C:T,23,154929412,T,C,9999999,0.754527,-0.016260,0.010723,0.129427
13791463,X:154929637:CT:C,23,154929637,C,CT,9999999,0.770274,-0.027098,0.011190,0.015456
13791464,X:154929952:CAA:C,23,154929952,C,CAA,9999999,0.760570,-0.020494,0.011278,0.069202
13791465,X:154930230:A:G,23,154930230,G,A,9999999,0.754113,-0.016347,0.010721,0.127334


In [6]:
# Run gwaslab's bundled sanity checks. The (truncated) log below shows it
# checking SNPID/rsID, fixing chromosome notation and fixing base-pair positions;
# the resulting table has updated STATUS codes and fewer rows than the input.
mysumstats.basic_check()

2026/01/11 13:57:38 Start to check SNPID/rsID ...(v4.0.4)
2026/01/11 13:57:38  -Current Dataframe shape : 13791467 x 10 ; Memory usage: 963.86 MB
2026/01/11 13:57:38  -Checking SNPID data type...
2026/01/11 13:57:38  -Converted datatype for SNPID: object -> string
2026/01/11 13:57:38  -Checking if SNPID contains NA strings :na,NA,Na,Nan,NaN,<NA>,null,NULL,#N/A,#VALUE!,N/A,n/a,missing,...
2026/01/11 13:57:39  -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)
2026/01/11 13:57:49 Finished checking SNPID/rsID.
2026/01/11 13:57:49 Start to fix chromosome notation (CHR) ...(v4.0.4)
2026/01/11 13:57:49  -Checking CHR data type...
2026/01/11 13:57:53  -Variants with standardized chromosome notation: 13791467
2026/01/11 13:57:53  -All CHR are already fixed...
2026/01/11 13:57:56 Finished fixing chromosome notation (CHR).
2026/01/11 13:57:56 Start to fix basepair positions (POS) ...(v4.0.4)
2026/01/11 13:57:56  -Trying to convert datatype for POS: int64 -> Int64...
2026/01/11 13:57:57

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P
0,1:69487:G:A,1,69487,A,G,9960099,0.999994,-2.715450,2.360060,0.249902
1,1:69569:T:C,1,69569,C,T,9960099,0.999812,-0.484284,0.423462,0.252778
2,1:139853:C:T,1,139853,T,C,9960099,0.999994,-2.703560,2.360130,0.251997
3,1:692794:CA:C,1,692794,C,CA,9960399,0.889410,-0.016436,0.019585,0.401342
4,1:693731:A:G,1,693731,G,A,9960099,0.884233,-0.004255,0.018507,0.818155
...,...,...,...,...,...,...,...,...,...,...
13779880,X:154929412:C:T,23,154929412,T,C,9960099,0.754527,-0.016260,0.010723,0.129427
13779881,X:154929637:CT:C,23,154929637,C,CT,9960399,0.770274,-0.027098,0.011190,0.015456
13779882,X:154929952:CAA:C,23,154929952,C,CAA,9960399,0.760570,-0.020494,0.011278,0.069202
13779883,X:154930230:A:G,23,154930230,G,A,9960099,0.754113,-0.016347,0.010721,0.127334


In [7]:
# Re-check the SNPID column and, with fixsep=True, replace the accepted
# separators ("-", ":", "_") with ":" so IDs are uniformly CHR:POS:NEA:EA.
mysumstats.fix_id(fixsep=True)

2026/01/11 13:59:35 Start to check SNPID/rsID ...(v4.0.4)
2026/01/11 13:59:35  -Checking SNPID data type...
2026/01/11 13:59:35  -Checking if SNPID contains NA strings :na,NA,Na,Nan,NaN,<NA>,null,NULL,#N/A,#VALUE!,N/A,n/a,missing,...
2026/01/11 13:59:36  -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)
2026/01/11 13:59:46  -Replacing separators in SNPID with ":" ...
2026/01/11 13:59:52 Finished checking SNPID/rsID.


Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P
0,1:69487:G:A,1,69487,A,G,9960099,0.999994,-2.715450,2.360060,0.249902
1,1:69569:T:C,1,69569,C,T,9960099,0.999812,-0.484284,0.423462,0.252778
2,1:139853:C:T,1,139853,T,C,9960099,0.999994,-2.703560,2.360130,0.251997
3,1:692794:CA:C,1,692794,C,CA,9960399,0.889410,-0.016436,0.019585,0.401342
4,1:693731:A:G,1,693731,G,A,9960099,0.884233,-0.004255,0.018507,0.818155
...,...,...,...,...,...,...,...,...,...,...
13779880,X:154929412:C:T,23,154929412,T,C,9960099,0.754527,-0.016260,0.010723,0.129427
13779881,X:154929637:CT:C,23,154929637,C,CT,9960399,0.770274,-0.027098,0.011190,0.015456
13779882,X:154929952:CAA:C,23,154929952,C,CAA,9960399,0.760570,-0.020494,0.011278,0.069202
13779883,X:154930230:A:G,23,154930230,G,A,9960099,0.754113,-0.016347,0.010721,0.127334


In [8]:
# Inspect the table after basic_check / fix_id: STATUS codes are now filled in
# and the row index is shorter than in the initial preview.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,EAF,BETA,SE,P
0,1:69487:G:A,1,69487,A,G,9960099,0.999994,-2.715450,2.360060,0.249902
1,1:69569:T:C,1,69569,C,T,9960099,0.999812,-0.484284,0.423462,0.252778
2,1:139853:C:T,1,139853,T,C,9960099,0.999994,-2.703560,2.360130,0.251997
3,1:692794:CA:C,1,692794,C,CA,9960399,0.889410,-0.016436,0.019585,0.401342
4,1:693731:A:G,1,693731,G,A,9960099,0.884233,-0.004255,0.018507,0.818155
...,...,...,...,...,...,...,...,...,...,...
13779880,X:154929412:C:T,23,154929412,T,C,9960099,0.754527,-0.016260,0.010723,0.129427
13779881,X:154929637:CT:C,23,154929637,C,CT,9960399,0.770274,-0.027098,0.011190,0.015456
13779882,X:154929952:CAA:C,23,154929952,C,CAA,9960399,0.760570,-0.020494,0.011278,0.069202
13779883,X:154930230:A:G,23,154930230,G,A,9960099,0.754113,-0.016347,0.010721,0.127334


### Assign rsID

#### assign rsID using a SNPID-rsID table (variants in 1KG)

In [None]:
# Assign rsIDs from the downloaded 1KG/dbSNP151 hg38 SNPID-rsID table
# (gl.get_path resolves the keyword to the local file path).
# NOTE(review): the input file is named "..._T2T_liftover" while this lookup
# table is hg38 — confirm the coordinates are on hg38 before trusting the matches.
mysumstats.assign_rsid(ref_rsid_tsv=gl.get_path("1kg_dbsnp151_hg38_auto"))

2026/01/11 14:01:10 Start to assign rsID using reference file ...(v4.0.4)
2026/01/11 14:01:10  -Number of threads/cores to use: 1
2026/01/11 14:01:11  -13779885 rsID could be possibly fixed...
2026/01/11 14:01:12  -Setting block size:  5000000
2026/01/11 14:01:12  -Loading block: 0   1   2   3   

#### Note: you may need a dictionary to match chromosome numbers to the contig names used in your VCF

In [25]:
# Chromosome number -> RefSeq accession (NC_...) mapping for build 38.
# Keys 23-25 presumably stand for X, Y and MT (X is shown as 23 in the table above).
gl.get_number_to_NC(build="38")

{1: 'NC_000001.11',
 2: 'NC_000002.12',
 3: 'NC_000003.12',
 4: 'NC_000004.12',
 5: 'NC_000005.10',
 6: 'NC_000006.12',
 7: 'NC_000007.14',
 8: 'NC_000008.11',
 9: 'NC_000009.12',
 10: 'NC_000010.11',
 11: 'NC_000011.10',
 12: 'NC_000012.12',
 13: 'NC_000013.11',
 14: 'NC_000014.9',
 15: 'NC_000015.10',
 16: 'NC_000016.10',
 17: 'NC_000017.11',
 18: 'NC_000018.10',
 19: 'NC_000019.10',
 20: 'NC_000020.11',
 21: 'NC_000021.9',
 22: 'NC_000022.11',
 23: 'NC_000023.11',
 24: 'NC_000024.10',
 25: 'NC_012920.1'}

In [26]:
# Same chromosome number -> RefSeq accession mapping, for build 19 (hg19);
# note that the accession versions differ from the build 38 mapping.
gl.get_number_to_NC(build="19")

{1: 'NC_000001.10',
 2: 'NC_000002.11',
 3: 'NC_000003.11',
 4: 'NC_000004.11',
 5: 'NC_000005.9',
 6: 'NC_000006.11',
 7: 'NC_000007.13',
 8: 'NC_000008.10',
 9: 'NC_000009.11',
 10: 'NC_000010.10',
 11: 'NC_000011.9',
 12: 'NC_000012.11',
 13: 'NC_000013.10',
 14: 'NC_000014.8',
 15: 'NC_000015.9',
 16: 'NC_000016.9',
 17: 'NC_000017.10',
 18: 'NC_000018.9',
 19: 'NC_000019.9',
 20: 'NC_000020.10',
 21: 'NC_000021.8',
 22: 'NC_000022.10',
 23: 'NC_000023.10',
 24: 'NC_000024.9',
 25: 'NC_012920.1'}

In [None]:
# Save the processed summary statistics to a gzipped TSV file.
# `mysumstats` is a gl.Sumstats wrapper rather than a DataFrame; the table
# itself lives in `mysumstats.data` (a pandas DataFrame), so to_csv is called
# on that. pandas infers gzip compression from the ".gz" suffix.
mysumstats.data.to_csv(
    "/mnt/hdd_1/ofgeha/galaxy-gwas-tools/Data/21001_T2T_liftover_rsid.tsv.gz",
    sep="\t",
    index=False,  # the positional index carries no information worth persisting
)