read the GWAS file

In [1]:
import pandas as pd
gwas_file = '/Users/sanjeedahs/Desktop/MR_PROJECT/outcome_data/AF/30061737-GCST006414-EFO_0000275-Build37.f.tsv'
df = pd.read_csv(gwas_file, delimiter='\t', low_memory=False)
print(df.head())

   rs_dbsnp147              variant_id chromosome  base_pair_location  \
0  1:10583_G/A              rs58108140          1               10583   
1  1:13302_C/T  rs75241669;rs180734498          1               13302   
2  1:13327_G/C               rs2691329          1               13327   
3  1:16071_G/A             rs541172944          1               16071   
4  1:16280_T/C             rs866639523          1               16280   

  other_allele effect_allele  effect_allele_frequency    beta  standard_error  \
0            A             G                      NaN -0.0640          0.0585   
1            T             C                      NaN -0.1306          0.0681   
2            C             G                      NaN -0.4488          0.1873   
3            A             G                   1.0000 -0.1820          1.7000   
4            T             C                   0.0002  1.2500          0.8640   

   p_value  ci_lower  ci_upper  odds_ratio  
0  0.27390       NaN       Na

reorder and format columns

In [2]:
# Select and rename the relevant columns
df = df.rename(columns={
    'variant_id': 'SNP',
    'chromosome': 'chromosome',
    'base_pair_location': 'bp_loc',
    'beta': 'beta',
    'standard_error': 'se',
    'effect_allele': 'effect_allele',
    'other_allele': 'other_allele',
    'effect_allele_frequency': 'eaf',
    'p_value': 'pval'
})

# Keep only the necessary columns
filtered_snps = df[['SNP', 'chromosome', 'bp_loc', 'beta', 'se', 'effect_allele', 'other_allele', 'eaf', 'pval']]


only keep SNPs by removing multibase alleles

In [4]:
# Convert allele columns to strings to avoid issues with len()
filtered_snps['effect_allele'] = filtered_snps['effect_allele'].astype(str)
filtered_snps['other_allele'] = filtered_snps['other_allele'].astype(str)

# Remove rows with multi-base alleles, making sure to handle NaN values
filtered_snps = filtered_snps[filtered_snps['effect_allele'].map(lambda x: isinstance(x, str) and len(x) == 1)]
filtered_snps = filtered_snps[filtered_snps['other_allele'].map(lambda x: isinstance(x, str) and len(x) == 1)]

# Display the first few rows of the filtered DataFrame
print(filtered_snps.head())

                      SNP chromosome  bp_loc    beta      se effect_allele  \
0              rs58108140          1   10583 -0.0640  0.0585             G   
1  rs75241669;rs180734498          1   13302 -0.1306  0.0681             C   
2               rs2691329          1   13327 -0.4488  0.1873             G   
3             rs541172944          1   16071 -0.1820  1.7000             G   
4             rs866639523          1   16280  1.2500  0.8640             C   

  other_allele     eaf     pval  
0            A     NaN  0.27390  
1            T     NaN  0.05514  
2            C     NaN  0.01657  
3            A  1.0000  0.91470  
4            T  0.0002  0.14800  


remove NA values

In [6]:
#remove na values 
filtered_snps = filtered_snps.dropna(subset=['SNP', 'chromosome', 'bp_loc', 'beta', 'se', 'effect_allele', 'other_allele', 'eaf', 'pval'])
# Display the first few rows of the filtered DataFrame
print(filtered_snps.head())

            SNP chromosome  bp_loc    beta      se effect_allele other_allele  \
3   rs541172944          1   16071 -0.1820  1.7000             G            A   
4   rs866639523          1   16280  1.2500  0.8640             C            T   
6   rs199900651          1   48186  0.4754  0.4954             G            T   
7    rs10399793          1   49298 -0.0079  0.0218             C            T   
10  rs140052487          1   54353 -0.7320  0.7100             C            A   

       eaf    pval  
3   1.0000  0.9147  
4   0.0002  0.1480  
6   0.0015  0.3373  
7   0.6207  0.7161  
10  0.9996  0.3025  


save the dataframe 

In [7]:
# Save the filtered data to a new file 
filtered_gwas_file = '/Users/sanjeedahs/Desktop/MR_PROJECT/outcome_data/AF/filtered_AF_GWAS.tsv'
filtered_snps.to_csv(filtered_gwas_file, sep='\t', index=False)

# Display the first few rows of the filtered DataFrame
print(filtered_snps.head())

            SNP chromosome  bp_loc    beta      se effect_allele other_allele  \
3   rs541172944          1   16071 -0.1820  1.7000             G            A   
4   rs866639523          1   16280  1.2500  0.8640             C            T   
6   rs199900651          1   48186  0.4754  0.4954             G            T   
7    rs10399793          1   49298 -0.0079  0.0218             C            T   
10  rs140052487          1   54353 -0.7320  0.7100             C            A   

       eaf    pval  
3   1.0000  0.9147  
4   0.0002  0.1480  
6   0.0015  0.3373  
7   0.6207  0.7161  
10  0.9996  0.3025  


perform liftover 

In [1]:
# Install pyliftover
%pip install pyliftover

import pandas as pd
from pyliftover import LiftOver

# Initialize the LiftOver object for GRCh37 to hg38 conversion
lo = LiftOver('hg19', 'hg38')

# Load your GWAS data
input_file = '/Users/sanjeedahs/Desktop/MR_PROJECT/outcome_data/AF/filtered_AF_GWAS.tsv'
output_file = '/Users/sanjeedahs/Desktop/MR_PROJECT/outcome_data/AF/filtered_AF_GWAS_hg38.tsv'

gwas_data = pd.read_csv(input_file, sep='\t')

# Function to perform the liftover and handle missing conversions
def lift_over_coordinates(chromosome, bp_loc):
    lifted = lo.convert_coordinate('chr' + str(chromosome), bp_loc)
    if lifted:
        return lifted[0][1]
    else:
        return None

# Apply the liftover to the bp_loc column
gwas_data['bp_loc_hg38'] = gwas_data.apply(lambda row: lift_over_coordinates(row['chromosome'], row['bp_loc']), axis=1)

# Display the first few rows of the updated DataFrame
print(gwas_data.head())

# Save the updated GWAS data to a new file
gwas_data.to_csv(output_file, sep='\t', index=False)

print(f"Lifted over GWAS data saved to {output_file}")


Note: you may need to restart the kernel to use updated packages.
           SNP  chromosome  bp_loc    beta      se effect_allele other_allele  \
0  rs541172944           1   16071 -0.1820  1.7000             G            A   
1  rs866639523           1   16280  1.2500  0.8640             C            T   
2  rs199900651           1   48186  0.4754  0.4954             G            T   
3   rs10399793           1   49298 -0.0079  0.0218             C            T   
4  rs140052487           1   54353 -0.7320  0.7100             C            A   

      eaf    pval  bp_loc_hg38  
0  1.0000  0.9147      16071.0  
1  0.0002  0.1480      16280.0  
2  0.0015  0.3373      48186.0  
3  0.6207  0.7161      49298.0  
4  0.9996  0.3025      54353.0  
Lifted over GWAS data saved to /Users/sanjeedahs/Desktop/MR_PROJECT/outcome_data/AF/filtered_AF_GWAS_hg38.tsv
