# Quality control prior to EasyQC

This is a script that performs minimal quality control of the raw summary statistics, which are then fed into EasyQC.

It uses 1000 Genomes Phase 3 as the reference.

Missing values will be indicated with a period.

The minimal QC includes the following:
- Changing GRCh build to 37
- rsID from 1kG
- uniqID from 1kG
- Code missing data as NA
- Calculation of MAF
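    - The minor allele is treated as the effect allele (alleles are swapped and the effect sign is flipped when the reported frequency is above 0.5)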
- Rename columns as below:
    - CHR: Chromosome
    - POS: base position (GRCh37, hg19)
    - SNP: rsID
    - A1: effect allele
    - A2: reference allele
    - MAF: minor allele frequency
    - Effect: beta or odds ratio
    - SE: standard error of the effect
    - Pval: p-value
    - N: Leverage SNP-specific sample size if present, otherwise use total N or effective sample size (in the case of binary trait)
    - INFO

# Utility functions

In [None]:
import pandas as pd
from math import isnan

def call_variant_matching_reference(path="1kG_chrpos_rsID.tsv"):
    """Load the variant lookup table used to map rsIDs to positions.

    Parameters
    ----------
    path : str, optional
        Tab-separated file to read. Defaults to ``1kG_chrpos_rsID.tsv`` in
        the working directory, which is what the notebook used before this
        argument existed, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table. The cohort cells in this notebook expect it to
        contain the columns ``CHRPOS`` and ``rsID``.
    """
    return pd.read_csv(path, sep='\t', index_col=False)

def is_na(x):
    """Report whether ``x`` represents a missing value.

    A float is missing only when it is NaN. Any other value is missing when
    it equals one of the markers 'NA', 'NAN', 'nan', 'None', 'none', or is
    ``None`` itself.
    """
    if not isinstance(x, float):
        markers = ('NA', 'NAN', 'nan', 'None', 'none', None)
        return x in markers
    return isnan(x)


def is_floatNan(x):
    """Return True when the numeric value ``x`` is NaN, False otherwise.

    The value is handed directly to ``math.isnan``, so non-numeric input
    such as a string raises ``TypeError``.
    """
    nan_flag = isnan(x)
    return nan_flag


def is_strna(x):
    """Return True when ``x`` equals one of the textual missing-value markers.

    Recognised markers are 'NA', 'NAN', 'nan', 'None' and 'none'. Float NaN
    and ``None`` are not recognised by this helper.
    """
    text_markers = ('NA', 'NAN', 'nan', 'None', 'none')
    return x in text_markers

def is_integer(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Only the conversion failures raised by ``int`` are treated as "not an
    integer": ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. ``'abc'``
    or NaN) and ``OverflowError`` (infinity). Unrelated exceptions such as
    ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# rsID -> CHRPOS lookup table shared by the cohort cells that merge on rsID.
# Columns: Index(['CHRPOS', 'rsID'], dtype='object')
var_match_ref = call_variant_matching_reference()

# Body mass index (BMI)

## Body mass index (BMI) / Cohort: GIANT 2018 / Pulit et al., 2018

- File name: Bmi.giant-ukbb.meta-analysis.combined.23May2018.HapMap2_only.txt
- Whitespace-separated
- Columns: CHR POS SNP Tested_Allele Other_Allele Freq_Tested_Allele BETA SE P N INFO
- GRCh37

QC performed:
- Code missing data as NA
- SNP allele separation : A small number of SNPs (<9,000) from the GIANT data had no dbSNP151 identifier, and are left as just an rsID.
- Missing CHR and POS from 1kG
- MAF calculation and check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA

In [None]:
import pandas as pd
import os

file = "Bmi.giant-ukbb.meta-analysis.combined.23May2018.HapMap2_only.txt"
df = pd.read_csv(file, sep=' ', index_col=False)

# Code missing data as NA
# Integer-like columns: NaN -> 'NA', anything else -> plain integer string.
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if is_floatNan(x) else str(int(x)))
# Text columns: is_na is used instead of is_strna because pandas.read_csv has
# already converted markers such as 'NA' to float NaN, which is_strna does not
# recognise; those cells were previously stringified to 'nan' and escaped the
# final 'NA' -> '.' recoding.
for col in ['SNP', 'Tested_Allele', 'Other_Allele']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
# Numeric columns: NaN -> 'NA', anything else -> float string.
for col in ['Freq_Tested_Allele', 'BETA', 'SE', 'P', 'N', 'INFO']:
    df[col] = df[col].apply(lambda x: 'NA' if is_floatNan(x) else str(float(x)))

# SNP allele separation: keep only the part of the identifier before the first ':'
df['rsID'] = df['SNP'].apply(lambda snp: snp.split(sep=':')[0] if snp != 'NA' else 'NA')
df.drop(columns=['SNP'], inplace=True)

# CHR and POS from 1kG (left join keeps variants that have no match in the reference)
df = df.merge(var_match_ref, left_on='rsID', right_on='rsID', how='left')

def check_chrpos_na(x):
    """Return True when a merged CHRPOS value is missing.

    Numeric input is tested with ``is_floatNan``. For non-numeric input that
    call raises ``TypeError``, in which case the value is compared against the
    textual missing-value markers via ``is_strna`` instead.
    """
    try:
        return bool(is_floatNan(x))
    except TypeError:
        # Only the "not a number" failure is expected here; a bare except
        # would also hide unrelated errors and interrupts.
        return bool(is_strna(x))

df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
# Split the ':'-separated reference identifier into its first and second fields.
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
# drop() returns a new frame unless inplace=True; the previous call discarded its result.
df.drop(columns=['CHRPOS'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when the tested-allele frequency is <= 0.5, False when it
# is above 0.5 (alleles get swapped and the effect sign flipped), 'NA' when the
# frequency is missing.
df['MAF check'] = df['Freq_Tested_Allele'].apply(lambda eaf: 'NA' if eaf == 'NA' else True if float(eaf) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Freq_Tested_Allele'] if row['MAF check'] else str(1 - float(row['Freq_Tested_Allele'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Tested_Allele'] if row['MAF check'] else row['Other_Allele'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Other_Allele'] if row['MAF check'] else row['Tested_Allele'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['BETA'] if row['MAF check'] else str(-1 * float(row['BETA'])), axis=1)
df.drop(columns=['Freq_Tested_Allele', 'MAF check', 'Tested_Allele', 'Other_Allele', 'BETA'], inplace=True)

# Rename columns ('SE' already carries its final name in this file)
df.rename(columns={'P':'Pval', 'MAF_new':'MAF', 'A1_new':'A1', 'A2_new':'A2', 'rsID':'SNP'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N', 'INFO']].copy()

# Assign appropriate data type except NA
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval', 'INFO']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)

# Convert 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.Bmi.giant-ukbb.meta-analysis.combined.23May2018.HapMap2_only.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Body mass index (BMI) / Cohort: GIANT 2015 / Locke et al., 2015

- File name: SNP_gwas_mc_merge_nogc.tbl.uniq
- tab-separated
- Columns: SNP     A1      A2      Freq1.Hapmap    b       se      p       N
- GRCh37

QC performed:
- Code missing data as NA
- CHR and POS from 1kG
- MAF calculation and check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA

In [None]:
import pandas as pd
import os

file = "SNP_gwas_mc_merge_nogc.tbl.uniq"
df = pd.read_csv(file, sep='\t', index_col=False)

# Code missing data as NA
# Text columns: is_na is used instead of is_strna because pandas.read_csv has
# already converted markers such as 'NA' to float NaN, which is_strna does not
# recognise; those cells were previously stringified to 'nan'.
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
# Numeric columns: NaN -> 'NA', anything else -> float string.
for col in ['Freq1.Hapmap', 'b', 'se', 'p', 'N']:
    df[col] = df[col].apply(lambda x: 'NA' if is_floatNan(x) else str(float(x)))

# CHR and POS from 1kG (left join keeps variants that have no match in the reference)
df = df.merge(var_match_ref, left_on='SNP', right_on='rsID', how='left')

def check_chrpos_na(x):
    """Return True when a merged CHRPOS value is missing.

    Numeric input is tested with ``is_floatNan``. For non-numeric input that
    call raises ``TypeError``, in which case the value is compared against the
    textual missing-value markers via ``is_strna`` instead.
    """
    try:
        return bool(is_floatNan(x))
    except TypeError:
        # Only the "not a number" failure is expected here; a bare except
        # would also hide unrelated errors and interrupts.
        return bool(is_strna(x))

df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
# Split the ':'-separated reference identifier into its first and second fields.
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
# drop() returns a new frame unless inplace=True; the previous call discarded its result.
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when the A1 frequency is <= 0.5, False when it is above
# 0.5 (alleles get swapped and the effect sign flipped), 'NA' when missing.
df['MAF check'] = df['Freq1.Hapmap'].apply(lambda eaf: 'NA' if eaf == 'NA' else True if float(eaf) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Freq1.Hapmap'] if row['MAF check'] else str(1 - float(row['Freq1.Hapmap'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A1'] if row['MAF check'] else row['A2'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A2'] if row['MAF check'] else row['A1'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['b'] if row['MAF check'] else str(-1 * float(row['b'])), axis=1)
df.drop(columns=['Freq1.Hapmap', 'MAF check', 'A1', 'A2', 'b'], inplace=True)

# Rename columns
df.rename(columns={'se':'SE', 'p':'Pval', 'MAF_new':'MAF', 'A1_new':'A1', 'A2_new':'A2'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename ="pQC.SNP_gwas_mc_merge_nogc.tbl.uniq.txt"
df.to_csv(save_filename, sep='\t', index=False)

# Fasting glucose (FG)

## Fasting glucose (FG) / Cohort: MAGIC / Lagou et al., 2021

- File name: FG_combined_1000G_density_formatted_21-03-29.txt
- tab-separated
- Columns: z       source  rsid    a1      a2      r2.pred p-value n       maf     beta    se
- GRCh37
- A1 is the effect allele

README:\
a. z – Z-score of association for FG or FI;\
b. source – SSIMP for imputed and GWAS for SNPs present in the HapMap density;\
c. rsid – rsID;\
d. a1 – reference allele;\
e. a2 – effect allele;\
f. r2.pred – SS-imp imputation quality;\
g. p-value – P-value of association;\
h. n – sample size;\
i. maf – minor allele frequency;\
j. beta – effect size for FG or FI;\
k. se - standard error of the effect estimate for FG or FI.\

QC performed:
- Remove unnecessary column: source
- Code missing data as NA
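- Rename a1 as A1 and a2 as A2 (a1 is treated as the effect allele, as stated above; note that the README lists a2 as the effect allele)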
- CHR and POS from 1kG
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "FG_combined_1000G_density_formatted_21-03-29.txt"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary column: source
df.drop(columns=['source'], inplace=True)

# Code missing data as NA
# Text columns: is_na is used instead of is_strna because pandas.read_csv has
# already converted markers such as 'NA' to float NaN, which is_strna does not
# recognise; those cells were previously stringified to 'nan'.
for col in ['rsid', 'a1', 'a2']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
# Numeric columns: NaN -> 'NA', anything else -> float string.
for col in ['z', 'maf', 'beta', 'se', 'p-value', 'n', 'r2.pred']:
    df[col] = df[col].apply(lambda x: 'NA' if is_floatNan(x) else str(float(x)))

# Rename a1 as A1, a2 as A2 (a1 is kept as the effect allele).
# NOTE(review): the README quoted for this file lists a2 as the effect allele,
# while the cell header and the output file name ("A1asEffect") treat a1 as the
# effect allele -- confirm which orientation is intended.
df.rename(columns={'a1':'A1', 'a2':'A2'}, inplace=True)

# CHR and POS from 1kG (left join keeps variants that have no match in the reference)
df = df.merge(var_match_ref, left_on='rsid', right_on='rsID', how='left')

def check_chrpos_na(x):
    """Return True when a merged CHRPOS value is missing.

    Numeric input is tested with ``is_floatNan``. For non-numeric input that
    call raises ``TypeError``, in which case the value is compared against the
    textual missing-value markers via ``is_strna`` instead.
    """
    try:
        return bool(is_floatNan(x))
    except TypeError:
        # Only the "not a number" failure is expected here; a bare except
        # would also hide unrelated errors and interrupts.
        return bool(is_strna(x))

df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
# Split the ':'-separated reference identifier into its first and second fields.
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when maf is <= 0.5, False when it is above 0.5 (alleles
# get swapped and the effect sign flipped), 'NA' when maf is missing.
df['MAF check'] = df['maf'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['maf'] if row['MAF check'] else str(1 - float(row['maf'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A1'] if row['MAF check'] else row['A2'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A2'] if row['MAF check'] else row['A1'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['beta'] if row['MAF check'] else str(-1 * float(row['beta'])), axis=1)
df.drop(columns=['maf', 'MAF check', 'A1', 'A2', 'beta'], inplace=True)

# Rename columns
df.rename(columns={'rsid':'SNP', 'r2.pred':'INFO', 'p-value':'Pval', 'n':'N', 'se':'SE', 'MAF_new':'MAF', 'A1_new':'A1', 'A2_new':'A2'}, inplace=True)

# Reorder columns (this also discards the 'z' column); copy so the assignments
# below act on an independent frame rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N', 'INFO']].copy()

# Assign appropriate data type except NA
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval', 'INFO']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.FG_combined_1000G_density_formatted_21-03-29.A1asEffect.txt"
df.to_csv(save_filename, sep='\t', index=False)

# High-density lipoprotein (HDL)

## High-density lipoprotein (HDL) / Cohort: GLGC 2022 with UKB

- File name: HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt
- tab-separated
- Columns: rsID    CHROM   POS_b37 REF     ALT     N       N_studies       POOLED_ALT_AF   EFFECT_SIZE     SE      pvalue_neg_log10        pvalue  pvalue_neg_log10_GC     pvalue_GC
- GRCh37

Columns:\
rsID: ID from dbSNP 150\
CHROM: Chromosome\
POS_b37: Position in build 37\
REF: non-effect allele\
ALT: effect allele\
N: total number of individuals with results for this variant\
N_studies: total number of studies with results for this variant\
POOLED_ALT_AF: allele frequency of the ALT allele\
EFFECT_SIZE: effect size of the ALT allele\
SE: standard error\
pvalue_neg_log10: -log10(p-value)\
pvalue: p-value\
pvalue_neg_log10_GC: -log10(GC-corrected p-value)\
pvalue_GC: GC-corrected p-value\

QC performed:
- Remove unnecessary columns: N_studies, pvalue_neg_log10, pvalue_neg_log10_GC, pvalue_GC
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=['N_studies', 'pvalue_neg_log10', 'pvalue_neg_log10_GC', 'pvalue_GC'], inplace=True)

# Code missing data as NA
for col in ['rsID', 'CHROM', 'POS_b37', 'REF', 'ALT']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ['POOLED_ALT_AF', 'SE', 'pvalue', 'N']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))
# NOTE(review): the HDL effect size is multiplied by -1 here, which the
# triglyceride cells do not do and which is not listed under "QC performed" --
# confirm this sign reversal is intentional.
df['EFFECT_SIZE'] = df['EFFECT_SIZE'].apply(lambda x: 'NA' if is_na(x) else str(-1*float(x)))

# MAF calculation and check
# 'MAF check' is True when the ALT frequency is <= 0.5, False when it is above
# 0.5 (alleles get swapped and the effect sign flipped), 'NA' when missing.
df['MAF check'] = df['POOLED_ALT_AF'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['POOLED_ALT_AF'] if row['MAF check'] else str(1 - float(row['POOLED_ALT_AF'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['ALT'] if row['MAF check'] else row['REF'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['REF'] if row['MAF check'] else row['ALT'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['EFFECT_SIZE'] if row['MAF check'] else str(-1 * float(row['EFFECT_SIZE'])), axis=1)
df.drop(columns=['POOLED_ALT_AF', 'MAF check', 'ALT', 'REF', 'EFFECT_SIZE'], inplace=True)

# Rename columns
df.rename(columns={'rsID':'SNP', 'CHROM':'CHR', 'POS_b37':'POS', 'A2_new':'A2', 'A1_new':'A1', 'MAF_new':'MAF', 'pvalue':'Pval'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
# CHR/POS go through float first: when pandas reads the column as float (e.g.
# because of missing entries) the string form is '12345.0', which int() rejects.
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df.to_csv(save_filename, sep='\t', index=False)

## High-density lipoprotein (HDL) / Cohort: GLGC 2022 without UKB

- File name: without_UKB_HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results
- tab-separated
- Columns: rsID    CHROM   POS_b37 REF     ALT     N       N_studies       POOLED_ALT_AF   EFFECT_SIZE     SE      pvalue_neg_log10        pvalue  pvalue_neg_log10_GC     pvalue_GC
- GRCh37

Columns:\
rsID: ID from dbSNP 150\
CHROM: Chromosome\
POS_b37: Position in build 37\
REF: non-effect allele\
ALT: effect allele\
N: total number of individuals with results for this variant\
N_studies: total number of studies with results for this variant\
POOLED_ALT_AF: allele frequency of the ALT allele\
EFFECT_SIZE: effect size of the ALT allele\
SE: standard error\
pvalue_neg_log10: -log10(p-value)\
pvalue: p-value\
pvalue_neg_log10_GC: -log10(GC-corrected p-value)\
pvalue_GC: GC-corrected p-value\

QC performed:
- Remove unnecessary columns: N_studies, pvalue_neg_log10, pvalue_neg_log10_GC, pvalue_GC
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "without_UKB_HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=['N_studies', 'pvalue_neg_log10', 'pvalue_neg_log10_GC', 'pvalue_GC'], inplace=True)

# Code missing data as NA
for col in ['rsID', 'CHROM', 'POS_b37', 'REF', 'ALT']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ['POOLED_ALT_AF', 'SE', 'pvalue', 'N']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))
# NOTE(review): the HDL effect size is multiplied by -1 here, which the
# triglyceride cells do not do and which is not listed under "QC performed" --
# confirm this sign reversal is intentional.
df['EFFECT_SIZE'] = df['EFFECT_SIZE'].apply(lambda x: 'NA' if is_na(x) else str(-1*float(x)))

# MAF calculation and check
# 'MAF check' is True when the ALT frequency is <= 0.5, False when it is above
# 0.5 (alleles get swapped and the effect sign flipped), 'NA' when missing.
df['MAF check'] = df['POOLED_ALT_AF'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['POOLED_ALT_AF'] if row['MAF check'] else str(1 - float(row['POOLED_ALT_AF'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['ALT'] if row['MAF check'] else row['REF'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['REF'] if row['MAF check'] else row['ALT'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['EFFECT_SIZE'] if row['MAF check'] else str(-1 * float(row['EFFECT_SIZE'])), axis=1)
df.drop(columns=['POOLED_ALT_AF', 'MAF check', 'ALT', 'REF', 'EFFECT_SIZE'], inplace=True)

# Rename columns
df.rename(columns={'rsID':'SNP', 'CHROM':'CHR', 'POS_b37':'POS', 'A2_new':'A2', 'A1_new':'A1', 'MAF_new':'MAF', 'pvalue':'Pval'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
# CHR/POS go through float first: when pandas reads the column as float (e.g.
# because of missing entries) the string form is '12345.0', which int() rejects.
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.without_UKB_HDL_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df.to_csv(save_filename, sep='\t', index=False)

# Triglycerides (TG)

## Triglycerides (TG) / Cohort: GLGC 2022 with UKB

- File name: logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt
- tab-separated
- Columns: rsID    CHROM   POS_b37 REF     ALT     N       N_studies       POOLED_ALT_AF   EFFECT_SIZE     SE      pvalue_neg_log10        pvalue  pvalue_neg_log10_GC     pvalue_GC
- GRCh37

Columns:\
rsID: ID from dbSNP 150\
CHROM: Chromosome\
POS_b37: Position in build 37\
REF: non-effect allele\
ALT: effect allele\
N: total number of individuals with results for this variant\
N_studies: total number of studies with results for this variant\
POOLED_ALT_AF: allele frequency of the ALT allele\
EFFECT_SIZE: effect size of the ALT allele\
SE: standard error\
pvalue_neg_log10: -log10(p-value)\
pvalue: p-value\
pvalue_neg_log10_GC: -log10(GC-corrected p-value)\
pvalue_GC: GC-corrected p-value\

QC performed:
- Remove unnecessary columns: N_studies, pvalue_neg_log10, pvalue_neg_log10_GC, pvalue_GC
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=['N_studies', 'pvalue_neg_log10', 'pvalue_neg_log10_GC', 'pvalue_GC'], inplace=True)

# Code missing data as NA
for col in ['rsID', 'CHROM', 'POS_b37', 'REF', 'ALT']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ['POOLED_ALT_AF', 'EFFECT_SIZE', 'SE', 'pvalue', 'N']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))

# MAF calculation and check
# 'MAF check' is True when the ALT frequency is <= 0.5, False when it is above
# 0.5 (alleles get swapped and the effect sign flipped), 'NA' when missing.
df['MAF check'] = df['POOLED_ALT_AF'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['POOLED_ALT_AF'] if row['MAF check'] else str(1 - float(row['POOLED_ALT_AF'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['ALT'] if row['MAF check'] else row['REF'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['REF'] if row['MAF check'] else row['ALT'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['EFFECT_SIZE'] if row['MAF check'] else str(-1 * float(row['EFFECT_SIZE'])), axis=1)
df.drop(columns=['POOLED_ALT_AF', 'MAF check', 'ALT', 'REF', 'EFFECT_SIZE'], inplace=True)

# Rename columns
df.rename(columns={'rsID':'SNP', 'CHROM':'CHR', 'POS_b37':'POS', 'A2_new':'A2', 'A1_new':'A1', 'MAF_new':'MAF', 'pvalue':'Pval'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
# CHR/POS go through float first: when pandas reads the column as float (e.g.
# because of missing entries) the string form is '12345.0', which int() rejects.
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Triglycerides (TG) / Cohort: GLGC 2022 without UKB

- File name: without_UKB_logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results
- tab-separated
- Columns: rsID    CHROM   POS_b37 REF     ALT     N       N_studies       POOLED_ALT_AF   EFFECT_SIZE     SE      pvalue_neg_log10        pvalue  pvalue_neg_log10_GC     pvalue_GC
- GRCh37

Columns:\
rsID: ID from dbSNP 150\
CHROM: Chromosome\
POS_b37: Position in build 37\
REF: non-effect allele\
ALT: effect allele\
N: total number of individuals with results for this variant\
N_studies: total number of studies with results for this variant\
POOLED_ALT_AF: allele frequency of the ALT allele\
EFFECT_SIZE: effect size of the ALT allele\
SE: standard error\
pvalue_neg_log10: -log10(p-value)\
pvalue: p-value\
pvalue_neg_log10_GC: -log10(GC-corrected p-value)\
pvalue_GC: GC-corrected p-value\

QC performed:
- Remove unnecessary columns: N_studies, pvalue_neg_log10, pvalue_neg_log10_GC, pvalue_GC
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "without_UKB_logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=['N_studies', 'pvalue_neg_log10', 'pvalue_neg_log10_GC', 'pvalue_GC'], inplace=True)

# Code missing data as NA
for col in ['rsID', 'CHROM', 'POS_b37', 'REF', 'ALT']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ['POOLED_ALT_AF', 'EFFECT_SIZE', 'SE', 'pvalue', 'N']:
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))

# MAF calculation and check
# 'MAF check' is True when the ALT frequency is <= 0.5, False when it is above
# 0.5 (alleles get swapped and the effect sign flipped), 'NA' when missing.
df['MAF check'] = df['POOLED_ALT_AF'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['POOLED_ALT_AF'] if row['MAF check'] else str(1 - float(row['POOLED_ALT_AF'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['ALT'] if row['MAF check'] else row['REF'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['REF'] if row['MAF check'] else row['ALT'], axis=1)
df['Effect'] =  df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['EFFECT_SIZE'] if row['MAF check'] else str(-1 * float(row['EFFECT_SIZE'])), axis=1)
df.drop(columns=['POOLED_ALT_AF', 'MAF check', 'ALT', 'REF', 'EFFECT_SIZE'], inplace=True)

# Rename columns
df.rename(columns={'rsID':'SNP', 'CHROM':'CHR', 'POS_b37':'POS', 'A2_new':'A2', 'A1_new':'A1', 'MAF_new':'MAF', 'pvalue':'Pval'}, inplace=True)

# Reorder columns; copy so the assignments below act on an independent frame
# rather than on a subset of the previous one.
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
# CHR/POS go through float first: when pandas reads the column as float (e.g.
# because of missing entries) the string form is '12345.0', which int() rejects.
for col in ['CHR', 'POS']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
for col in ['SNP', 'A1', 'A2']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.without_UKB_logTG_INV_EUR_HRC_1KGP3_others_ALL.meta.singlevar.results.txt"
df.to_csv(save_filename, sep='\t', index=False)

# Hypertension (HTN)

## Hypertension (HTN) / Cohort: Finngen release 7

- File name: finngen_R7_I9_HYPERTENSION
- tab-separated
- Columns: 
- GRCh38

README:\
#chrom: chromosome on build GRCh38 (1-23)\
pos: position in base pairs on build GRCh38\
ref: reference allele\
alt: alternative allele (effect allele)\
rsids: variant identifier\
nearest_genes: nearest gene(s) (comma separated) from variant\
pval: p-value\
mlogp: -log10(p-value)\
beta: effect size (log(OR) scale) estimated for the alternative allele\
sebeta: standard error of the effect size\
af_alt: alternative (effect) allele frequency\
af_alt_cases: alternative (effect) allele frequency among cases\
af_alt_controls: alternative (effect) allele frequency among controls\

QC performed:
- liftover
- Remove unnecessary columns: mlogp, nearest_genes, af_alt_cases, af_alt_controls, n_hom_cases, n_hom_ref_cases, n_het_cases, n_hom_controls, n_hom_ref_controls, n_het_controls
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
# Liftover
import pandas as pd
import os

# FinnGen summary statistics (GRCh38) and the directory that receives the liftOver input
file = 'finngen_R7_I9_HYPERTENSION'
lifting_dir = 'HTN_Finngen_r7'

# Create the output directory up front so that writing the BED file cannot fail
# just because the folder does not exist yet
os.makedirs(lifting_dir, exist_ok=True)

## Generate bed file
df = pd.read_csv(file, sep='\t', index_col=False)

# Only variants with an rsID are written: the rsID is the key used to match the
# lifted record back to the summary statistics. The explicit copy makes the
# column assignments below act on an independent frame, not on a filtered view.
df_bed = df.loc[df['rsids'].notnull(), ['#chrom', 'pos', 'rsids']].copy()

# BED layout: chromosome ("chr"-prefixed), start (= pos - 1), end (= pos), name
df_bed['chr'] = df_bed['#chrom'].apply(lambda x: "chr{}".format(x))
df_bed['bp-1'] = df_bed['pos'].apply(lambda x: int(x) - 1)
df_bed = df_bed[['chr', 'bp-1', 'pos', 'rsids']]

# NOTE(review): the BED file is written inside lifting_dir, whereas the liftOver
# command in the following cell uses the bare file name -- run that command from
# within lifting_dir or adjust its paths.
df_bed.to_csv(os.path.join(lifting_dir, "HTN_Finngen_r7.bed"), sep='\t', header=False, index=False)

In [None]:
## Perform liftover
!liftOver HTN_Finngen_r7.bed GRCh38_to_GRCh37.chain.gz HTN_Finngen_r7.lifted HTN_Finngen_r7.unlifted

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import csv

file = 'finngen_R7_I9_HYPERTENSION'
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=["mlogp", "nearest_genes", "af_alt_cases", "af_alt_controls"], inplace=True)

# Variants with an rsID are matched to their liftOver record below; variants
# without one are re-attached afterwards with their original coordinates
has_rsid = df['rsids'].notnull()
df_ = df[has_rsid].copy()
df_rsid_na = df[~has_rsid].copy()

## Match bp from liftover
liftover_out = pd.read_csv("HTN_Finngen_r7.lifted", sep='\t', index_col=False, names=['CHR', 'BP-1', 'BP', 'SNP'])
liftover_out.drop(columns=['BP-1'], inplace=True)

## Add the records that follow a "#Duplicated in new" tag in the unlifted file
# NOTE(review): liftOver's unmapped output normally echoes the *input* record
# after each reason line, so these rows may still carry GRCh38 coordinates --
# confirm before relying on their positions.
unlifted_file = "HTN_Finngen_r7.unlifted"
retain = []
with open(unlifted_file, 'r') as f:
    take_next = False
    for row in csv.reader(f, delimiter='\t'):
        if not row:
            # Blank line: nothing to inspect (row[0] would raise IndexError)
            continue
        if take_next:
            retain.append(row)
            take_next = False
        if row[0] == '#Duplicated in new':
            take_next = True
unlifted_retained = pd.DataFrame(retain, columns=['CHR', 'BP-1', 'BP', 'SNP'])
unlifted_retained.drop(columns=['BP-1'], inplace=True)

liftover_out = pd.concat([liftover_out, unlifted_retained])
# Variants that share position and rsID (e.g. multi-allelic sites) produce
# identical liftOver records; keep one of each, otherwise the many-to-many rsID
# merge below would multiply those rows
liftover_out.drop_duplicates(inplace=True)


def _chrom_to_int(label):
    """Convert a liftOver chromosome label to int, or NaN if it is not numeric.

    The BED file is written with a "chr" prefix, so lifted records carry labels
    such as "chr7". The prefix is stripped before conversion; plain integers and
    integer-like strings are accepted unchanged.
    """
    if isinstance(label, str) and label[:3].lower() == 'chr':
        label = label[3:]
    return int(label) if is_integer(label) else float('nan')


## Start matching (one chromosome at a time; parts are concatenated once at the end)
lifted_parts = []
for chr_i in tqdm(range(1, 24), leave=False):
    df_partial = df_.loc[df_['#chrom'] == chr_i]

    lifted_df_partial = df_partial.merge(liftover_out, left_on='rsids', right_on='SNP', how='left')
    # From here on CHR/BP/SNP come from the liftOver record
    lifted_df_partial.drop(columns=['#chrom', 'pos', 'rsids'], inplace=True)
    lifted_df_partial['CHR'] = lifted_df_partial['CHR'].apply(_chrom_to_int)
    lifted_df_partial['BP'] = lifted_df_partial['BP'].apply(lambda bp: int(bp) if is_integer(bp) else float('nan'))
    lifted_parts.append(lifted_df_partial)
lifted_df = pd.concat(lifted_parts)

del liftover_out


# Attach rsID missing df (these will be removed during EasyQC step)
df_rsid_na.rename(columns={'#chrom':'CHR', 'pos':'BP', 'rsids':'SNP'}, inplace=True)
df_rsid_na = df_rsid_na[['ref', 'alt', 'pval', 'beta', 'sebeta', 'af_alt', 'CHR', 'BP', 'SNP']]

df = pd.concat([lifted_df, df_rsid_na])

# Code missing data as NA (values stay strings until the final type assignment)
for col in ('CHR', 'BP'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(int(x)))
for col in ('SNP', 'ref', 'alt'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ('pval', 'beta', 'sebeta', 'af_alt'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when the effect allele is minor, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when the effect allele is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


# MAF check: re-orient every variant so that the effect allele (A1) is the minor allele
df['MAF check'] = df['af_alt'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF_new'] = df.apply(_flip_value, axis=1, args=('af_alt', lambda freq: 1 - freq))
df['alt_new'] = df.apply(_pick_allele, axis=1, args=('alt', 'ref'))
df['ref_new'] = df.apply(_pick_allele, axis=1, args=('ref', 'alt'))
df['Effect'] = df.apply(_flip_value, axis=1, args=('beta', lambda beta: -1 * beta))
df.drop(columns=['af_alt', 'MAF check', 'alt', 'ref', 'beta'], inplace=True)

# Rename columns
df.rename(columns={'ref_new':'A2', 'alt_new':'A1', 'pval':'Pval', 'sebeta':'SE', 'BP':'POS', 'MAF_new':'MAF'}, inplace=True)

# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ('MAF', 'Effect', 'SE', 'Pval'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.finngen_R7_I9_HYPERTENSION.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Hypertension (HTN) / Cohort: UKB

- File name: 20002_1065_logistic.EUR.sumstats.MACfilt.txt
- tab-separated
- Columns: SNP     CHR     BP      A1      TEST    NMISS   OR      SE      L95     U95     STAT    P       A2      MAF     NCHROBS SNPID_UKB       A1_UKB  A2_UKB  INFO_UKB        MAF_UKB
- GRCh37

README:\
SNP: unique ID of the SNP consists of chromosome, position and alphabetically ordered alleles\
CHR: chromosome\
BP: base pair position on GRCh37\
A1: effect allele\
TEST: Type of test (ADD for all files)\
NMISS: Number of non-missing genotypes\
BETA/OR: Regression coefficient or odds ratio\
SE: Standard error (for binary traits, SE in logOR scale)\
L95: Lower bound on confidence interval for CMH odds ratio\
U95: Upper bound on confidence interval for CMH odds ratio\
STAT: Coefficient t-statistics\
P: P-value\
A2: non effect allele\
MAF: Minor allele frequency\
NCHROBS: Number of allele observation\
SNPID_UKB: rsID provided by UK Biobank\
A1_UKB: A1 allele in UK Biobank\
A2_UKB: A2 allele in UK Biobank\
INFO_UKB: Info score provided by UK Biobank\
MAF_UKB: MAF of entire UK Biobank samples\

QC performed:
- Remove unnecessary columns: TEST, NMISS, L95, U95, STAT, NCHROBS, A1_UKB, A2_UKB, MAF_UKB
- Code missing data as NA
- OR to beta
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os
from math import log, exp

file = "20002_1065_logistic.EUR.sumstats.MACfilt.txt"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=['SNP', 'TEST', 'NMISS', 'L95', 'U95', 'STAT', 'NCHROBS', 'A1_UKB', 'A2_UKB', 'MAF_UKB'], inplace=True)

# Code missing data as NA (values stay strings until the final type assignment)
for col in ('SNPID_UKB', 'CHR', 'BP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ('MAF', 'OR', 'SE', 'P'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))

# OR to ln(OR)
df['Effect'] = df['OR'].apply(lambda x: 'NA' if x == 'NA' else str(log(float(x))))


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when the effect allele is minor, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when the effect allele is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


def _flipped_or(row):
    """Return the odds ratio matching the re-oriented effect ('NA' if unavailable)."""
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    if is_minor:
        return row['OR']
    flipped_effect = row['Effect_new']
    return 'NA' if flipped_effect == 'NA' else str(exp(float(flipped_effect)))


# MAF check: re-orient every variant so that the effect allele (A1) is the minor allele
df['MAF check'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF_new'] = df.apply(_flip_value, axis=1, args=('MAF', lambda freq: 1 - freq))
df['A1_new'] = df.apply(_pick_allele, axis=1, args=('A1', 'A2'))
df['A2_new'] = df.apply(_pick_allele, axis=1, args=('A2', 'A1'))
df['Effect_new'] = df.apply(_flip_value, axis=1, args=('Effect', lambda beta: -1 * beta))
# The odds ratio is recomputed from the flipped ln(OR) so it stays consistent with A1
df['OR_new'] = df.apply(_flipped_or, axis=1)
df.drop(columns=['MAF', 'MAF check', 'A1', 'A2', 'Effect', 'OR'], inplace=True)


# Rename columns
df.rename(columns={'SNPID_UKB':'SNP', 'BP':'POS', 'P':'Pval', 'MAF_new':'MAF', 'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect', 'OR_new':'OR'}, inplace=True)

# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'OR', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ('MAF', 'Effect', 'OR', 'SE', 'Pval'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))


# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.20002_1065_logistic.EUR.sumstats.MACfilt.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Hypertension (HTN) / Cohort: [METAL] FinnGen release 7, UKB

- File name: ./meta_analysis/HTN_FinngenR7_UKB/meta.HTN.FinngenR7.UKBwatanabe.1.tbl
- tab-separated
- Columns: MarkerName      Allele1 Allele2 Freq1   FreqSE  Effect  StdErr  P-value Direction       HetISq  HetChiSq        HetDf   HetPVal
- GRCh37

README:\


QC performed:
- Remove unnecessary columns: FreqSE, Direction, HetISq, HetChiSq, HetDf, HetPVal
- Code missing data as NA
- CHR, POS matching from 1kG
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:

import pandas as pd
import os
from math import log, exp

# Load the METAL meta-analysis table (tab-separated)
file = "./meta_analysis/HTN_FinngenR7_UKB/meta.HTN.FinngenR7.UKBwatanabe.1.tbl"
df = pd.read_csv(file, sep='\t', index_col=False)

# Columns that are not carried into the QC output
unused_columns = ['FreqSE', 'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal']
df = df.drop(columns=unused_columns)


def _text_or_na(value):
    """Render a cell as text, or 'NA' when is_na() reports it missing."""
    return 'NA' if is_na(value) else str(value)


def _number_or_na(value):
    """Render a cell as the string form of its float value, or 'NA' when missing."""
    return 'NA' if is_na(value) else str(float(value))


# Missing values become the literal 'NA'; everything else is stored as a string for now
for text_column in ('MarkerName', 'Allele1', 'Allele2'):
    df[text_column] = df[text_column].apply(_text_or_na)
for numeric_column in ('Freq1', 'Effect', 'StdErr', 'P-value'):
    df[numeric_column] = df[numeric_column].apply(_number_or_na)

# Bring in CHRPOS from the 1kG reference, matching MarkerName against rsID;
# a left join keeps markers that have no reference entry
df = pd.merge(df, var_match_ref, how='left', left_on='MarkerName', right_on='rsID')

def check_chrpos_na(x):
    """Return True when a CHRPOS cell is missing.

    A value counts as missing when it is a float NaN (checked with
    ``is_floatNan``) or one of the NA spellings recognised by ``is_strna``.

    Args:
        x: A single cell of the ``CHRPOS`` column.

    Returns:
        bool: True if ``x`` is missing, False otherwise.
    """
    try:
        return is_floatNan(x)
    except TypeError:
        # isnan() rejects non-numeric input such as a "chr:pos" string; only that
        # case falls back to the textual check, any other error propagates.
        return is_strna(x)

# Split the reference CHRPOS identifier ("<chr>:<pos>") into separate columns;
# markers without a reference match get 'NA' for both
df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when Allele1 is the minor allele, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when Allele1 is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


# MAF calculation and check: re-orient so that the effect allele (A1) is the minor allele
df['MAF check'] = df['Freq1'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF_new'] = df.apply(_flip_value, axis=1, args=('Freq1', lambda freq: 1 - freq))
df['A1_new'] = df.apply(_pick_allele, axis=1, args=('Allele1', 'Allele2'))
df['A2_new'] = df.apply(_pick_allele, axis=1, args=('Allele2', 'Allele1'))
df['Effect_new'] = df.apply(_flip_value, axis=1, args=('Effect', lambda beta: -1 * beta))
df.drop(columns=['Freq1', 'MAF check', 'Allele1', 'Allele2', 'Effect'], inplace=True)

# Rename columns
df.rename(columns={'MarkerName':'SNP', 'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect', 'MAF_new':'MAF', 'StdErr':'SE', 'P-value':'Pval'}, inplace=True)

# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
# NOTE(review): int() assumes numeric chromosome labels in the reference CHRPOS -- confirm.
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['SNP'] = df['SNP'].apply(lambda x: 'NA' if x == 'NA' else str(x))
# Alleles are upper-cased here
for col in ('A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x).upper())
for col in ('MAF', 'Effect', 'SE', 'Pval'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.meta.HTN.FinngenR7.UKBwatanabe.1.tbl.txt"
df.to_csv(save_filename, sep='\t', index=False)

# Type II Diabetes (T2D)

## Type II Diabetes (T2D) / Cohort: Finngen release 7

- File name: finngen_R7_E4_DM2
- tab-separated
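- Columns (per the README below): #chrom pos ref alt rsids nearest_genes pval mlogp beta sebeta af_alt af_alt_cases af_alt_controls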
- GRCh38

README:\
#chrom: chromosome on build GRCh38 (1-23)\
pos: position in base pairs on build GRCh38\
ref: reference allele\
alt: alternative allele (effect allele)\
rsids: variant identifier\
nearest_genes: nearest gene(s) (comma separated) from variant\
pval: p-value\
mlogp: -log10(p-value)\
beta: effect size (log(OR) scale) estimated for the alternative allele\
sebeta: standard error of the effect size\
af_alt: alternative (effect) allele frequency\
af_alt_cases: alternative (effect) allele frequency among cases\
af_alt_controls: alternative (effect) allele frequency among controls\

QC performed:
- liftover
- Remove unnecessary columns: mlogp, nearest_genes, af_alt_cases, af_alt_controls, n_hom_cases, n_hom_ref_cases, n_het_cases, n_hom_controls, n_hom_ref_controls, n_het_controls
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
# Liftover
import pandas as pd
import os

# FinnGen summary statistics (GRCh38) and the directory that receives the liftOver input
file = 'finngen_R7_E4_DM2'
lifting_dir = './T2D_Finngen_r7'

# Create the output directory up front so that writing the BED file cannot fail
# just because the folder does not exist yet
os.makedirs(lifting_dir, exist_ok=True)

## Generate bed file
df = pd.read_csv(file, sep='\t', index_col=False)

# Only variants with an rsID are written: the rsID is the key used to match the
# lifted record back to the summary statistics. The explicit copy makes the
# column assignments below act on an independent frame, not on a filtered view.
df_bed = df.loc[df['rsids'].notnull(), ['#chrom', 'pos', 'rsids']].copy()

# BED layout: chromosome ("chr"-prefixed), start (= pos - 1), end (= pos), name
df_bed['chr'] = df_bed['#chrom'].apply(lambda x: "chr{}".format(x))
df_bed['bp-1'] = df_bed['pos'].apply(lambda x: int(x) - 1)
df_bed = df_bed[['chr', 'bp-1', 'pos', 'rsids']]

# NOTE(review): the BED file is written inside lifting_dir, whereas the liftOver
# command in the following cell uses the bare file name -- run that command from
# within lifting_dir or adjust its paths.
df_bed.to_csv(os.path.join(lifting_dir, "T2D_Finngen_r7.bed"), sep='\t', header=False, index=False)

In [None]:
## Perform liftover
!liftOver T2D_Finngen_r7.bed GRCh38_to_GRCh37.chain.gz T2D_Finngen_r7.lifted T2D_Finngen_r7.unlifted

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import csv

file = 'finngen_R7_E4_DM2'
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns
df.drop(columns=["mlogp", "nearest_genes", "af_alt_cases", "af_alt_controls"], inplace=True)

# Variants with an rsID are matched to their liftOver record below; variants
# without one are re-attached afterwards with their original coordinates
has_rsid = df['rsids'].notnull()
df_ = df[has_rsid].copy()
df_rsid_na = df[~has_rsid].copy()

## Match bp from liftover
liftover_out = pd.read_csv("/data1/sanghyeon/Projects/MetabolicSyndrome/GWASsumstat/litftover_sumstat/T2D_Finngen_r7/T2D_Finngen_r7.lifted", sep='\t', index_col=False, names=['CHR', 'BP-1', 'BP', 'SNP'])
liftover_out.drop(columns=['BP-1'], inplace=True)

## Add the records that follow a "#Duplicated in new" tag in the unlifted file
# NOTE(review): liftOver's unmapped output normally echoes the *input* record
# after each reason line, so these rows may still carry GRCh38 coordinates --
# confirm before relying on their positions.
unlifted_file = "/data1/sanghyeon/Projects/MetabolicSyndrome/GWASsumstat/litftover_sumstat/T2D_Finngen_r7/T2D_Finngen_r7.unlifted"
retain = []
with open(unlifted_file, 'r') as f:
    take_next = False
    for row in csv.reader(f, delimiter='\t'):
        if not row:
            # Blank line: nothing to inspect (row[0] would raise IndexError)
            continue
        if take_next:
            retain.append(row)
            take_next = False
        if row[0] == '#Duplicated in new':
            take_next = True
unlifted_retained = pd.DataFrame(retain, columns=['CHR', 'BP-1', 'BP', 'SNP'])
unlifted_retained.drop(columns=['BP-1'], inplace=True)

liftover_out = pd.concat([liftover_out, unlifted_retained])
# Variants that share position and rsID (e.g. multi-allelic sites) produce
# identical liftOver records; keep one of each, otherwise the many-to-many rsID
# merge below would multiply those rows
liftover_out.drop_duplicates(inplace=True)


def _chrom_to_int(label):
    """Convert a liftOver chromosome label to int, or NaN if it is not numeric.

    The BED file is written with a "chr" prefix, so lifted records carry labels
    such as "chr7". The prefix is stripped before conversion; plain integers and
    integer-like strings are accepted unchanged.
    """
    if isinstance(label, str) and label[:3].lower() == 'chr':
        label = label[3:]
    return int(label) if is_integer(label) else float('nan')


## Start matching (one chromosome at a time; parts are concatenated once at the end)
lifted_parts = []
for chr_i in tqdm(range(1, 24), leave=False):
    df_partial = df_.loc[df_['#chrom'] == chr_i]

    lifted_df_partial = df_partial.merge(liftover_out, left_on='rsids', right_on='SNP', how='left')
    # From here on CHR/BP/SNP come from the liftOver record
    lifted_df_partial.drop(columns=['#chrom', 'pos', 'rsids'], inplace=True)
    lifted_df_partial['CHR'] = lifted_df_partial['CHR'].apply(_chrom_to_int)
    lifted_df_partial['BP'] = lifted_df_partial['BP'].apply(lambda bp: int(bp) if is_integer(bp) else float('nan'))
    lifted_parts.append(lifted_df_partial)
lifted_df = pd.concat(lifted_parts)

del liftover_out


# Attach rsID missing df (these will be removed during EasyQC step)
df_rsid_na.rename(columns={'#chrom':'CHR', 'pos':'BP', 'rsids':'SNP'}, inplace=True)
df_rsid_na = df_rsid_na[['ref', 'alt', 'pval', 'beta', 'sebeta', 'af_alt', 'CHR', 'BP', 'SNP']]

df = pd.concat([lifted_df, df_rsid_na])

# Code missing data as NA (values stay strings until the final type assignment)
for col in ('CHR', 'BP'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(int(x)))
for col in ('SNP', 'ref', 'alt'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ('pval', 'beta', 'sebeta', 'af_alt'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when the effect allele is minor, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when the effect allele is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


# MAF check: re-orient every variant so that the effect allele (A1) is the minor allele
df['MAF check'] = df['af_alt'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF_new'] = df.apply(_flip_value, axis=1, args=('af_alt', lambda freq: 1 - freq))
df['alt_new'] = df.apply(_pick_allele, axis=1, args=('alt', 'ref'))
df['ref_new'] = df.apply(_pick_allele, axis=1, args=('ref', 'alt'))
df['Effect'] = df.apply(_flip_value, axis=1, args=('beta', lambda beta: -1 * beta))
df.drop(columns=['af_alt', 'MAF check', 'alt', 'ref', 'beta'], inplace=True)

# Rename columns
df.rename(columns={'ref_new':'A2', 'alt_new':'A1', 'pval':'Pval', 'sebeta':'SE', 'BP':'POS', 'MAF_new':'MAF'}, inplace=True)

# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ('MAF', 'Effect', 'SE', 'Pval'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.finngen_R7_E4_DM2.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Type II Diabetes (T2D) / Cohort: Mahajan 2022

- File name: DIAMANTE-EUR.sumstat.txt
- Whitespace-separated
- Columns: chromosome(b37) position(b37) chrposID rsID effect_allele other_allele effect_allele_frequency Fixed-effects_beta Fixed-effects_SE Fixed-effects_p-value
- GRCh37

README:\
A space delimited file with one row per SNV including the following columns:\
chromosome and position (hg19, build 37); \
chromosome and position ID; \
rsID; \
effect allele and other allele; \
effect allele frequency; \
fixed-effects meta-analysis beta (log-odds ratio); \
fixed-effects meta-analysis standard error (corrected for genomic control); \
fixed-effects meta-analysis association p-value (corrected for genomic control).

QC performed:
- Drop unnecessary columns: chrposID
- Rename columns
- Code missing data as NA
- Capitalize allele
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = 'DIAMANTE-EUR.sumstat.txt'
df = pd.read_csv(file, sep=' ', index_col=False)

# Drop unnecessary columns: chrposID
df.drop(columns=['chrposID'], inplace=True)

# Rename columns
df.rename(columns={'chromosome(b37)':'CHR', 'position(b37)':'POS', 'rsID':'SNP', 'effect_allele':'A1', 'other_allele':'A2',
                'effect_allele_frequency':'EAF', 'Fixed-effects_beta':'Effect', 'Fixed-effects_SE':'SE', 'Fixed-effects_p-value':'Pval'}, inplace=True)

# Code missing data as NA (values stay strings until the final type assignment)
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(int(x)))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ('Pval', 'Effect', 'SE', 'EAF'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))

# Capitalize allele
for col in ('A1', 'A2'):
    df[col] = df[col].apply(lambda x: x.upper())


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when the effect allele is minor, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when the effect allele is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


# MAF check: re-orient every variant so that the effect allele (A1) is the minor allele
df['MAF check'] = df['EAF'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF'] = df.apply(_flip_value, axis=1, args=('EAF', lambda freq: 1 - freq))
df['A1_new'] = df.apply(_pick_allele, axis=1, args=('A1', 'A2'))
df['A2_new'] = df.apply(_pick_allele, axis=1, args=('A2', 'A1'))
df['Effect_new'] = df.apply(_flip_value, axis=1, args=('Effect', lambda beta: -1 * beta))
df.drop(columns=['EAF', 'MAF check', 'A1', 'A2', 'Effect'], inplace=True)

df.rename(columns={'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect'}, inplace=True)


# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
for col in ('CHR', 'POS'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ('MAF', 'Effect', 'SE', 'Pval'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.DIAMANTE-EUR.sumstat.txt"
df.to_csv(save_filename, sep='\t', index=False)



## Type II Diabetes (T2D) / Cohort: MVP

- File name: T2D_MVP_only_EUR_dbGaP.txt
- Whitespace-separated
- Columns: RSID CHR POS NEA EA EAF BETA SE P N MACH_R2 CALLRATE HWE OR
- GRCh37

README:\
RSID	Reference SNP identifier. rs number (RSID) assigned by dbSNP\
CHR	chromosomal number: 1, 2 . .,22\
POS	Locus position on a chromosome (hg19)\
NEA	Non-effect allele\
EA	Effect allele\
EAF	Effect allele frequency across all samples\
BETA	Regression coefficient (this is log(OR))\
SE	Standard error of estimated effect\
P	P value based on test statistic\
N	Number of subjects participated in data analysis\
MACH_R2	imputation quality\
CALLRATE	call rate for genotyped SNPs\
HWE	HWE p-value\
OR	Odds ratio\


QC performed:
- Drop unnecessary columns: HWE, CALLRATE
- Rename columns
- Code missing data as NA
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os
from math import log

# exp() is needed to recompute the odds ratio after an allele flip
from math import exp

file = 'T2D_MVP_only_EUR_dbGaP.txt'
df = pd.read_csv(file, sep=' ', index_col=False)


# Drop unnecessary columns: HWE, CALLRATE
df.drop(columns=['HWE', 'CALLRATE'], inplace=True)

# Rename columns
df.rename(columns={'RSID': 'SNP', 'NEA':'A2', 'EA':'A1', 'BETA':'Effect', 'P':'Pval', 'MACH_R2':'INFO'}, inplace=True)

# Code missing data as NA (values stay strings until the final type assignment)
for col in ('CHR', 'POS', 'N'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(int(x)))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(x))
for col in ('Pval', 'Effect', 'SE', 'EAF', 'OR', 'INFO'):
    df[col] = df[col].apply(lambda x: 'NA' if is_na(x) else str(float(x)))


def _pick_allele(row, if_minor, if_major):
    """Return row[if_minor] when the effect allele is minor, row[if_major] otherwise.

    Returns 'NA' when the frequency (and therefore 'MAF check') is missing.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    return row[if_minor] if is_minor else row[if_major]


def _flip_value(row, column, flip):
    """Return row[column] as is when the effect allele is minor, else str(flip(value)).

    A missing frequency or a missing value yields 'NA' instead of raising
    ValueError from float('NA').
    """
    is_minor = row['MAF check']
    value = row[column]
    if is_minor == 'NA' or value == 'NA':
        return 'NA'
    return value if is_minor else str(flip(float(value)))


def _flipped_or(row):
    """Return the odds ratio matching the re-oriented effect ('NA' if unavailable).

    When the alleles are swapped the odds ratio is recomputed as
    exp(flipped log(OR)), so the OR column refers to the same allele as Effect.
    """
    is_minor = row['MAF check']
    if is_minor == 'NA':
        return 'NA'
    if is_minor:
        return row['OR']
    flipped_effect = row['Effect_new']
    return 'NA' if flipped_effect == 'NA' else str(exp(float(flipped_effect)))


# MAF check: re-orient every variant so that the effect allele (A1) is the minor allele
df['MAF check'] = df['EAF'].apply(lambda x: 'NA' if x == 'NA' else float(x) <= 0.5)
df['MAF'] = df.apply(_flip_value, axis=1, args=('EAF', lambda freq: 1 - freq))
df['A1_new'] = df.apply(_pick_allele, axis=1, args=('A1', 'A2'))
df['A2_new'] = df.apply(_pick_allele, axis=1, args=('A2', 'A1'))
df['Effect_new'] = df.apply(_flip_value, axis=1, args=('Effect', lambda beta: -1 * beta))
# Without this step OR would still describe the original effect allele after a swap
df['OR_new'] = df.apply(_flipped_or, axis=1)
df.drop(columns=['EAF', 'MAF check', 'A1', 'A2', 'Effect', 'OR'], inplace=True)

df.rename(columns={'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect', 'OR_new':'OR'}, inplace=True)


# Reorder columns (copy so the assignments below do not act on a slice of the old frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'OR', 'SE', 'Pval', 'N', 'INFO']].copy()

# Assign appropriate data type except NA
for col in ('CHR', 'POS', 'N'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else int(x))
for col in ('SNP', 'A1', 'A2'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else str(x))
for col in ('MAF', 'Effect', 'OR', 'SE', 'Pval', 'INFO'):
    df[col] = df[col].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.T2D_MVP_only_EUR_dbGaP.txt"
df.to_csv(save_filename, sep='\t', index=False)


## Type II Diabetes (T2D) / Cohort: [METAL] FinnGen release 7, Mahajan 2022, MVP

- File name: ./meta_analysis/T2D_FinngenR7_Mahajan2022_MVP/meta.T2D.FinngenR7.Mahajan2022.MVP.1.tbl
- tab-separated
- Columns: MarkerName      Allele1 Allele2 Freq1   FreqSE  Effect  StdErr  P-value Direction       HetISq  HetChiSq        HetDf   HetPVal
- GRCh37

README:\


QC performed:
- Remove unnecessary columns: FreqSE, Direction, HetISq, HetChiSq, HetDf, HetPVal
- Code missing data as NA
- CHR, POS matching from 1kG
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os
from math import log, exp

# Load the METAL meta-analysis table (tab-separated)
file = "./meta_analysis/T2D_FinngenR7_Mahajan2022_MVP/meta.T2D.FinngenR7.Mahajan2022.MVP.1.tbl"
df = pd.read_csv(file, sep='\t', index_col=False)

# Columns that are not carried into the QC output
unused_columns = ['FreqSE', 'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal']
df = df.drop(columns=unused_columns)


def _text_or_na(value):
    """Render a cell as text, or 'NA' when is_na() reports it missing."""
    return 'NA' if is_na(value) else str(value)


def _number_or_na(value):
    """Render a cell as the string form of its float value, or 'NA' when missing."""
    return 'NA' if is_na(value) else str(float(value))


# Missing values become the literal 'NA'; everything else is stored as a string for now
for text_column in ('MarkerName', 'Allele1', 'Allele2'):
    df[text_column] = df[text_column].apply(_text_or_na)
for numeric_column in ('Freq1', 'Effect', 'StdErr', 'P-value'):
    df[numeric_column] = df[numeric_column].apply(_number_or_na)

# Bring in CHRPOS from the 1kG reference, matching MarkerName against rsID;
# a left join keeps markers that have no reference entry
df = pd.merge(df, var_match_ref, how='left', left_on='MarkerName', right_on='rsID')

def check_chrpos_na(x):
    """Return True when a CHRPOS value coming out of the 1kG merge is missing.

    The value is first tested with is_floatNan (NaN check). If that check
    cannot handle the value (for example because it is a string), the value
    is compared against the textual NA tokens with is_strna instead.
    """
    try:
        return is_floatNan(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors a NaN test raises on unsuitable input are handled here;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return is_strna(x)

# Split the matched CHRPOS identifier on ':' into CHR (first part) and POS (second part);
# variants without a 1kG match get 'NA' in both columns.
df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when Freq1 <= 0.5 (Allele1 is kept as A1), False when the
# alleles must be swapped, and 'NA' when Freq1 is missing.
df['MAF check'] = df['Freq1'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Freq1'] if row['MAF check'] else str(1 - float(row['Freq1'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Allele1'] if row['MAF check'] else row['Allele2'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Allele2'] if row['MAF check'] else row['Allele1'], axis=1)
# A missing effect stays 'NA' when the alleles are swapped; without this guard
# float('NA') raises ValueError and aborts the whole cell.
df['Effect_new'] = df.apply(
    lambda row: 'NA' if row['MAF check'] == 'NA' or row['Effect'] == 'NA'
    else row['Effect'] if row['MAF check']
    else str(-1 * float(row['Effect'])),
    axis=1,
)
df.drop(columns=['Freq1', 'MAF check', 'Allele1', 'Allele2', 'Effect'], inplace=True)

# Rename columns
df.rename(columns={'MarkerName':'SNP', 'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect', 'MAF_new':'MAF', 'StdErr':'SE', 'P-value':'Pval'}, inplace=True)

# Reorder columns (explicit copy so the assignments below act on an independent frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
df['CHR'] = df['CHR'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['POS'] = df['POS'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['SNP'] = df['SNP'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A1'] = df['A1'].apply(lambda x: 'NA' if x == 'NA' else str(x).upper())
df['A2'] = df['A2'].apply(lambda x: 'NA' if x == 'NA' else str(x).upper())
df['MAF'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Effect'] = df['Effect'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['SE'] = df['SE'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Pval'] = df['Pval'].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
# NOTE(review): `dir` is not assigned in this cell. Unless an earlier cell binds
# it to a path, it is the Python builtin and os.path.join() raises TypeError.
# Use it only when it really is a path; otherwise write to the current
# directory like the other cells do.
out_dir = dir if isinstance(dir, (str, os.PathLike)) else os.curdir
save_filename = os.path.join(out_dir, "meta.T2D.FinngenR7.Mahajan2022.MVP.1.tbl.txt")
df.to_csv(save_filename, sep='\t', index=False)

## Type II Diabetes (T2D) / Cohort: [METAL] FinnGen release 7, MVP

- File name: ./meta_analysis/T2D_FinngenR7_MVP/meta.T2D.FinngenR7.MVP.1.tbl
- tab-separated
- Columns: MarkerName      Allele1 Allele2 Freq1   FreqSE  Effect  StdErr  P-value Direction       HetISq  HetChiSq        HetDf   HetPVal
- GRCh37

README:\


QC performed:
- Remove unnecessary columns: FreqSE, Direction, HetISq, HetChiSq, HetDf, HetPVal
- Code missing data as NA
- CHR, POS matching from 1kG
- MAF check
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os
from math import log, exp

file = "./meta_analysis/T2D_FinngenR7_MVP/meta.T2D.FinngenR7.MVP.1.tbl"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns (none of them is carried into the output table)
df = df.drop(columns=['FreqSE', 'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal'])

# Code missing data as NA.
# Identifier and allele columns are stored as plain text; numeric columns are
# passed through float() first so every non-missing value has the same textual form.
for text_col in ['MarkerName', 'Allele1', 'Allele2']:
    df[text_col] = df[text_col].apply(lambda value: 'NA' if is_na(value) else str(value))
for numeric_col in ['Freq1', 'Effect', 'StdErr', 'P-value']:
    df[numeric_col] = df[numeric_col].apply(lambda value: 'NA' if is_na(value) else str(float(value)))

# CHR and POS from 1kG: left join on rsID so that unmatched variants are kept
df = pd.merge(df, var_match_ref, left_on='MarkerName', right_on='rsID', how='left')

def check_chrpos_na(x):
    """Return True when a CHRPOS value coming out of the 1kG merge is missing.

    The value is first tested with is_floatNan (NaN check). If that check
    cannot handle the value (for example because it is a string), the value
    is compared against the textual NA tokens with is_strna instead.
    """
    try:
        return is_floatNan(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors a NaN test raises on unsuitable input are handled here;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return is_strna(x)

# Split the matched CHRPOS identifier on ':' into CHR (first part) and POS (second part);
# variants without a 1kG match get 'NA' in both columns.
df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when Freq1 <= 0.5 (Allele1 is kept as A1), False when the
# alleles must be swapped, and 'NA' when Freq1 is missing.
df['MAF check'] = df['Freq1'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Freq1'] if row['MAF check'] else str(1 - float(row['Freq1'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Allele1'] if row['MAF check'] else row['Allele2'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['Allele2'] if row['MAF check'] else row['Allele1'], axis=1)
# A missing effect stays 'NA' when the alleles are swapped; without this guard
# float('NA') raises ValueError and aborts the whole cell.
df['Effect_new'] = df.apply(
    lambda row: 'NA' if row['MAF check'] == 'NA' or row['Effect'] == 'NA'
    else row['Effect'] if row['MAF check']
    else str(-1 * float(row['Effect'])),
    axis=1,
)
df.drop(columns=['Freq1', 'MAF check', 'Allele1', 'Allele2', 'Effect'], inplace=True)

# Rename columns
df.rename(columns={'MarkerName':'SNP', 'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect', 'MAF_new':'MAF', 'StdErr':'SE', 'P-value':'Pval'}, inplace=True)

# Reorder columns (explicit copy so the assignments below act on an independent frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
df['CHR'] = df['CHR'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['POS'] = df['POS'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['SNP'] = df['SNP'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A1'] = df['A1'].apply(lambda x: 'NA' if x == 'NA' else str(x).upper())
df['A2'] = df['A2'].apply(lambda x: 'NA' if x == 'NA' else str(x).upper())
df['MAF'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Effect'] = df['Effect'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['SE'] = df['SE'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Pval'] = df['Pval'].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "meta.T2D.FinngenR7.MVP.1.tbl.txt"
df.to_csv(save_filename, sep='\t', index=False)

# Waist circumference (WC)

## Waist circumference / Cohort: UKB

- File name: f.48.0.0_res.EUR.sumstats.MACfilt.txt
- tab-separated
- Columns: SNP     CHR     BP      A1      TEST    NMISS   BETA    SE      L95     U95     STAT    P       A2      MAF     NCHROBS SNPID_UKB       A1_UKB  A2_UKB  INFO_UKB        MAF_UKB
- GRCh37

README:\
SNP: unique ID of the SNP consists of chromosome, position and alphabetically ordered alleles\
CHR: chromosome\
BP: base pair position on GRCh37\
A1: effect allele\
TEST: Type of test (ADD for all files)\
NMISS: Number of non-missing genotypes\
BETA/OR: Regression coefficient or odds ratio\
SE: Standard error (for binary traits, SE in logOR scale)\
L95: Lower bound on confidence interval for CMH odds ratio\
U95: Upper bound on confidence interval for CMH odds ratio\
STAT: Coefficient t-statistics\
P: P-value\
A2: non effect allele\
MAF: Minor allele frequency\
NCHROBS: Number of allele observation\
SNPID_UKB: rsID provided by UK Biobank\
A1_UKB: A1 allele in UK Biobank\
A2_UKB: A2 allele in UK Biobank\
INFO_UKB: Info score provided by UK Biobank\
MAF_UKB: MAF of the entire UK Biobank sample\

QC performed:
- Remove unnecessary columns: SNP (replaced by SNPID_UKB), TEST, NMISS, L95, U95, STAT, NCHROBS, A1_UKB, A2_UKB, MAF_UKB
- Code missing data as NA
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "f.48.0.0_res.EUR.sumstats.MACfilt.txt"
df = pd.read_csv(file, sep='\t', index_col=False)

# Remove unnecessary columns ('SNP' is dropped because SNPID_UKB takes over that name below)
df.drop(columns=['SNP', 'TEST', 'NMISS', 'L95', 'U95', 'STAT', 'NCHROBS', 'A1_UKB', 'A2_UKB', 'MAF_UKB'], inplace=True)

# Code missing data as NA
df['SNPID_UKB'] = df['SNPID_UKB'].apply(lambda x: 'NA' if is_na(x) else str(x))
df['CHR'] = df['CHR'].apply(lambda x: 'NA' if is_na(x) else str(x))
df['BP'] = df['BP'].apply(lambda x: 'NA' if is_na(x) else str(x))
df['A1'] = df['A1'].apply(lambda x: 'NA' if is_na(x) else str(x))
df['A2'] = df['A2'].apply(lambda x: 'NA' if is_na(x) else str(x))
df['MAF'] = df['MAF'].apply(lambda x: 'NA' if is_na(x) else str(float(x)))
df['BETA'] = df['BETA'].apply(lambda x: 'NA' if is_na(x) else str(float(x)))
df['SE'] = df['SE'].apply(lambda x: 'NA' if is_na(x) else str(float(x)))
df['P'] = df['P'].apply(lambda x: 'NA' if is_na(x) else str(float(x)))

# Rename columns
df.rename(columns={'SNPID_UKB':'SNP', 'BP':'POS', 'BETA':'Effect', 'P':'Pval'}, inplace=True)

# Reorder columns (explicit copy so the assignments below act on an independent frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval']].copy()

# Assign appropriate data type except NA
# CHR/POS go through float() before int(): when the input column contains a
# missing value pandas reads it as float, so the text form is e.g. '1.0', and
# int('1.0') raises ValueError.
df['CHR'] = df['CHR'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
df['POS'] = df['POS'].apply(lambda x: 'NA' if x == 'NA' else int(float(x)))
df['SNP'] = df['SNP'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A1'] = df['A1'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A2'] = df['A2'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['MAF'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Effect'] = df['Effect'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['SE'] = df['SE'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Pval'] = df['Pval'].apply(lambda x: 'NA' if x == 'NA' else float(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.f.48.0.0_res.EUR.sumstats.MACfilt.txt"
df.to_csv(save_filename, sep='\t', index=False)

## Waist circumference / Cohort: GIANT 2015

- File name: GIANT_2015_WC_COMBINED_EUR.txt
- tab-separated
- Columns: MarkerName Allele1 Allele2 Freq.Allele1.HapMapCEU b SE p N
- GRCh37

README:\
MarkerName: The dbSNP name of the genetic marker \
Allele1: The first allele (hg19 + strand). Where the regression coefficients (betas) are provided, the first allele is the effect allele. Where betas are not provided (typically the 2010 data), the first allele is the trait-increasing allele. \
Allele2: The second allele (hg19 + strand) \
Freq.Allele1.HapMapCEU: The allele frequency of Allele1 in the HapMap CEU population \
b: beta \
SE: standard error \
p: p-value after meta-analysis using regression coefficients (beta and standard error), and after correction for inflation of test statistics using genomic control both at the individual study level and again after meta-analysis \
N: Number of observations \


QC performed:
- Remove binary rows (the first 8 lines of the file are skipped)
- Code missing data as NA
- Get CHR and POS
- Check MAF
- Rename columns
- Reorder columns
- Assign appropriate data type except NA
- Re-code 'NA' to .

In [None]:
import pandas as pd
import os

file = "GIANT_2015_WC_COMBINED_EUR.txt"
with open(file, 'rb') as f:
    lines = f.readlines()

# Skip the first 8 lines, then decode each remaining line and split it on tabs
data = [raw_line.decode("utf-8").rstrip().split(sep='\t') for raw_line in lines[8:]]

df = pd.DataFrame(data, columns=['SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N'])

# Code missing data as NA.
# Text columns are kept as-is, float columns are normalised through float(),
# and the sample size through int().
for text_col in ['SNP', 'A1', 'A2']:
    df[text_col] = df[text_col].apply(lambda value: 'NA' if is_na(value) else str(value))
for numeric_col in ['MAF', 'Effect', 'SE', 'Pval']:
    df[numeric_col] = df[numeric_col].apply(lambda value: 'NA' if is_na(value) else str(float(value)))
df['N'] = df['N'].apply(lambda value: 'NA' if is_na(value) else str(int(value)))

# CHR and POS from 1kG: left join on rsID so that unmatched variants are kept
df = pd.merge(df, var_match_ref, left_on='SNP', right_on='rsID', how='left')

def check_chrpos_na(x):
    """Return True when a CHRPOS value coming out of the 1kG merge is missing.

    The value is first tested with is_floatNan (NaN check). If that check
    cannot handle the value (for example because it is a string), the value
    is compared against the textual NA tokens with is_strna instead.
    """
    try:
        return is_floatNan(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors a NaN test raises on unsuitable input are handled here;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return is_strna(x)

# Split the matched CHRPOS identifier on ':' into CHR (first part) and POS (second part);
# variants without a 1kG match get 'NA' in both columns.
df['CHRPOS'] = df['CHRPOS'].apply(lambda x: 'NA' if check_chrpos_na(x) else str(x))
df['CHR'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[0]))
df['POS'] = df['CHRPOS'].apply(lambda uniqid: 'NA' if uniqid == 'NA' else str(uniqid.split(sep=':')[1]))
df.drop(columns=['CHRPOS', 'rsID'], inplace=True)

# MAF calculation and check
# 'MAF check' is True when the frequency is <= 0.5 (A1 is kept), False when the
# alleles must be swapped, and 'NA' when the frequency is missing.
df['MAF check'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else True if float(x) <= 0.5 else False)
df['MAF_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['MAF'] if row['MAF check'] else str(1 - float(row['MAF'])), axis=1)
df['A1_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A1'] if row['MAF check'] else row['A2'], axis=1)
df['A2_new'] = df.apply(lambda row: 'NA' if row['MAF check'] == 'NA' else row['A2'] if row['MAF check'] else row['A1'], axis=1)
# A missing effect stays 'NA' when the alleles are swapped; without this guard
# float('NA') raises ValueError and aborts the whole cell.
df['Effect_new'] = df.apply(
    lambda row: 'NA' if row['MAF check'] == 'NA' or row['Effect'] == 'NA'
    else row['Effect'] if row['MAF check']
    else str(-1 * float(row['Effect'])),
    axis=1,
)
df.drop(columns=['MAF', 'MAF check', 'A1', 'A2', 'Effect'], inplace=True)


# Rename columns
df.rename(columns={'MAF_new': 'MAF', 'A1_new':'A1', 'A2_new':'A2', 'Effect_new':'Effect'}, inplace=True)

# Reorder columns (explicit copy so the assignments below act on an independent frame)
df = df[['CHR', 'POS', 'SNP', 'A1', 'A2', 'MAF', 'Effect', 'SE', 'Pval', 'N']].copy()

# Assign appropriate data type except NA
df['CHR'] = df['CHR'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['POS'] = df['POS'].apply(lambda x: 'NA' if x == 'NA' else int(x))
df['SNP'] = df['SNP'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A1'] = df['A1'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['A2'] = df['A2'].apply(lambda x: 'NA' if x == 'NA' else str(x))
df['MAF'] = df['MAF'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Effect'] = df['Effect'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['SE'] = df['SE'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['Pval'] = df['Pval'].apply(lambda x: 'NA' if x == 'NA' else float(x))
df['N'] = df['N'].apply(lambda x: 'NA' if x == 'NA' else int(x))

# Reorder according to CHR and POS
df.sort_values(by=['CHR', 'POS'], ascending=True, inplace=True)


# Re-code 'NA' to .
df.replace(to_replace=['NA'],
            value='.',
            inplace=True
            )

# Save
save_filename = "pQC.GIANT_2015_WC_COMBINED_EUR.txt"
df.to_csv(save_filename, sep='\t', index=False)