# Intro

**Author:** Stephan Cordogan

This notebook stitches together the GWAS summary statistics generated in the previous notebook, if they were written as separate component files [(a)](#Merge-component-files-if-necessary), and meta-analyzes them together with the processed FinnGen file. The FinnGen steps can be skipped if not applicable [(b)](#Load-in-Finngen-GWAS) [(c)](#Process-Finngen-if-desired), **or commented out with a leading `#`** [(d)](#Perform-meta-analysis-using-METAL). You could also load in another summary statistics file if desired. The resulting summary statistics are saved to your workspace bucket as `data/meta_all1_GC.tsv.bgz`.

# Import Necessary Packages

In [None]:
from datetime import datetime
import os
import pandas as pd


In [None]:
# Timestamp taken when this cell runs (local, naive datetime).
start = datetime.now()

# Workspace bucket location read from the environment; None when the
# WORKSPACE_BUCKET variable is not set.
bucket = os.environ.get('WORKSPACE_BUCKET')

# Echo the value as the cell output.
bucket

In [None]:
# List what is stored under data/ in the workspace bucket.
!gsutil ls $WORKSPACE_BUCKET/data

In [None]:
# Download the METAL Linux release archive using wget in Jupyter
!wget https://csg.sph.umich.edu/abecasis/Metal/download/Linux-metal.tar.gz
    
# Unpack the archive into the current directory. The meta-analysis cell
# later runs ./generic-metal/metal, so the archive is expected to unpack there.
!tar -xvzf Linux-metal.tar.gz


# Load in GWAS Summary Statistics

## Merge component files if necessary

In [None]:
import subprocess

# Parameters describing the component files to merge.

# Suffix of the decompressed tables (the bucket copies carry an extra ".bgz").
file_extension = ".tsv"

# Group labels used in the component file names.
ethnicities = [
    "eur",
    "afr",
    "amr",
    "eas",
    "sas",
]

# Chunk identifiers used in the component file names
# (presumably chromosome groupings - confirm against the previous notebook).
file_ids = [
    "1",
    "2",
    "3",
    "45",
    "67",
    "89",
    "101112",
    "131415",
    "161718",
    "19202122",
]

def load_files(file_path, local_file_name):
    """Copy a bgzip-compressed file into the current directory and decompress it.

    Args:
        file_path: Source path handed to ``gsutil cp``.
        local_file_name: Name the copied file has in the current directory;
            expected to end in ``.bgz``.

    Returns:
        ``local_file_name`` with a trailing ``.bgz`` removed, i.e. the name
        of the decompressed file.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` or ``bgzip`` exits with
            a non-zero status.
    """
    subprocess.run(["gsutil", "cp", file_path, "."], check=True)

    # Remove the literal ".bgz" suffix. str.rstrip(".bgz") would instead
    # strip every trailing '.', 'b', 'g' or 'z' character, which mangles
    # names such as "log.bgz" (-> "lo").
    suffix = ".bgz"
    if local_file_name.endswith(suffix):
        decompressed_file = local_file_name[:-len(suffix)]
    else:
        decompressed_file = local_file_name

    # Clear any copy left over from an earlier run before decompressing.
    if os.path.exists(decompressed_file):
        os.remove(decompressed_file)

    subprocess.run(["bgzip", "-d", local_file_name], check=True)
    return decompressed_file

def clean_df(df):
    """Return *df* without the model-fit status columns and without incomplete rows.

    The three ``fit.*`` columns are dropped when present (absent ones are
    ignored); afterwards every row that still contains a missing value is
    removed. The input frame is not modified.
    """
    fit_columns = ["fit.n_iterations", "fit.converged", "fit.exploded"]
    without_fit = df.drop(columns=fit_columns, errors="ignore")
    return without_fit.dropna()

# Merged table per group label, filled in by the loop below.
merged_dataframes = {}

for ethnicity in ethnicities:
    df_list = []

    for file_id in file_ids:
        # Build the compressed file name once and use it for both the bucket
        # path and the local copy, so the two cannot drift apart if
        # file_extension is ever changed.
        local_file_name = f'log_reg_{ethnicity}_{file_id}{file_extension}.bgz'
        file_path = f'{bucket}/data/{local_file_name}'

        # Download + decompress, then drop fit-status columns and rows with NaN.
        decompressed_file = load_files(file_path, local_file_name)
        df = pd.read_csv(decompressed_file, sep="\t")
        df_cleaned = clean_df(df)
        df_list.append(df_cleaned)

        print(f"Processed file: {decompressed_file}")

    # Concatenate all DataFrames for this ethnicity
    merged_df = pd.concat(df_list, ignore_index=True)
    merged_dataframes[ethnicity] = merged_df

    # Save merged DataFrame to file (read back by the METAL pre-processing cell)
    output_file = f'merged_{ethnicity}.tsv'
    merged_df.to_csv(output_file, sep="\t", index=False)
    print(f"Merged files for {ethnicity}, cleaned, and saved as '{output_file}'.")

# View the first rows of two of the merged tables as a sanity check
print(merged_dataframes["eur"].head())
print(merged_dataframes["afr"].head())
print(merged_dataframes["afr"].head())


## Load in full files if created

In [None]:
# log_reg_eur_path = f'{bucket}/data/log_reg_eur.tsv.bgz'
# !gsutil cp {log_reg_eur_path} .
# !bgzip -d log_reg_eur.tsv.bgz 

# log_reg_eas_path = f'{bucket}/data/log_reg_eas.tsv.bgz'
# !gsutil cp {log_reg_eas_path} .
# !bgzip -d log_reg_eas.tsv.bgz 

# log_reg_afr_path = f'{bucket}/data/log_reg_afr.tsv.bgz'
# !gsutil cp {log_reg_afr_path} .
# !bgzip -d log_reg_afr.tsv.bgz 

# log_reg_amr_path = f'{bucket}/data/log_reg_amr.tsv.bgz'
# !gsutil cp {log_reg_amr_path} .
# !bgzip -d log_reg_amr.tsv.bgz 

# log_reg_sas_path = f'{bucket}/data/log_reg_sas.tsv.bgz'
# !gsutil cp {log_reg_sas_path} .
# !bgzip -d log_reg_sas.tsv.bgz 


## Load in Finngen GWAS

In [None]:
# Copy the bgzip-compressed FinnGen summary statistics from the workspace
# bucket into the current directory.
log_reg_finn_path = f'{bucket}/data/fixed_menisc_sumstats.tsv.bgz'
!gsutil cp {log_reg_finn_path} .
# Decompress it; the FinnGen processing cell reads fixed_menisc_sumstats.tsv.
!bgzip -d fixed_menisc_sumstats.tsv.bgz 

# Process summary statistics files so that they are readable by METAL

In [None]:
# Merged per-group summary-statistics files; each one is rewritten in place.
files = {
    'eur': 'merged_eur.tsv',
    'afr': 'merged_afr.tsv',
    'amr': 'merged_amr.tsv',
    'eas': 'merged_eas.tsv',
    'sas': 'merged_sas.tsv'
}

# A variant is kept only if all of these columns are present and non-missing.
required_columns = ['locus', 'position', 'ref', 'alt', 'beta', 'p_value', 'standard_error']

# Process each file except 'finn'
for path in files.values():
    df = pd.read_csv(path, sep='\t')

    # Keep the original locus string under 'position'; 'locus' is rebuilt below.
    df = df.rename(columns={'locus': 'position'})

    # Strip square brackets and double quotes from the allele list.
    df['alleles'] = df['alleles'].str.replace(r'[\[\]"]', '', regex=True)

    # Split "ref,alt" with the NaN-safe .str accessor: rows whose allele
    # string is missing or has no comma get NaN for both ref and alt (and are
    # dropped below) instead of raising TypeError on a non-string value.
    has_pair = df['alleles'].str.contains(',', regex=False, na=False)
    allele_parts = df['alleles'].str.split(',')
    df['ref'] = allele_parts.str[0].where(has_pair)
    df['alt'] = allele_parts.str[1].where(has_pair)

    # Create 'locus' column as 'position_ref_alt'
    df['locus'] = df['position'] + '_' + df['ref'] + '_' + df['alt']

    df = df.dropna(subset=required_columns)

    df.to_csv(path, sep='\t', index=False)


Save completed files to workspace bucket if desired

In [None]:
!bgzip -@ 6 -l 9 -c merged_eur.tsv > merged_eur.tsv.bgz
meta_save_path = f'{bucket}/data/merged_eur.tsv.bgz'
!gsutil cp 'merged_eur.tsv.bgz' {meta_save_path}

!bgzip -@ 6 -l 9 -c merged_afr.tsv > merged_afr.tsv.bgz
meta_save_path = f'{bucket}/data/merged_afr.tsv.bgz'
!gsutil cp 'merged_afr.tsv.bgz' {meta_save_path}

!bgzip -@ 6 -l 9 -c merged_amr.tsv > merged_amr.tsv.bgz
meta_save_path = f'{bucket}/data/merged_amr.tsv.bgz'
!gsutil cp 'merged_amr.tsv.bgz' {meta_save_path}

!bgzip -@ 6 -l 9 -c merged_eas.tsv > merged_eas.tsv.bgz
meta_save_path = f'{bucket}/data/merged_eas.tsv.bgz'
!gsutil cp 'merged_eas.tsv.bgz' {meta_save_path}

!bgzip -@ 6 -l 9 -c merged_sas.tsv > merged_sas.tsv.bgz
meta_save_path = f'{bucket}/data/merged_sas.tsv.bgz'
!gsutil cp 'merged_sas.tsv.bgz' {meta_save_path}

## Process Finngen if desired

In [None]:
# Load the decompressed FinnGen summary statistics.
df_finn = pd.read_csv('fixed_menisc_sumstats.tsv', sep='\t')

# Build 'position' as 'chr<chrom>:<pos>' and 'locus' as 'position_ref_alt',
# the same locus layout the per-group files are given before METAL.
df_finn['position'] = 'chr' + df_finn['chrom'].astype(str) + ':' + df_finn['pos'].astype(str)
df_finn['locus'] = df_finn['position'] + '_' + df_finn['ref'] + '_' + df_finn['alt']

# Rename the effect-size and standard-error columns to the names the METAL
# command file refers to.
# NOTE(review): the METAL command also reads a 'p_value' column, which this
# cell neither creates nor renames - confirm the input file already has it.
df_finn = df_finn.rename(columns={'beta_menisc_fixed': 'beta', 'sebeta_menisc_fixed': 'standard_error'})

output_file_finn = 'processed_finn.tsv'
df_finn.to_csv(output_file_finn, sep='\t', index=False)

In [None]:
# Copy the processed FinnGen table to the workspace bucket's data/ folder.
meta_save_path = f'{bucket}/data/processed_finn.tsv'
!gsutil cp 'processed_finn.tsv' {meta_save_path}

In [None]:
# Preview the first rows of the processed FinnGen table.
df_finn.head()

# Perform meta-analysis using METAL

Comment out the FinnGen line in the cell below (prefix it with `#`) if you do not want FinnGen included in the meta-analysis.

In [None]:
# Define the contents of the meta_analysis.txt command file

# Summary-statistics files handed to METAL, one per line. Hashtag out an
# entry (for example the Finngen one) to leave that file out of the analysis.
metal_input_files = [
    "merged_eur.tsv",
    "merged_afr.tsv",
    "merged_amr.tsv",
    "merged_eas.tsv",
    "merged_sas.tsv",
    "processed_finn.tsv",  # Finngen- can be hashtagged out
]

# The earlier cells write these files to the current working directory, so
# resolve them against it at run time instead of hard-coding one workspace's
# absolute path.
process_lines = "\n".join(
    f"PROCESS {os.path.abspath(name)}" for name in metal_input_files
)

metal_command = f"""
# Set the analysis scheme to STDERR (standard error-based meta-analysis)
SCHEME STDERR
COLUMNCOUNTING LENIENT

# Define the columns in the input files
MARKER locus  # SNP ID
ALLELE ref alt  # Alleles
EFFECT beta  # Effect size (regression coefficient)
STDERR standard_error  # Standard error of the effect size
PVALUE p_value  # P-value

# Enable genomic control correction if desired
# GENOMICCONTROL ON

# Specify the files to be processed
{process_lines}


# Output the meta-analysis results
OUTFILE meta_allGC .tbl

# Perform the meta-analysis
ANALYZE
"""

# Create and write the content to meta_analysis.txt
with open("meta_analysis.txt", "w") as f:
    f.write(metal_command)

print("meta_analysis.txt file has been created.")


In [None]:
# Run METAL on the meta_analysis.txt command file.
!./generic-metal/metal meta_analysis.txt

In [None]:
# Compress the METAL results table (meta_allGC1.tbl) with bgzip and copy it
# to the workspace bucket's data/ folder as meta_all1_GC.tsv.bgz.
!bgzip -@ 6 -l 9 -c meta_allGC1.tbl > meta_all1_GC.tsv.bgz
meta_save_path = f'{bucket}/data/meta_all1_GC.tsv.bgz'
!gsutil cp meta_all1_GC.tsv.bgz {meta_save_path}