# Intro

**Author:** Stephan Cordogan

This notebook generates a Manhattan Plot, QQ Plot, and genomic inflation factor for your meta-analyzed summary statistics.

# Import Necessary Packages

Use a General Analysis environment with a Python kernel.

In [None]:
from datetime import datetime
import os
import pandas as pd


In [None]:
# Look up the workspace bucket from the environment and note when this run began.
bucket = os.environ.get('WORKSPACE_BUCKET')
start = datetime.now()

# Show the bucket as the cell output so it can be checked at a glance.
bucket

In [None]:
# List the contents of the data/ folder in the workspace bucket
# (the shell expands $WORKSPACE_BUCKET from the environment).
!gsutil ls $WORKSPACE_BUCKET/data

In [None]:
meta_path = f'{bucket}/data/meta_all1.tsv.bgz'
!gsutil cp {meta_path} .
!bgzip -d meta_all1.tsv.bgz 

# Filter data for viewing

In [None]:
# Read the decompressed, tab-separated summary statistics and preview the first rows.
meta_df = pd.read_csv("meta_all1.tsv", delimiter="\t")
meta_df.head()

In [None]:
meta_path = f'{bucket}/data/meta_all1_GC.tsv.bgz'
!gsutil cp {meta_path} .
!bgzip -d meta_all1_GC.tsv.bgz 
meta_df = pd.read_csv("meta_all1_GC.tsv", sep="\t")
meta_df.head()

In [None]:
# Create the columns the plots below rely on: locus, CHR, BP and P.
import numpy as np

# Keep the untouched marker identifier before MarkerName is trimmed.
meta_df['locus'] = meta_df['MarkerName']

# Drop everything from the first underscore onward, leaving the part that is
# split on ':' into chromosome and position below.
meta_df['MarkerName'] = meta_df['MarkerName'].str.replace(r'(_.*$)', '', regex=True)

meta_df[['CHR', 'BP']] = meta_df['MarkerName'].str.split(':', expand=True)

# Strip the 'chr' prefix and convert to numbers; values that are not numeric
# become NaN because of errors='coerce'.
meta_df['CHR'] = meta_df['CHR'].str.replace('chr', '', regex=True)
meta_df['CHR'] = pd.to_numeric(meta_df['CHR'], errors='coerce')
meta_df['BP'] = pd.to_numeric(meta_df['BP'], errors='coerce')

# Rename 'P-value' to a numeric 'P' column.
meta_df['P'] = pd.to_numeric(meta_df['P-value'], errors='coerce')
meta_df = meta_df.drop(columns=['P-value'])

# Filter last, after every column has been assigned, and take an explicit copy:
# assigning a column to a boolean-filtered frame raises SettingWithCopyWarning.
meta_df = meta_df[np.isfinite(meta_df['P'])].copy()

# View (rich display instead of print)
meta_df.head()


In [None]:
# Sort variants by chromosome and then by position, renumbering the rows from 0.
meta_df = meta_df.sort_values(by=['CHR', 'BP']).reset_index(drop=True)

print(meta_df.head())

In [None]:
# Add the -log10(P) column used as the y-axis of the Manhattan plot.
log10_p = np.log10(meta_df['P'])
meta_df['-log10(P)'] = -log10_p

print(meta_df.head())

In [None]:
# Create manhattan plot: one point per variant, chromosomes laid end to end.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

colors = ['#1f77b4', '#ff7f0e']  # alternate between neighbouring chromosomes
x_labels = []
x_ticks = []

current_x = 0  # running x offset: number of variants already plotted

# Loop through each chromosome
for i, (chromosome, group) in enumerate(meta_df.groupby('CHR')):
    group = group.sort_values('BP')
    # x is the variant's rank within the plot, not its base-pair coordinate.
    x = current_x + np.arange(len(group))
    # No per-chromosome label: the tick labels already identify each chromosome,
    # and labelling every scatter would flood the legend.
    ax.scatter(x, group['-log10(P)'], c=colors[i % 2], s=2)
    # CHR is float whenever any value was coerced to NaN; int() keeps the tick
    # label as "Chr 1" rather than "Chr 1.0".
    x_labels.append(f'Chr {int(chromosome)}')
    x_ticks.append(current_x + len(group) // 2)  # tick at the chromosome's midpoint
    current_x += len(group)

genome_sig = -np.log10(5e-8)
ax.axhline(y=genome_sig, color='red', linestyle='--', label='Genome-wide significance')

# Format
ax.set_xlabel('Chromosome')
ax.set_ylabel('-log10(P)')
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_labels, rotation=90)
ax.set_title('Manhattan Plot')
# Without a legend the threshold line's label is never displayed.
ax.legend(loc='upper right')
plt.tight_layout()

plt.savefig('manhattan_plot.png', dpi=300)
plt.show()

In [None]:
import numpy as np
import scipy.stats as stats

p_values = meta_df['P']

# Convert each P-value to the 1-df chi-squared statistic that has that
# upper-tail probability (inverse survival function).
chi_squared = stats.chi2.isf(p_values, df=1)

# Calculate the genomic inflation factor (lambda_GC): the observed median
# statistic divided by the median of the 1-df chi-squared distribution.
median_chi_squared = np.median(chi_squared)
# Computed from the distribution (about 0.4549) rather than hard-coding a rounded 0.455.
expected_median_chi_squared = stats.chi2.ppf(0.5, df=1)

lambda_gc = median_chi_squared / expected_median_chi_squared
print(f"Genomic Inflation Factor: {lambda_gc}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Take P straight from meta_df so this cell does not depend on a variable left
# behind by the previous cell; P == 0 is dropped because it has no finite -log10.
p_values = meta_df['P']
p_values = p_values[p_values > 0]

# Sorted ascending, so both arrays run from the most to the least significant
# value and pair up point by point.
observed = -np.log10(np.sort(p_values))
expected = -np.log10(np.linspace(1 / len(p_values), 1, len(p_values)))
expected_max = expected.max()  # vectorised; Python's max() walks the array element by element

# Create QQ plot
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(expected, observed, c='blue', s=2, label='Observed Data')
ax.plot([0, expected_max], [0, expected_max], color='red', linestyle='--', label='Expected Line')

# Format
ax.set_xlabel('Expected -log10(P)')
ax.set_ylabel('Observed -log10(P)')
ax.set_title('QQ Plot')
ax.legend()
plt.tight_layout()

plt.savefig('qq_plot.png', dpi=300)  # Save as a high-quality image
plt.show()


In [None]:
# Genome-wide significance cut-off applied to P.
genome_sig_threshold = 5e-8

# Keep only the variants whose P is strictly below the cut-off.
below_threshold = meta_df['P'] < genome_sig_threshold
significant_vars = meta_df.loc[below_threshold]

In [None]:
# Print every significant variant; max_rows=None lifts pandas' row truncation
# only inside this block.
with pd.option_context('display.max_rows', None):
    print(f"Significant Genetic Variants: {significant_vars[['locus', 'P', 'Direction']]}")

Save to bucket if desired

In [None]:
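# final_sumstats = 'final_meta_sumstats.tsv'
# meta_df.to_csv(final_sumstats, sep='\t', index=False)
# Compress before copying: the gsutil command below expects the .bgz file,
# which to_csv above does not create.
# !bgzip -c final_meta_sumstats.tsv > final_meta_sumstats.tsv.bgz
# meta_save_path = f'{bucket}/data/final_meta_sumstats.tsv.bgz'
# !gsutil cp final_meta_sumstats.tsv.bgz {meta_save_path}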