# GPN-MSA: variant effect prediction

In [1]:
# Uncomment to install GPN into the kernel's environment:
# %pip install --quiet git+https://github.com/songlab-cal/gpn.git

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Location of the multiz100way MSA used for inference.
# A local download needs more setup, but is a lot faster and uses less CPU memory.
# For how to improve performance, see the dataset README:
# https://huggingface.co/datasets/songlab/multiz100way

# Option 1: streaming (inference takes 5-10 minutes, might depend on network connection)
msa_path = "zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip"

# Option 2: using a local file (inference takes 5 seconds)
#msa_path = "../../analysis/human/results/msa/multiz100way/89/all.zarr"

In [4]:
# Model identifier passed to the inference script as its model_path argument.
model_path = "songlab/gpn-msa-sapiens"

# Input file requirements:
# Either a HuggingFace dataset or a file
# Should automatically detect .parquet, .vcf, .vcf.gz, .csv, .csv.gz, .tsv, .tsv.gz
# Will only use chrom, pos, ref, alt (hg38 coordinates)
# chrom should be in 1,...,22,X,Y (string)
# ref, alt should be in A,C,G,T
# ref must match the reference genome

# Small VCF with 1000 positive and 1000 negative variants from our ClinVar benchmark
# can download from https://raw.githubusercontent.com/songlab-cal/gpn/main/examples/msa/example.vcf
input_path = "example.vcf"
# For now the output is just a single column "score", in the same order as input_path
output_path = "example.preds.parquet"

# The script will detect any number of available GPUs (can also run without GPU)
per_device_batch_size = 128 # whatever fits in your GPU
# How many CPUs you want to use for data loading.
# Any value > 0 freezes when using the remote (streaming) msa_path;
# for a local download it can be set equal to the number of CPUs.
dataloader_num_workers = 0
# Passed to the inference script as its window_size argument.
window_size = 128
# Interpreter used to launch the inference script. Default to the interpreter
# running this kernel instead of a machine-specific absolute path, so the
# notebook works on any system; fall back to "python" from PATH if the
# interpreter path cannot be determined. Override with an explicit path if
# GPN is installed in a different environment.
python_path = sys.executable or "python"

In [None]:
# Run the "vep" command of the gpn.msa.inference module in a shell, using the
# interpreter at python_path. Positional arguments, in order:
# input_path, msa_path, window_size, model_path, output_path.
# NOTE(review): --is_file presumably marks input_path as a file rather than a
# HuggingFace dataset -- confirm against the gpn.msa.inference CLI help.
!{python_path} -m gpn.msa.inference vep {input_path} {msa_path} {window_size} {model_path} {output_path} \
    --is_file \
    --per_device_batch_size {per_device_batch_size} --dataloader_num_workers {dataloader_num_workers}

Namespace(command='vep', input_path='example.vcf', msa_path='zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip', window_size=128, model_path='songlab/gpn-msa-sapiens', output_path='example.preds.parquet', per_device_batch_size=128, dataloader_num_workers=0, split='test', is_file=True, disable_aux_features=False, center_window_size=None)
Loading MSA...
Loading MSA... Done
100%|███████████████████████████████████████████| 16/16 [03:31<00:00, 13.20s/it]


In [None]:
# Read the tab-separated input file, skipping "#" comment lines (it has no
# header row), and name the first five columns.
vcf_column_names = {0: "chrom", 1: "pos", 2: "id", 3: "ref", 4: "alt"}
V = pd.read_csv(input_path, sep="\t", comment="#", header=None)
V = V.rename(columns=vcf_column_names)

# The label is the part of the variant id before the first underscore.
V["label"] = V["id"].str.split("_").str[0]

# Attach the predicted scores positionally; the predictions file stores them
# in the same order as the input variants.
predictions = pd.read_parquet(output_path)
V["score"] = predictions["score"].values.ravel()
V

In [None]:
# Histogram of the predicted scores, with one color per label.
sns.histplot(
    x="score",
    hue="label",
    data=V,
)