**NB:** There is no need to run this notebook: the relevant output has been precomputed and saved in `imported_dbNSFP_predictions.tar.gz`.

In [None]:
import os 
import numpy as np
from glob import glob
import gzip

In [None]:
%%bash
# The following command will download about 25GB
wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP4.0a.zip
# The download is a multi-file zip archive, so it must be extracted with
# 'unzip': 'gunzip' rejects the '.zip' suffix and cannot unpack archives
# that contain more than one member.
unzip dbNSFP4.0a.zip

In [None]:
# Insert here the location of the folder containing the
# extracted dbNSFP dataset. The folder should contain a list of
# files named 'dbNSFP4.0a_variant.chr*.gz' (they are read directly
# in gzip-compressed form, so they must not be decompressed further)
dbNSFP_dir = 'path_to_downloaded_dbNSFP/'

# Insert here the path to the Rhapsody dataset of
# precomputed features called 'precomputed_features-ID.npy'.
# The code below accesses it as ID['SAV_coords'], i.e. it expects an
# array with a 'SAV_coords' field.
ID = np.load('../../1-Training_Dataset/local/data/precomputed_features-ID.npy')

In [None]:
# List of the columns in dbNSFP that contain functional predictions from
# various tools, together with info about their interpretation (adapted
# from dbNSFP4.0a.readme.txt).
#
# Each entry is a tuple (column number, column name, symbols), where:
#   - the column number is 1-based (the parsing code reads cols[col-1]);
#   - symbols is '+' if a larger score corresponds to a damaging effect;
#   - symbols is '-' if a smaller score corresponds to a damaging effect;
#   - otherwise symbols has the form '<deleterious>/<neutral>', e.g.
#     'DP/B' means that letters 'D' and 'P' will be considered as
#     'deleterious' and 'B' as 'neutral'.
dbNSFP_sel_columns = [
    (37, 'SIFT_score', '-'),
    (39, 'SIFT_pred', 'D/T'),
    (40, 'SIFT4G_score', '-'),
    (42, 'SIFT4G_pred', 'D/T'),
    (43, 'Polyphen2_HDIV_score', '+'),
    (45, 'Polyphen2_HDIV_pred', 'DP/B'),
    (46, 'Polyphen2_HVAR_score', '+'),
    (48, 'Polyphen2_HVAR_pred', 'DP/B'),
    (49, 'LRT_score', '-'),
    (51, 'LRT_pred', 'D/N'),
    (53, 'MutationTaster_score', '+'),
    (55, 'MutationTaster_pred', 'AD/NP'),
    (58, 'MutationAssessor_score', '+'),
    (60, 'MutationAssessor_pred', 'HM/LN'),
    (61, 'FATHMM_score', '-'),
    (63, 'FATHMM_pred', 'D/T'),
    (64, 'PROVEAN_score', '-'),
    (66, 'PROVEAN_pred', 'D/N'),
    (67, 'VEST4_score', '+'),
    (69, 'MetaSVM_score', '+'),
    (71, 'MetaSVM_pred', 'D/T'),
    (72, 'MetaLR_score', '+'),
    (74, 'MetaLR_pred', 'D/T'),
    (76, 'M-CAP_score', '+'),
    (78, 'M-CAP_pred', 'D/T'),
    (79, 'REVEL_score', '+'),
    (81, 'MutPred_score', '+'),
    (86, 'MVP_score', '+'),
    (88, 'MPC_score', '+'),
    (90, 'PrimateAI_score', '+'),
    (92, 'PrimateAI_pred', 'D/T'),
    (93, 'DEOGEN2_score', '+'),
    (95, 'DEOGEN2_pred', 'D/T'),
    (102, 'CADD_raw', '+'),
    (105, 'DANN_score', '+'),
    (107, 'fathmm-MKL_coding_score', '+'),
    (109, 'fathmm-MKL_coding_pred', 'D/N'),
    (111, 'fathmm-XF_coding_score', '+'),
    (113, 'fathmm-XF_coding_pred', 'D/N'),
    (114, 'Eigen-raw_coding', '+'),
    (117, 'Eigen-PC-raw_coding', '+'),
    (120, 'GenoCanyon_score', '+'),
    (122, 'integrated_fitCons_score', '+')
]

In [None]:
def calc_average_score(content, symbol):
    """Average the numeric entries of a ';'-separated score field.

    Entries that cannot be parsed as floats (e.g. the '.' placeholder)
    are ignored.

    Parameters
    ----------
    content : str
        Raw text of a score column, with entries separated by ';'.
    symbol : str
        If equal to '-', the sign of the mean is flipped; any other
        value leaves the mean untouched.

    Returns
    -------
    float
        The (possibly sign-flipped) mean of the parsed entries, or
        ``np.nan`` if no entry could be parsed.
    """
    values = []
    for token in content.split(';'):
        try:
            number = float(token)
        except ValueError:
            # non-numeric placeholder: skip it
            continue
        values.append(number)

    if not values:
        return np.nan

    average = np.mean(values)
    return -average if symbol == '-' else average

def calc_consensus_prediction(content, del_neu_symbols):
    """Turn a field of categorical predictions into a consensus label.

    Parameters
    ----------
    content : str
        Raw text of a prediction column; occurrences of each symbol are
        counted character-wise anywhere in the string.
    del_neu_symbols : str
        Symbols in the form '<deleterious>/<neutral>', e.g. 'DP/B': each
        character before the slash counts as a deleterious vote, each
        character after it as a neutral vote.

    Returns
    -------
    int
        1 if deleterious votes outnumber neutral ones, 0 if neutral
        votes outnumber deleterious ones, -1 in case of a tie (which
        includes the case where no symbol is found at all).
    """
    deleterious, neutral = del_neu_symbols.split('/')
    n_del = sum(content.count(letter) for letter in deleterious)
    n_neu = sum(content.count(letter) for letter in neutral)

    if n_del == n_neu:
        return -1
    return 1 if n_del > n_neu else 0

# Index the SAVs of the Integrated Dataset by Uniprot accession number
# (the first whitespace-separated token of each SAV string):
#   ID_SAVs[Uniprot acc.][SAV coords] -> position of the SAV in ID
ID_SAVs = {}
for array_index, SAV in enumerate(ID['SAV_coords']):
    ID_SAVs.setdefault(SAV.split()[0], {})[SAV] = array_index

# Build the dtype of the structured array that will hold the output:
# two string fields, followed by one field per selected dbNSFP column.
# Score columns become float fields named '<symbol><column name>',
# categorical prediction columns become integer fields.
output_dtype = [('SAV_coords', 'U50'), ('chr', 'U2')]
output_dtype.extend(
    (f'{sym}{name}', 'f4') if sym in ('+', '-') else (name, 'i')
    for _, name, sym in dbNSFP_sel_columns
)
output = np.empty(len(ID), dtype=np.dtype(output_dtype))

# Fill the array with "missing" values: empty chromosome, NaN for the
# scores (fields whose name starts with '+' or '-') and -1 for the
# categorical predictions.
output['SAV_coords'] = ID['SAV_coords']
output['chr'] = ''
for name in output.dtype.names[2:]:
    output[name] = np.nan if name.startswith(('+', '-')) else -1

# collect the per-chromosome dbNSFP files
gz_files = glob(os.path.join(dbNSFP_dir, 'dbNSFP4.0a_variant.chr*.gz'))
assert gz_files, 'No file found'

# scan every compressed file, one line at a time
for gz_file in gz_files:
    fname = os.path.basename(gz_file)
    # the chromosome name is what is left of the file name once the
    # fixed prefix and the extension have been removed
    chromosome = fname.replace('dbNSFP4.0a_variant.chr', '').replace('.gz', '')
    print('>>>', fname)
    with gzip.open(gz_file, 'rt') as f:
        for line in f:
            cols = line.split('\t')
            for j, acc in enumerate(cols[16].split(';')):
                # skip Uniprot acc. numbers that are not in ID
                if acc not in ID_SAVs:
                    continue
                # take the position listed at the same index as the
                # accession number
                pos = cols[11].split(';')[j]
                SAV = f'{acc} {pos} {cols[4]} {cols[5]}'
                # skip SAVs that are not in ID
                if SAV not in ID_SAVs[acc]:
                    continue
                # 'row' is a view: writing to it updates 'output'
                row = output[ID_SAVs[acc][SAV]]
                row['chr'] = chromosome
                # recover predictions from the selected methods; the
                # first two fields of a row are 'SAV_coords' and 'chr'
                selected = enumerate(dbNSFP_sel_columns, start=2)
                for field_index, (col, method, s) in selected:
                    raw = cols[col - 1]
                    if s in ('+', '-'):
                        row[field_index] = calc_average_score(raw, s)
                    else:
                        row[field_index] = calc_consensus_prediction(raw, s)
                # a match has been found: ignore the remaining
                # accession numbers on this line
                break

# write the structured array to disk
np.save('imported_dbNSFP_predictions.npy', output)

# a SAV has been matched in dbNSFP if its 'chr' field is no longer empty
n_tot = len(output)
n_recovered = np.count_nonzero(output['chr'] != '')
print(f'{n_recovered} out of {n_tot} SAVs in Rhapsody ID '
      'recovered from dbNSFP.')