In [18]:
import numpy as np
import pandas as pd
from collections import defaultdict

import os
import sys
from tqdm import tqdm
from glob import glob

from sklearnex import patch_sklearn
patch_sklearn()

import sklearn

import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm

from sklearn.preprocessing import StandardScaler
from multiprocessing import Pool

import optuna

sys.path.append("/home/icb/sergey.vilov/workspace/MLM/mpra/utils/") 

from mlp import *
from misc import dotdict,pearson_r

import pickle

import matplotlib.pyplot as plt
import matplotlib

# Larger default font for every figure produced by this notebook.
matplotlib.rcParams['font.size'] = 20

# One fixed colour per model name.
model_colors = {
    'DNABERT': "#D55E00",
    'DNABERT-3UTR': "#ffac6a",
    '13-mer': "#CC79A7",
    'PhyloP-241way': "#ffd373",
    'PhyloP-100way': "#e69f00",
    'StateSpace': "#0072B2",
    'StateSpace-SA': "#59c3ff",
    'NTv2-250M': "#009E73",
    'NTv2-250M-3UTR': "#00dea2",
}

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Root directory of the half-life data set; the data paths in the cells below
# are built by string concatenation with it, so the trailing '/' is required.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/half_life/agarwal_2022/'

In [3]:
# Fold assignment of every gene, as used in the Agarwal article.
folds_df = (
    pd.read_csv(
        data_dir + 'source_data/saluki_paper/Fig3_S4/binnedgenes.txt',
        sep='\t',
        skiprows=1,                 # skip the first line; column names are set explicitly below
        usecols=[0, 1],
        names=['Fold', 'gene_id'],
    )
    .set_index('gene_id')
    .sub(1)                         # to 0-based
)

In [4]:
# The parquet file read below was written once by the (disabled) snippet that
# follows; it keeps every column whose name contains neither 'ORF.' nor '5UTR.'.
#features_df = pd.read_csv(data_dir + 'source_data/human/seqFeatWithKmerFreqs.txt.gz', sep='\t',
#                                      usecols=lambda x: not 'ORF.' in x and not '5UTR.' in x) #basic features. 3'UTR and 5'UTR k-mers, ORF, target

#features_df.to_parquet(data_dir + 'source_data/human/seqFeatWithKmerFreqs_no5UTR.parquet.gz')

features_df = pd.read_parquet(
    data_dir + 'source_data/human/seqFeatWithKmerFreqs_no5UTR.parquet.gz'
).set_index('GENE')

# Split the regression target (kept as a one-column frame) from the predictors.
target_df = features_df.loc[:, ['HALFLIFE']]
features_df = features_df.drop(columns=['HALFLIFE'])

In [5]:
# Lookup table transcript_id -> gene_id, taken from the first two columns of the file.
transcript_to_gene = pd.read_csv(
    data_dir + '../../UTR_coords/GRCh38_EnsembleCanonical_HGNC.tsv.gz',
    sep='\t',
    skiprows=1,                         # skip the first line; names are set explicitly
    usecols=[0, 1],
    names=['gene_id', 'transcript_id'],
).set_index('transcript_id')

In [6]:
#get FASTA seqs

human_fasta = data_dir + '../../fasta/Homo_sapiens_rna.fa'

# Maps transcript ID -> upper-cased sequence. Despite its name this is a plain
# dict at this point; it is converted into a DataFrame in the following cell.
utr_df = defaultdict(str)

# Reset explicitly so that an ID left over from an earlier run of this cell
# can never be attached to sequence lines that precede the first header.
transcript_id = None

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            # Header: drop '>' and keep the part before the first '.'.
            # strip() removes the newline that would otherwise stay attached
            # to IDs that contain no '.' at all.
            transcript_id = line[1:].split('.')[0].strip()
        else:
            chunk = line.rstrip().upper()
            if transcript_id is None:
                if chunk:
                    raise ValueError(f'{human_fasta}: sequence data found before the first FASTA header')
                continue  # blank line ahead of the first record
            # Sequences may span several lines: concatenate them.
            utr_df[transcript_id] += chunk

In [7]:
# Turn the {transcript_id: sequence} dict into a one-column frame ('seq')
# that is indexed by the gene ID looked up for each transcript.
utr_df = pd.DataFrame(
    list(utr_df.values()),
    columns=['seq'],
    index=transcript_to_gene.loc[list(utr_df.keys()), 'gene_id'],
)

In [31]:
# Run configuration shared by the cells below.
# NOTE(review): hpp_search_svr also reads input_params.n_jobs, which is not set here.
input_params = dotdict({})

# The part of the model name before the first '_' (if any) selects the base feature sets.
input_params.model = 'dnabert' #embedding name, can be "Species-aware", "Species-agnostic", "MLM", "word2vec", "griesemer" or "Nmers" where N is an integer

# Pickle with precomputed embeddings; when None, the model name selects k-mer or word2vec embeddings instead.
input_params.embeddings = data_dir + '../../human_3utr/embeddings/dnabert2-3utr/predictions.pickle'

# 'Ridge' or 'SVR' (the two cases handled in the cross-validation loop).
input_params.regressor = 'Ridge'

input_params.cv_splits_hpp = 5 #number of CV splits for hyperparameter search
input_params.seed = 1 #seed for GroupShuffleSplit (not referenced in the cells visible here)

# Number of Optuna trials in hpp_search_svr.
input_params.n_hpp_trials = 300

In [32]:
# Base feature codes are whatever precedes the first '_' in the model name;
# a name without '_' selects no base features at all.
if '_' in input_params.model:
    base_features = input_params.model.split('_')[0]
else:
    base_features = ''
base_features

'dnabert'

In [33]:
# Per-gene tables that are concatenated column-wise in the next cell.
# Fold assignment and target always come first; each code found in
# base_features then switches on one additional group of features.
data_df = [folds_df, target_df]

if 'B' in base_features:
    print('adding basic features')
    data_df.append(features_df.iloc[:, 0:8])  # the first 8 feature columns

if 'C' in base_features:
    print('adding codons')
    data_df.append(
        features_df.loc[:, [col for col in features_df.columns if col.startswith('Codon.')]]
    )

if '3K' in base_features:
    print("adding k-mer embeddings for 3'UTRs")
    data_df.append(
        features_df.loc[:, [col for col in features_df.columns if col.startswith('3UTR.')]]
    )

if 'S' in base_features:
    print('adding SeqWeaver RBP binding (780)')
    # one table per transcript region, each indexed by its 'Group.1' column
    data_df.extend(
        pd.read_csv(data_dir + f'data/human/SeqWeaver_predictions/{region}_avg.txt.gz', sep='\t').set_index('Group.1')
        for region in ('3pUTR', '5pUTR', 'ORF')
    )

if 'M' in base_features:
    print('miRNA target repression (319)')
    data_df.append(
        pd.read_csv(data_dir + 'data/human/CWCS.txt.gz', sep='\t').set_index('GeneID')
    )

In [34]:
# Align all collected tables on their index and place them side by side
# (everything except the embeddings, which are joined later).
data_df = pd.concat(data_df, axis='columns')

In [35]:
# Keep only genes with a measured half-life, then replace every remaining
# missing value by 0. Done as one chained expression instead of an in-place
# fillna on a frame produced by boolean indexing, which is what triggers
# pandas' SettingWithCopy warning.
# NOTE(review): fillna(0) also applies to the 'Fold' column, so a gene without
# a fold assignment would end up in fold 0 - confirm that no such genes exist.
data_df = data_df[data_df.HALFLIFE.notna()].fillna(0)

In [36]:
# Build embeddings_df (one row per gene, columns emb_0..emb_{d-1}) from one of
# three sources and join it to data_df:
#   * precomputed language-model embeddings stored in a pickle,
#   * k-mer embeddings of the sequences in utr_df (model name '<k>mers'),
#   * averaged word2vec embeddings of 4-mer tokens (model name 'word2vec').
embeddings_df = None

if input_params.embeddings is not None:
    print('adding language model embeddings')
    # NOTE: pickle.load can execute arbitrary code - only load embedding files from a trusted source.
    with open(input_params.embeddings,'rb') as f:
        emb_data = pickle.load(f)
    print(f'number of sequences after filtering: {len(data_df)}')
    embeddings = np.vstack(emb_data['embeddings'])
    print(f"embeddings size: {len(embeddings)}")
    # The part of each sequence name before the first '.' is looked up as transcript ID.
    embeddings_genes = transcript_to_gene.loc[[name.split('.')[0] for name in emb_data['seq_names']]].gene_id
    embeddings_df = pd.DataFrame(embeddings, index=embeddings_genes,
                                 columns=[f'emb_{x}' for x in range(embeddings.shape[1])])

elif 'mers' in input_params.model:

    print('adding k-mer embeddings')

    # k is the integer right in front of 'mers' (e.g. '4mers', '13mers').
    # Reading only the first character of the name would turn '13mers' into k=1.
    k = int(input_params.model.split('mers')[0].split('_')[-1])

    kmerizer = Kmerizer(k=k)

    Nmer_embeddings = utr_df.seq.apply(kmerizer.kmerize)

    embeddings_df = pd.DataFrame(Nmer_embeddings.tolist(), index=Nmer_embeddings.index, columns=[f'emb_{x}' for x in range(4**k)])

elif input_params.model=='word2vec':

    print('adding word2vec embeddings')

    kmerizer_w2v = Kmerizer(k=4)

    # NOTE(review): gensim is not imported in the cells visible here - presumably it
    # is provided by `from mlp import *`; confirm.
    # sg=1 selects skip-gram (the gensim default, sg=0, is CBOW).
    w2v_model = gensim.models.Word2Vec(sentences=utr_df.seq.apply(kmerizer_w2v.tokenize),
                             vector_size=128, window=5, min_count=1, workers=4, sg=1)

    # average embedding of all 4-mers in the sequence
    word2vec_emb = utr_df.seq.apply(
        lambda seq: np.mean([w2v_model.wv[token] for token in kmerizer_w2v.tokenize(seq)], axis=0))

    # a sequence without any token averages to NaN: drop it
    word2vec_emb = word2vec_emb[~word2vec_emb.isna()]

    embeddings_df = pd.DataFrame(word2vec_emb.tolist(), index=word2vec_emb.index, columns=[f'emb_{x}' for x in range(128)])

if embeddings_df is not None:
    # Keep only genes that actually have an embedding. Previously only the
    # language-model branch did this, so the other branches could leave rows
    # with all-NaN embedding columns after the (left) join below.
    data_df = data_df[data_df.index.isin(embeddings_df.index)]
    data_df = data_df.join(embeddings_df)
    print(f'number of sequences overlapping with embeddings: {len(data_df)}')

adding language model embeddings
number of sequences after filtering: 12981
embeddings size: 18134
number of sequences overlapping with embeddings: 12431


In [37]:
# Split the assembled table into plain arrays for scikit-learn.
is_feature = ~data_df.columns.isin(['Fold', 'HALFLIFE'])

X = data_df.loc[:, is_feature].to_numpy()  # all columns except HALFLIFE and fold
y = data_df.HALFLIFE.to_numpy()
folds = data_df.Fold.to_numpy()
genes = data_df.index

# The table itself is no longer needed once the arrays exist.
del data_df

In [38]:
def hpp_search_svr(X,y,groups,cv_splits = 5):

    '''
    Perform hyperparameter search for an SVR using the OPTUNA Bayesian Optimisation strategy

    The best hyperparameters should maximize the mean cross-validated
    coefficient of determination (R2) of a StandardScaler + SVR pipeline.

    The hyperparameter range should first be adjusted with grid search to make the BO algorithm converge in reasonable time

    Parameters
    ----------
    X : array-like
        Feature matrix handed to cross_val_score.
    y : array-like
        Regression target.
    groups : array-like or None
        Group label of every sample. When given, samples that share a label
        are kept on the same side of every CV split.
    cv_splits : int
        Number of cross-validation splits.

    Returns
    -------
    dict
        Best values found for 'C', 'epsilon' and 'gamma'.

    Notes
    -----
    Reads n_hpp_trials and n_jobs from the global input_params.
    '''

    # With an integer `cv` scikit-learn falls back to plain KFold, which
    # silently ignores `groups`; a Group* splitter is needed to honour them.
    if groups is not None:
        cv = sklearn.model_selection.GroupKFold(n_splits=cv_splits)
    else:
        cv = cv_splits

    def objective(trial):
        '''Mean cross-validated R2 for the hyperparameters suggested in this trial.'''

        C = trial.suggest_float("C", 1e-2, 1, log=True)
        epsilon = trial.suggest_float("epsilon", 1e-5, 1, log=True)
        gamma = trial.suggest_float("gamma", 1e-5, 1, log=True)

        pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                                  sklearn.svm.SVR(C=C, epsilon=epsilon, gamma=gamma))

        # NOTE(review): parallel_backend is not imported in the cells visible here -
        # presumably joblib's, provided by `from mlp import *`; confirm.
        with parallel_backend('multiprocessing', n_jobs=input_params.n_jobs):
            cv_score = sklearn.model_selection.cross_val_score(pipe, X, y, groups=groups,
                     cv = cv, scoring = 'r2', n_jobs = -1)

        av_score = cv_score.mean()

        return av_score

    #optuna.logging.set_verbosity(optuna.logging.DEBUG)

    study = optuna.create_study(direction = "maximize")

    study.optimize(objective, n_trials = input_params.n_hpp_trials)

    best_params = study.best_params

    return best_params

In [None]:
# Keyword arguments for SVR; left empty, SVR runs with its default settings.
hpp_dict = {}

#best_hpp_models = {'Species-aware-BC3MS':{'C': 17.2, 'epsilon': 0.73, 'gamma': 2e-4},
#            'Species-agnostic-BC3MS':{'C': 15.3, 'epsilon': 0.37, 'gamma': 1.8e-4},
#            'NT-MS-v2-500M-BC3MS':{'C': 100, 'epsilon': 0.84, 'gamma': 1.3e-5},
#            'DNABERT-2-BC3MS':{'C': 19, 'epsilon': 9.8e-5, 'gamma': 1.4e-4},
#            'DNABERT-2-B3': {'C': 100, 'epsilon': 2e-3, 'gamma': 6.6e-05},
#            'BCMS': {'C': 18.5, 'epsilon': 2.8e-2, 'gamma': 1.3e-4},
#            'Species-agnostic-B3':{'C': 86, 'epsilon': 0.06, 'gamma': 2.4e-4},
#            'Species-agnostic-3':{'C': 22, 'epsilon': 0.003, 'gamma': 5.6e-4},
#            'Species-aware-3':{'C': 2.05, 'epsilon': 0.091, 'gamma': 0.0037},
#            '3K': {'C': 8.7, 'epsilon': 1.8e-3, 'gamma': 4.2e-05},
#            'DNABERT-2-3': {'C': 0.69, 'epsilon': 2.6e-4, 'gamma': 6.3e-4},
#            'NT-MS-v2-500M-3':{'C': 21, 'epsilon': 0.75, 'gamma': 3.8e-05},
#           }

# Predefined-fold cross-validation: every fold is held out once, a pipeline is
# fitted on the remaining folds and its predictions for the held-out genes are collected.
res_df = []

# int(): range() rejects a float, which is what folds.max() yields if the
# fold column was upcast to float upstream.
N_folds = int(folds.max()) + 1

for fold in range(N_folds):

    print(f'Fold {fold}')

    test_mask = folds == fold

    if not test_mask.any():
        continue  # no gene assigned to this fold: nothing to predict

    X_train, X_test, y_train, y_test = X[~test_mask], X[test_mask], y[~test_mask], y[test_mask]

    if input_params.regressor == 'Ridge':
        # RidgeCV selects alpha on the training part by its own internal CV
        pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                          sklearn.linear_model.RidgeCV(cv=input_params.cv_splits_hpp, alphas=10.**np.arange(-5,6)))

    elif input_params.regressor == 'SVR':
        pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                              sklearn.svm.SVR(**hpp_dict))

    else:
        # without this branch a stale `pipe` from an earlier run would be reused silently
        raise ValueError(f'unknown regressor: {input_params.regressor!r}')

    pipe.fit(X_train,y_train)

    y_pred = pipe.predict(X_test)

    # One typed frame per fold. Stacking floats together with gene IDs in a
    # single numpy array forces all four columns into one non-numeric dtype.
    # 'fold' stays a float, as in the stacked-array version.
    fold_res = pd.DataFrame({'fold': float(fold),
                             'gene': np.asarray(genes[test_mask]),
                             'y_true': y_test,
                             'y_pred': y_pred})

    res_df.append(fold_res)

res_df = pd.concat(res_df, ignore_index=True)

In [40]:
# Correlation between measured and predicted values, pooled over all folds.
pearson_r(res_df['y_true'], res_df['y_pred'])

0.301991737176074

In [None]:
#cv_scores.to_csv(data_dir + f'predictions/{input_params.model}.tsv', sep = '\t', index=None)

# Write the per-gene predictions of all folds as a tab-separated table without the row index.
res_df.to_csv(
    data_dir + f'predictions/{input_params.model}-full.tsv',
    sep='\t',
    index=False,
)