In [1]:
import numpy as np
import pandas as pd

import os
import sys

from sklearnex import patch_sklearn
patch_sklearn()

import sklearn

import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm

from sklearn.preprocessing import StandardScaler

import optuna

sys.path.append("/data/ouga/home/ag_gagneur/l_vilov/workspace/species-aware-DNA-LM/mpra_griesemer/utils") 

from models import *
from misc import dotdict

import multiprocessing
import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Root directory of the input data. It must end with '/' because the cells
# below build file paths by plain string concatenation (data_dir + 'name').
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/griesemer/'

In [3]:
# Run configuration, read by the cells and functions below via attribute access.
input_params = dotdict({})

input_params.cell_type = 'HMEC' #selects the log2FoldChange_Skew_<cell_type> column: HMEC,HEK293FT,HEPG2,K562,GM12878,SKNSH

input_params.model = 'enformer_all_targets' #feature set: "enformer_all_targets" or "enformer_summary", or an embedding name: "MLM", "word2vec", "griesemer" or "Nmers" where N is an integer

input_params.output_dir = './test' #output folder

input_params.N_trials = 1000 #number of optuna trials
input_params.n_jobs = 16 #size of the multiprocessing pool used for the train/test rounds

input_params.N_splits = 3 #number of GroupShuffleSplits
input_params.N_CVsplits = 5 #number of CV splits for hyperparameter search
input_params.seed = 1 #seed for GroupShuffleSplit

In [4]:
# Load the MPRA table (one row per oligo).
mpra_df = pd.read_csv(data_dir + 'mpra_df.tsv', sep='\t') #sequence info

# Tag each oligo as reference or alternative depending on whether its id contains 'ref'.
mpra_df['tag'] = mpra_df.oligo_id.apply(lambda x:'ref' if 'ref' in x else 'alt')

mlm_embeddings = np.load(data_dir + "embeddings/seq_len_5000/embeddings.npy") #masked language model embeddings

#Data Cleaning
# Take only SNP mutations (ref and alt allele strings of equal length)
# Remove nan values in the skew column of the selected cell type
# Remove rows with stop_codon_dist above 5000

is_snp = mpra_df.ref_allele.str.len() == mpra_df.alt_allele.str.len()

flt = mpra_df[f'log2FoldChange_Skew_{input_params.cell_type}'].isna()  | (~is_snp) | (mpra_df.stop_codon_dist>5000)

# .copy() turns the filtered selection into an independent frame, so the column
# assignments in the following cells do not write into a slice of the raw table
# (which triggers pandas' SettingWithCopyWarning).
# The original row index is deliberately kept: get_embeddings() uses it to
# address rows of mlm_embeddings.
mpra_df = mpra_df[~flt].copy()

In [5]:
#Expression column to float
# Values that were read as strings with a decimal comma are normalised to a
# decimal point before the cast; everything else is passed through unchanged.
mpra_df['Expression'] = mpra_df[f'log2FoldChange_Skew_{input_params.cell_type}'].apply(
    lambda x: x.replace(',', '.') if isinstance(x, str) else x).astype(float)

# Downstream code pairs the i-th 'ref' row with the i-th 'alt' row, so both
# subsets must list the same variants in the same order.
# np.array_equal also fails cleanly when the two subsets differ in length,
# where an element-wise == would raise or broadcast instead.
assert np.array_equal(mpra_df.loc[mpra_df.tag=='ref','mpra_variant_id'].values,
                      mpra_df.loc[mpra_df.tag=='alt','mpra_variant_id'].values), \
    'ref and alt rows do not pair up by mpra_variant_id'

In [6]:
enformer_dir = data_dir + 'enformer/predictions/'

# Merge the dictionaries stored in all files of enformer_dir into one.
# presumably keyed by (mpra_variant_id, tag), as the .loc[idx] lookup below requires — TODO confirm
# NOTE(security): pickle.load can execute arbitrary code; only point this at trusted files.
data = {}

# sorted() makes the merge order (and hence which file wins on duplicate keys) deterministic;
# update() extends the dict in place instead of copying it for every file.
for pickle_file in sorted(os.listdir(enformer_dir)):
    with open(enformer_dir+pickle_file,'rb') as f:
        data.update(pickle.load(f))

enformer_df = pd.DataFrame(data).T

idx = mpra_df.set_index(['mpra_variant_id','tag']).index

enformer_df = enformer_df.loc[idx].swaplevel() #get variants in exactly the same order as in mpra_df

enformer_log2fc = np.log2(enformer_df.loc['alt']/enformer_df.loc['ref']) #log2fc for all targets

# Missing values are replaced by the per-target (column) median.
enformer_log2fc = enformer_log2fc.fillna(enformer_log2fc.median())

# Rows must line up one-to-one with the 'alt' rows of mpra_df, which provide y and groups.
assert np.array_equal(enformer_log2fc.index.values,
                      mpra_df.loc[mpra_df.tag=='alt','mpra_variant_id'].values), \
    'enformer_log2fc rows are not in the same order as the alt rows of mpra_df'

In [7]:
def get_embeddings(mpra_df):
    '''
    Build the feature matrix for the embedding selected by input_params.model.

    Supported values of input_params.model:
        "MLM"       - rows of the precomputed mlm_embeddings array, addressed by mpra_df.index
        "<k>mers"   - k-mer features from Kmerizer(k) applied to mpra_df.seq (k may have several digits)
        "word2vec"  - output of word2vec_model(mpra_df)
        "griesemer" - output of minseq_model(mpra_df)

    The min_free_energy column of mpra_df is appended as the last feature column.

    Returns a 2-D numpy array with one row per row of mpra_df.
    Raises ValueError for an unsupported model name.
    '''

    model_name = input_params.model

    if model_name=='MLM':

        X = mlm_embeddings[mpra_df.index]

    elif 'mers' in model_name:

        # Everything in front of "mers" is k, so "10mers" gives k=10 (not k=1).
        k = int(model_name.partition('mers')[0])

        kmerizer = Kmerizer(k=k)
        X = np.stack(mpra_df.seq.apply(lambda x: kmerizer.kmerize(x)))

    elif model_name=='word2vec':

        X = word2vec_model(mpra_df)

    elif model_name=='griesemer':

        X = minseq_model(mpra_df)

    else:

        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError(f'Unsupported embedding model: {model_name!r}')

    X = np.hstack((X,np.expand_dims(mpra_df.min_free_energy.values,axis=1)))

    return X

In [8]:
def get_enformer_matrix(model):
    '''
    Build the feature matrix from the Enformer log2 fold changes (enformer_log2fc).

    model:
        "enformer_all_targets" - one feature per column of enformer_log2fc
        "enformer_summary"     - three features: the row-wise mean over the DNase,
                                 CAGE and ChIP-seq column ranges

    Two more columns are appended: min_free_energy of the 'alt' rows and of the
    'ref' rows of mpra_df (in that order).

    Returns a 2-D numpy array with one row per row of enformer_log2fc.
    Raises ValueError for an unsupported model name.
    '''

    # Column label ranges of the three target groups in enformer_log2fc.
    # NOTE(review): group boundaries are taken from the variable names; presumably
    # they follow the Enformer target ordering — confirm against the targets table.
    dnase_all_idx = np.arange(0,674)

    chipseq_all_idx = np.arange(674,4675)

    cage_all_idx = np.arange(4675,5313)

    if model == 'enformer_all_targets':

        X = enformer_log2fc.values

    elif model == 'enformer_summary':

        # The third summary feature is the ChIP-seq mean (previously the CAGE
        # mean was stacked twice and the ChIP-seq range was never used).
        X = np.vstack((enformer_log2fc[dnase_all_idx].mean(axis=1),
                       enformer_log2fc[cage_all_idx].mean(axis=1),
                       enformer_log2fc[chipseq_all_idx].mean(axis=1))).T

    else:

        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(f'Unsupported enformer model: {model!r}')

    X = np.hstack((X,
                   np.expand_dims(mpra_df[mpra_df.tag=='alt'].min_free_energy.values,axis=1),
                   np.expand_dims(mpra_df[mpra_df.tag=='ref'].min_free_energy.values,axis=1)))

    return X

In [9]:
# Assemble the design matrix: Enformer-based models have their own builder,
# every other model concatenates the ref and alt embeddings side by side.
if 'enformer' in input_params.model:

    X = get_enformer_matrix(input_params.model)

else:

    X_ref = get_embeddings(mpra_df[mpra_df.tag=='ref'])
    X_alt = get_embeddings(mpra_df[mpra_df.tag=='alt'])

    X = np.hstack((X_ref,X_alt))

# Target values and split groups are taken from the 'alt' rows (one per variant).
y, groups = (mpra_df.loc[mpra_df.tag=='alt', column].values
             for column in ('Expression', 'group'))

In [10]:
def hpp_search(X,y,groups,cv_splits = 5, n_trials = None, seed = None):

    '''
    Perform hyperparameter search for the StandardScaler+SVR pipeline using the
    OPTUNA Bayesian optimisation strategy.

    The best hyperparameters maximize the coefficient of determination (R2),
    averaged over a GroupKFold cross-validation with cv_splits folds.

    The hyperparameter range should first be adjusted with grid search to make
    the BO algorithm converge in reasonable time.

    Parameters:
        X, y      - feature matrix and target values
        groups    - group label per sample, passed to GroupKFold
        cv_splits - number of cross-validation folds
        n_trials  - number of optuna trials (default: input_params.N_trials)
        seed      - seed of the optuna sampler (default: input_params.seed)

    Returns the dict of best hyperparameters (keys "C", "epsilon", "gamma").
    '''

    if n_trials is None:
        n_trials = input_params.N_trials

    if seed is None:
        seed = input_params.seed

    def objective(trial):

        # all three SVR parameters are sampled on a log scale
        C = trial.suggest_float("C", 1e-5, 1e2, log=True)
        epsilon = trial.suggest_float("epsilon", 1e-5, 1, log=True)
        gamma = trial.suggest_float("gamma", 1e-5, 1, log=True)

        clf = sklearn.svm.SVR(C=C, epsilon=epsilon, gamma=gamma)

        # the scaler is part of the pipeline, so it is refit on each training fold
        pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),clf)

        cv_score = sklearn.model_selection.cross_val_score(pipe, X, y, groups=groups,
                     cv = sklearn.model_selection.GroupKFold(n_splits = cv_splits), scoring = 'r2', n_jobs = -1)

        return cv_score.mean()

    # TPE is optuna's default sampler; passing it explicitly with a seed makes
    # the sequence of suggested hyperparameters reproducible between runs.
    sampler = optuna.samplers.TPESampler(seed = seed)

    study = optuna.create_study(direction = "maximize", sampler = sampler)

    study.optimize(objective, n_trials = n_trials)

    return study.best_params

In [11]:
# Splitter for the outer evaluation rounds: 90% of the data for training in each round.
gss = sklearn.model_selection.GroupShuffleSplit(
    n_splits=input_params.N_splits,
    train_size=.9,
    random_state=input_params.seed,
)

# Only the first split is used for the hyperparameter search.
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, y_train = X[train_idx,:], y[train_idx] #first split
X_test, y_test = X[test_idx,:], y[test_idx]

#get optimal hyperparameters
best_hpp = hpp_search(X_train, y_train, groups[train_idx],
                      cv_splits=input_params.N_CVsplits)

def apply_SVR(train_idx, test_idx):
    '''
    Fit the StandardScaler+SVR pipeline (with best_hpp) on the training rows and
    predict the test rows of the global X, y.

    Returns (y_pred, r2):
        y_pred - array shaped like y, holding predictions at test_idx and NaN elsewhere
        r2     - coefficient of determination on the test rows
    '''
    X_train, X_test, y_train = X[train_idx,:],X[test_idx,:],y[train_idx]
    pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                              sklearn.svm.SVR(**best_hpp))
    pipe.fit(X_train,y_train)

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    # The explicit float dtype guarantees that NaN is representable.
    y_pred = np.full_like(y, np.nan, dtype=float)

    y_pred[test_idx] = pipe.predict(X_test)

    r2 = sklearn.metrics.r2_score(y[test_idx], y_pred[test_idx])

    return y_pred, r2
 
def svr_parallel():
    '''
    Run apply_SVR on every train/test split produced by gss, distributed over a
    pool of input_params.n_jobs worker processes.

    The call blocks until all splits are processed.

    Returns a list with one (y_pred, r2) tuple per split, in split order.
    '''
    # The context manager shuts the pool down when the block is left, so no
    # worker processes are leaked; starmap has collected every result by then.
    with multiprocessing.Pool(input_params.n_jobs) as pool:
        result = pool.starmap(apply_SVR, gss.split(X, y, groups))
    return result

# Run all train/test rounds; each entry of all_res is a (predictions, r2) pair.
all_res = svr_parallel()

preds, scores = zip(*all_res)

# One row of predictions per round.
cv_res = np.vstack(preds)

# Per-round scores; the shared best hyperparameters are repeated in every row.
cv_scores = pd.DataFrame({
    'round': range(input_params.N_splits),
    'scores': scores,
    **best_hpp,
})

NameError: name 'best_hpp' is not defined

In [None]:
# Write the results to the output folder (created on demand).
output_dir = input_params.output_dir
os.makedirs(output_dir, exist_ok=True) #make output dir

#save scores
cv_scores.to_csv(output_dir + '/cv_scores.tsv', sep='\t', index=False)

#save predictions at each round
with open(output_dir + '/cv_res.npy', 'wb') as npy_file:
    np.save(npy_file, cv_res)