In [1]:
import numpy as np
import pandas as pd

import os
import sys

from sklearnex import patch_sklearn
patch_sklearn()

import sklearn

import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm

from sklearn.preprocessing import StandardScaler

import optuna

sys.path.append("/data/ouga/home/ag_gagneur/l_vilov/workspace/species-aware-DNA-LM/mpra_griesemer/utils") 

from models import *
from misc import dotdict

import scipy.stats
import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Root folder from which the cells below read the MPRA table, the embeddings and the region/3'UTR mapping.
# NOTE(review): absolute, machine-specific path -- adjust when running on another system.
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/siegel_2022/'

In [18]:
# Run configuration, accessed attribute-style by the cells below (dotdict is imported from the local `misc` module).
input_params = dotdict({})

input_params.cell_type = 'Jurkat' #Beas2B or Jurkat
input_params.response = 'stability' #response variable: steady_state or stability

input_params.model = 'MLM' #embedding name; the feature cell has branches for "MLM", "word2vec", "effective_length" and "Nmers" (N is an integer); "griesemer" was listed here but has no branch in that cell

input_params.output_dir = './test' #output folder

input_params.N_trials = 1000 #number of optuna trials
input_params.keep_first = True #perform hpp search only at the first split, then use these hyperparameters

input_params.N_splits = 100 #number of GroupShuffleSplits (the CV cell below uses LeaveOneGroupOut and overwrites this value)
input_params.N_CVsplits = 5 #number of CV splits for hyperparameter search
input_params.seed = 1 #seed for GroupShuffleSplit (NOTE(review): not referenced by any cell visible in this notebook)

In [19]:
# Tab-separated table for the selected cell type.
mpra_df = pd.read_csv(data_dir + f'{input_params.cell_type}.tsv', sep='\t') #sequence info

# presumably one embedding row per row of the table above (rows are later selected via mpra_df.index) -- TODO confirm
mlm_embeddings = np.load(data_dir + "embeddings/seq_len_5000/embeddings.npy") #masked language model embeddings

In [20]:
#Some of the CXCL7 padding sequences have AREs -- we can account for these if desired...

#cxcl7_list = ['SUPV3L1|10:70968792-70968830','TRPT1|11:63991271-63991346','ART4|12:14982266-14982303','POLE2|14:50116899-50116969','NMRAL1|16:4511716-4511779','ADPRM|17:10614462-10614520','NUP85|17:73231775-73231829','PPP1R15A|19:49379231-49379294','PQLC3|2:11300834-11300874','FASTKD1|2:170386287-170386333','TFPI|2:188343337-188343401','YBEY|21:47717549-47717616','ALG1L|3:125648118-125648193','HELQ|4:84328529-84328604','TMEM171|5:72427558-72427617','IL4|5:132018280-132018347','PCDHA11|5:140251122-140251185','PCDHA12|5:140257437-140257474','GIN1|5:102423545-102423600','HLA-DQA1|6:32610542-32610561','CCDC132|7:92905660-92905721','NAPRT|8:144656955-144657006']
#mpra_df['CXCL7_ARE'] = mpra_df.region.apply(lambda x:x in cxcl7_list)

In [21]:
#mapping between regions and 3'UTR coordinates
regions_utr_map = pd.read_csv(
    data_dir + 'regions_hg38/regions_3UTR_GRCh38.bed', sep='\t',
    names=['region_start', 'region_end', 'ids', 'utr_start', 'utr_end', 'strand'])

#region should be entirely within 3'UTR
inside_utr = ((regions_utr_map.region_start >= regions_utr_map.utr_start)
              & (regions_utr_map.region_end <= regions_utr_map.utr_end))
regions_utr_map = regions_utr_map[inside_utr].drop_duplicates()

#remove variants assigned to multiple 3'UTRs (keep=False drops every copy of a repeated id)
regions_utr_map = regions_utr_map.drop_duplicates(subset='ids', keep=False)

#distance to the stop codon, must be below 5000 for MLM
#'+' strand: measured from the UTR start to the region end; any other strand: from the region start to the UTR end
#vectorised with np.where instead of a row-wise apply: same values, faster, and also works on an empty frame
regions_utr_map['stop_codon_dist'] = np.where(
    regions_utr_map.strand == '+',
    regions_utr_map.region_end - regions_utr_map.utr_start,
    regions_utr_map.utr_end - regions_utr_map.region_start)

In [22]:
# Left join: every MPRA row is kept; rows whose id has no entry in regions_utr_map get NaN in stop_codon_dist.
# No `on=` is passed, so pandas joins on the column names shared by both frames
# (presumably only 'ids' -- TODO confirm).
# NOTE(review): re-running this cell alone changes the join keys, because mpra_df then already has stop_codon_dist.
mpra_df = mpra_df.merge(regions_utr_map[['ids','stop_codon_dist']], how='left')

In [23]:
#column of the MPRA table used as regression target for each supported response type
response_columns = {
    'steady_state': 'ratios_T0_GC_resid',
    'stability': 'ratios_T4T0_GC_resid',
}

if input_params.response not in response_columns:
    #fail here with a clear message instead of later, when the missing 'Expression' column is first accessed
    raise ValueError(f"Unknown response '{input_params.response}', expected one of {sorted(response_columns)}")

mpra_df['Expression'] = mpra_df[response_columns[input_params.response]]

In [24]:
#plain reassignment instead of inplace=True: same result, no in-place mutation of the frame
mpra_df = mpra_df.drop_duplicates()

#rows to discard: missing target, missing ARE annotation, missing or too large stop codon distance, or not flagged by issnp
flt = (mpra_df.Expression.isna()) | (mpra_df.ARE_length_perfect.isna()) | (mpra_df.stop_codon_dist.isna()) | (mpra_df.stop_codon_dist>5000) | (~mpra_df.issnp.astype(bool))

#.copy() gives an independent frame, so columns added in later cells are not written to a slice (avoids SettingWithCopyWarning)
mpra_df = mpra_df[~flt].copy()

#the index labels of the surviving rows are used as row positions into the embedding matrix
#NOTE(review): this assumes the labels still equal the row positions of the loaded embeddings; re-running this cell
#without reloading mlm_embeddings would index the already-filtered matrix -- verify before re-executing
mlm_embeddings = mlm_embeddings[mpra_df.index]

In [25]:
#grouping label: the text between '|' and ':' of the region id (stored as 'chrom' in the CV scores further down)
mpra_df['group'] = mpra_df['region'].map(lambda region_id: region_id.split('|')[1].split(':')[0])

In [26]:
#Build the feature matrix X for the embedding/model selected in input_params.model
if input_params.model=='MLM':

    X = mlm_embeddings

elif 'mers' in input_params.model:

    #parse the whole integer prefix: the previous int(model[0]) read a single digit, so "10mers" gave k=1
    k = int(input_params.model.split('mers')[0])

    kmerizer = Kmerizer(k=k)
    X = np.stack(mpra_df.seq.apply(lambda x: kmerizer.kmerize(x)))

elif input_params.model=='word2vec':

    X = word2vec_model(mpra_df)

elif input_params.model=='effective_length':

    #single feature: sum of the two ARE annotation columns, reshaped to one column
    X = mpra_df.ARE_registration_perfect + mpra_df.ARE_length_perfect
    X = np.expand_dims(X.values,1)

else:

    #without this branch X would stay undefined (or keep a stale value from an earlier run of the cell)
    raise ValueError(f"Unknown model '{input_params.model}'")

#X = np.hstack((X,np.expand_dims(mpra_df.min_free_energy.values,axis=1)))

y = mpra_df['Expression'].values
groups = mpra_df['group'].values

In [27]:
def hpp_search(X,y,groups,cv_splits = 5, n_trials = None, seed = None):

    '''
    Perform hyperparameter search for a StandardScaler + SVR pipeline using the OPTUNA Bayesian Optimisation strategy

    The best hyperparameters should maximize the coefficient of determination (R2),
    averaged over the folds of a GroupKFold cross-validation

    The hyperparameter range should first be adjusted with grid search to make the BO algorithm converge in reasonable time

    Parameters:
        X, y      : features and targets passed to cross_val_score
        groups    : group label of each sample, samples of one group never end up in different folds
        cv_splits : number of GroupKFold folds
        n_trials  : number of optuna trials, input_params.N_trials is used when None
        seed      : seed of the TPE sampler, None (default) keeps the search unseeded as before

    Returns:
        dict with the best values found for "C", "epsilon" and "gamma"
    '''

    if n_trials is None:
        #fall back to the notebook-wide configuration, as the original implementation always did
        n_trials = input_params.N_trials

    def objective(trial):

        #all three SVR hyperparameters are sampled on a log scale
        C = trial.suggest_float("C", 1e-2, 1, log=True)
        epsilon = trial.suggest_float("epsilon", 1e-5, 1, log=True)
        gamma = trial.suggest_float("gamma", 1e-5, 1, log=True)

        clf = sklearn.svm.SVR(C=C, epsilon=epsilon, gamma=gamma)

        #the scaler is part of the pipeline, so it is re-fitted on the training part of every fold
        pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),clf)

        cv_score = sklearn.model_selection.cross_val_score(pipe, X, y, groups=groups,
                     cv = sklearn.model_selection.GroupKFold(n_splits = cv_splits), scoring = 'r2', n_jobs = -1)

        return cv_score.mean()

    #TPESampler is what optuna uses by default for a single objective; passing it explicitly only exposes the seed
    study = optuna.create_study(direction = "maximize", sampler = optuna.samplers.TPESampler(seed = seed))

    study.optimize(objective, n_trials = n_trials)

    return study.best_params

In [None]:
n_groups = len(set(groups)) #LeaveOneGroupOut yields one split per distinct group

#predictions for each point in each split; entries stay NaN for points outside the test set of that split
#np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0
cv_res = np.full((n_groups, len(y)), np.nan)
cv_res_lasso = np.full((n_groups, len(y)), np.nan)

cv_scores = [] #scores and best hyperparameters for each split

input_params.N_splits=10 #kept from the original cell; not used by LeaveOneGroupOut below

gss = sklearn.model_selection.LeaveOneGroupOut()

#hyperparameters found in earlier runs, kept for reference only:
#the search in round 0 below always replaces them before they are used
#MLM:   {'C': 0.12227907014412719, 'epsilon': 0.002017028863020407, 'gamma': 0.0017418739573905068}
#5mers: {'C': 0.06074046127798098, 'epsilon': 0.032758360822165884, 'gamma': 0.0005466435051403621}

for round_idx, (train_idx, test_idx) in enumerate(gss.split(X, y, groups)):

    X_train, X_test, y_train, y_test = X[train_idx,:],X[test_idx,:],y[train_idx],y[test_idx]

    if round_idx==0 or input_params.keep_first==False:
        #performed only once (on the first split) if input_params.keep_first==True
        best_hpp = hpp_search(X_train,y_train,groups[train_idx],cv_splits = input_params.N_CVsplits)

    #SVR with the selected hyperparameters
    pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                          sklearn.svm.SVR(**best_hpp))

    pipe.fit(X_train,y_train)

    y_pred = pipe.predict(X_test)

    cv_res[round_idx,test_idx] = y_pred

    #Lasso baseline: alpha is chosen by LassoCV on group-wise folds of the training data
    group_kfold = sklearn.model_selection.GroupKFold(n_splits=input_params.N_CVsplits).split(X_train,y_train,groups[train_idx])
    pipe_lasso = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                                sklearn.linear_model.LassoCV(cv=group_kfold, alphas=10.**np.arange(-6,0)))
    pipe_lasso.fit(X_train,y_train)
    y_pred_lasso = pipe_lasso.predict(X_test)

    cv_res_lasso[round_idx,test_idx] = y_pred_lasso

    #one record per split; the SVR hyperparameters in use are appended as extra columns
    cv_scores.append({'round':round_idx,
                      'chrom':groups[test_idx][0], #all test points of a split share the same group
                      'r2_svr':sklearn.metrics.r2_score(y_test,y_pred),
                      'pearson_r_svr':scipy.stats.pearsonr(y_test,y_pred)[0],
                      'r2_lasso':sklearn.metrics.r2_score(y_test,y_pred_lasso),
                      'pearson_r_lasso':scipy.stats.pearsonr(y_test,y_pred_lasso)[0]
                      }|best_hpp)

cv_scores = pd.DataFrame(cv_scores)

In [492]:
# Pearson r of the SVR predictions, averaged over the leave-one-group-out splits stored in cv_scores.
cv_scores.pearson_r_svr.mean()

0.2038695562423924

In [426]:
from sklearn.model_selection import cross_val_score

In [427]:
def pearson_r(estimator, X, y):
    '''
    Scoring callable with the (estimator, X, y) signature

    Returns the Pearson correlation coefficient between y and the predictions of the estimator on X,
    2-D predictions are flattened to 1-D first
    '''
    predictions = estimator.predict(X)
    is_two_dimensional = len(predictions.shape) == 2
    if is_two_dimensional:
        predictions = predictions.reshape(-1)
    correlation = scipy.stats.pearsonr(y, predictions)[0]
    return correlation

In [None]:
# Lasso baseline evaluated with grouped 10-fold cross-validation, scored with pearson_r.
# NOTE(review): train_idx is not defined in this cell; it is whatever the last iteration of the
# leave-one-group-out loop left behind, so this cell only works after that loop has run.
# NOTE(review): this rebinds `pipe`, which the loop cell used for the SVR pipeline.
pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(), sklearn.linear_model.LassoCV(cv=3, alphas=10.**np.arange(-6,0))) 
#limit runtime (presumably refers to the inner cv=3 of LassoCV above -- confirm)
# NOTE(review): despite its name, r2 holds one Pearson correlation per fold (scoring=pearson_r), not R2 values.
r2 = cross_val_score(pipe,X[train_idx],y[train_idx],scoring=pearson_r,
                     cv=sklearn.model_selection.GroupKFold(n_splits=10), groups=groups[train_idx], n_jobs=-1)#.mean()

In [472]:
# Mean over the folds of the scores computed in the previous cell (Pearson r, see scoring=pearson_r there).
np.mean(r2)

0.496721636952289

In [None]:
#make output dir
output_dir = input_params.output_dir
os.makedirs(output_dir, exist_ok=True)

#save scores as a tab-separated table
scores_path = output_dir + '/cv_scores.tsv'
cv_scores.to_csv(scores_path, sep='\t', index=None)

#save SVR predictions at each round
#NOTE(review): cv_res_lasso is not written by any cell visible in this notebook
predictions_path = output_dir + '/cv_res.npy'
with open(predictions_path, 'wb') as predictions_file:
    np.save(predictions_file, cv_res)