In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

import os
import sys

from sklearnex import patch_sklearn
patch_sklearn()

import sklearn

import sklearn.pipeline 
import sklearn.model_selection
import sklearn.metrics
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm

from sklearn.preprocessing import StandardScaler
from multiprocessing import Pool

import optuna

sys.path.append("/data/ouga/home/ag_gagneur/l_vilov/workspace/species-aware-DNA-LM/mpra_griesemer/utils") 

from models import *
from misc import dotdict

import scipy.stats
import pickle

import gensim.models 

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
#root directory of the Agarwal 2022 data; the trailing slash is required because the paths below are built by plain string concatenation
#NOTE(review): absolute, cluster-specific path - adjust when running elsewhere
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/agarwal_2022/'

In [3]:
#run configuration, read by the cells below through attribute access
input_params = dotdict({})

#embedding name, can be "MLM" "word2vec" "griesemer" or "Nmers" where N is an integer
#NOTE(review): the embedding cell of this notebook has no branch for "griesemer"
input_params.model = 'MLM'

input_params.output_dir = './test' #output folder

input_params.N_trials = 1000 #number of optuna trials
input_params.keep_first = True #perform hpp search only at the first split, then use these hyperparameters

input_params.N_splits = 100 #number of GroupShuffleSplits
input_params.N_CVsplits = 5 #number of CV splits for hyperparameter search
input_params.seed = 1 #seed for GroupShuffleSplit

#number of worker processes for the multiprocessing pools (read by hpp_search and run_pool)
#None lets multiprocessing.Pool use every available CPU
input_params.n_jobs = None

In [4]:
#load the MLM embeddings and the matching UTR names (two objects pickled back-to-back in one file)
#NOTE: pickle.load can execute arbitrary code - only use it on trusted files

mlm_pickle_path = data_dir + '../species_aware_emb/all_3utr.pickle'

with open(mlm_pickle_path, 'rb') as pickle_file:
    mlm_embeddings = pickle.load(pickle_file)
    utr_names = pickle.load(pickle_file)

In [5]:
#MLM embeddings are made for transcripts:
#keep the part of each UTR name before the first '.' and load the transcript -> gene lookup table

embedding_transcripts = [name.split('.')[0] for name in utr_names]

canonical_tsv = data_dir + '../UTR_coords/GRCh38_EnsembleCanonical_HGNC.tsv.gz'

transcript_to_gene = pd.read_csv(canonical_tsv, sep='\t', skiprows=1, usecols=[0,1],
                                 names=['gene_id','transcript_id'])
transcript_to_gene = transcript_to_gene.set_index('transcript_id')

In [6]:
#read the human FASTA file into a transcript_id -> upper-case sequence mapping

human_fasta = data_dir + '../fasta/240_mammals/species/Homo_sapiens.fa'

utr_df = defaultdict(str)

with open(human_fasta, 'r') as fasta_file:
    for fasta_line in fasta_file:
        if not fasta_line.startswith('>'):
            #sequence line: extend the record opened by the most recent header
            utr_df[transcript_id] += fasta_line.rstrip().upper()
            continue
        #header line: the id is the text after '>' up to the first ':' or '.'
        transcript_id = fasta_line[1:].split(':')[0].split('.')[0]

In [7]:
#turn the transcript -> sequence mapping into a one-column DataFrame indexed by gene id
#NOTE: utr_df is rebound from a dict to a DataFrame here, so this cell cannot be re-run on its own
utr_gene_ids = transcript_to_gene.loc[utr_df.keys()].gene_id

utr_df = pd.DataFrame(utr_df.values(), index=utr_gene_ids, columns=['seq'])

In [8]:
#folds as they are in Agarwal article, converted from 1-based to 0-based
folds_path = data_dir + 'saluki_paper/Fig3_S4/binnedgenes.txt'

folds_df = pd.read_csv(folds_path, sep='\t', skiprows=1, usecols=[0,1], names=['Fold','gene_id'])

folds_df = folds_df.set_index('gene_id') - 1

In [9]:
#collect the per-gene tables in a list, starting with the fold labels
data_df = [folds_df]

#columns whose name contains 'ORF.' or 'UTR.' are skipped while reading
df = pd.read_csv(data_dir + 'human/seqFeatWithKmerFreqs.txt.gz', sep='\t',
                 usecols=lambda col: 'ORF.' not in col and 'UTR.' not in col)
df = df.set_index('GENE') #basic features (8) + codons (62)

data_df.append(df)

In [10]:
#SeqWeaver RBP binding (780): one table per transcript region
for region in ('3pUTR','5pUTR','ORF'):
    seqweaver_path = data_dir + f'human/SeqWeaver_predictions/{region}_avg.txt.gz'
    df = pd.read_csv(seqweaver_path, sep='\t').set_index('Group.1')
    data_df.append(df)

In [11]:
#miRNA target repression (319)
cwcs_path = data_dir + f'human/CWCS.txt.gz'
df = pd.read_csv(cwcs_path, sep='\t')
df = df.set_index('GeneID')
data_df.append(df)

In [12]:
#NOTE: data_df is rebound from a list of tables to a single DataFrame, so re-running this cell alone fails
data_df = pd.concat(data_df,axis=1) #concat all features, except embeddings

In [13]:
#keep only genes that have both a measured half-life and an assigned fold
#(a missing Fold would otherwise be filled with 0 below and silently end up in fold 0)
has_target_and_fold = data_df.HALFLIFE.notna() & data_df.Fold.notna()

#the remaining gaps are missing feature values: fill them with 0
#(fillna returns a new frame, which avoids an in-place update on a filtered selection)
data_df = data_df[has_target_and_fold].fillna(0)

In [14]:
#get sequence embedding depending on the model
#every branch builds embeddings_df: one row per gene, columns emb_0 ... emb_{D-1}

if input_params.model=='MLM':

    #precomputed embeddings, re-indexed by the gene id of each embedded transcript
    embeddings_df = pd.DataFrame(mlm_embeddings,
                                 index=transcript_to_gene.loc[embedding_transcripts].gene_id,
                                 columns=[f'emb_{x}' for x in range(mlm_embeddings.shape[1])])

elif 'mers' in input_params.model:

    #take everything before 'mers' as k, so that multi-digit values such as "10mers" are parsed correctly
    k = int(input_params.model.split('mers')[0])

    kmerizer = Kmerizer(k=k)

    Nmer_embeddings = utr_df.seq.apply(kmerizer.kmerize)

    embeddings_df = pd.DataFrame(Nmer_embeddings.tolist(), index=Nmer_embeddings.index,
                                 columns=[f'emb_{x}' for x in range(4**k)])

elif input_params.model=='word2vec':

    w2v_dim = 128 #embedding size, shared by the Word2Vec model and the column names

    kmerizer_w2v = Kmerizer(k=4)

    #NOTE(review): no seed is passed to Word2Vec and it trains with 4 workers
    w2v_model = gensim.models.Word2Vec(sentences=utr_df.seq.apply(lambda seq: kmerizer_w2v.tokenize(seq)),
                                       vector_size=w2v_dim, window=5, min_count=1, workers=4, sg=1) #default: CBOW

    word2vec_emb = utr_df.seq.apply(
        lambda seq: np.mean([w2v_model.wv[token] for token in kmerizer_w2v.tokenize(seq)], axis=0)) #average embedding of all 4-mers in the sequence

    #drop sequences for which no average could be computed
    word2vec_emb = word2vec_emb[~word2vec_emb.isna()]

    embeddings_df = pd.DataFrame(word2vec_emb.tolist(), index=word2vec_emb.index,
                                 columns=[f'emb_{x}' for x in range(w2v_dim)])

else:

    #fail here with a clear message instead of a NameError on embeddings_df in the next cell
    raise ValueError(f'Unsupported embedding model: {input_params.model!r}')

In [15]:
#append the embedding columns; join='inner' keeps only the genes present in both tables
data_df = pd.concat([data_df,embeddings_df], join='inner', axis=1)

#data_df.reset_index(drop=True, inplace=True)

In [16]:
#feature matrix: all columns except HALFLIFE and fold
#(selected by name, so the target cannot leak into X if the column order ever changes)
X = data_df.drop(columns=['Fold','HALFLIFE']).values

y = data_df['HALFLIFE'].values #regression target

folds = data_df['Fold'].values #0-based fold label of each gene

N_folds = int(folds.max())+1

In [17]:
#free the merged table; X, y and folds were extracted from it above
#NOTE(review): run_pool in the "Old" section still reads data_df and will fail after this
del data_df

In [18]:
def apply_SVR(args):

    '''
    Fit a StandardScaler + SVR pipeline on the training part of one CV split and score it on the held-out part

    args is a tuple (test_hpp, (train_idx, test_idx), subset_idx):
        test_hpp - keyword arguments for sklearn.svm.SVR
        train_idx, test_idx - positions of the split, relative to the subset
        subset_idx - rows of the global X and y that form the subset

    The legacy 2-tuple (test_hpp, (train_idx, test_idx)) is still accepted,
    the subset is then read from the module-level val_idx

    Returns the score of the fitted pipeline on the held-out part (R2)
    '''

    if len(args) == 3:
        test_hpp, (train_idx, test_idx), subset_idx = args
    else:
        test_hpp, (train_idx, test_idx) = args
        subset_idx = val_idx #legacy behaviour: hidden dependency on a global

    #slice the subset once instead of once per use
    X_subset, y_subset = X[subset_idx], y[subset_idx]

    pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                          sklearn.svm.SVR(**test_hpp))
    pipe.fit(X_subset[train_idx], y_subset[train_idx])

    return pipe.score(X_subset[test_idx], y_subset[test_idx])


def hpp_search(val_idx, cv_splits = 10):

    '''
    Perform Hyperparameter Search using OPTUNA Bayesian Optimisation strategy

    The best hyperparameters should maximize coefficient of determination (R2)

    The hyperparameter range should first be adjusted with grid search to make the BO algorithm converge in reasonable time

    val_idx - rows of the global X and y used for the search
    cv_splits - number of KFold splits evaluated in each trial

    Returns a dict with the best values of 'C', 'epsilon' and 'gamma'
    '''

    #KFold is used without shuffling, so the splits are the same for every trial: compute them once
    kfold = sklearn.model_selection.KFold(n_splits=cv_splits)
    splits = list(kfold.split(X[val_idx], y[val_idx]))

    def objective(trial):

        test_hpp = {'C': trial.suggest_float("C", 1e-2, 1e2, log=True),
                    'epsilon': trial.suggest_float("epsilon", 1e-5, 1, log=True),
                    'gamma': trial.suggest_float("gamma", 1e-5, 1, log=True)}

        #val_idx travels with each task, so the worker does not rely on a global variable
        tasks = [(test_hpp, split, val_idx) for split in splits]

        #mean score over the CV splits, evaluated in parallel
        return np.mean(list(pool.imap(apply_SVR, tasks)))

    #TPE is optuna's default sampler; seeding it makes the search reproducible
    study = optuna.create_study(direction = "maximize",
                                sampler = optuna.samplers.TPESampler(seed=input_params.seed))

    #one pool for the whole search instead of one per trial;
    #the with-block also shuts the workers down if a trial raises
    with Pool(processes=input_params.n_jobs, maxtasksperchild=3) as pool:
        study.optimize(objective, n_trials = input_params.N_trials)

    return study.best_params

In [None]:
cv_scores = [] #one record per fold: scores plus the hyperparameters that were used

best_hpp = {'C': 15.508154368830185, 'epsilon': 0.7477971556590273, 'gamma': 0.00020825415882166394} #MLM

for fold in range(N_folds):

    print(f'Fold {fold}')

    #the current fold is held out for testing, all other folds are used for training
    is_test = folds==fold
    X_train, X_test = X[~is_test], X[is_test]
    y_train, y_test = y[~is_test], y[is_test]

    if fold==0 or input_params.keep_first==False:
        #performed only once if input_params.keep_first==True
        #NOTE(review): the search always runs on fold 0, which is also the test fold of the first iteration
        val_idx = np.where(folds==0)[0]
        best_hpp = hpp_search(val_idx,cv_splits = input_params.N_CVsplits)

    #SVR with the selected hyperparameters
    pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),
                                          sklearn.svm.SVR(**best_hpp))
    y_pred = pipe.fit(X_train,y_train).predict(X_test)

    #Lasso baseline, alpha chosen by internal cross-validation
    lasso_cv = sklearn.linear_model.LassoCV(cv=input_params.N_CVsplits, alphas=10.**np.arange(-6,0))
    pipe_lasso = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(), lasso_cv)
    y_pred_lasso = pipe_lasso.fit(X_train,y_train).predict(X_test)

    fold_scores = {'fold':fold,
                   'r2_svr':sklearn.metrics.r2_score(y_test,y_pred),
                   'pearson_r_svr':scipy.stats.pearsonr(y_test,y_pred)[0],
                   'r2_lasso':sklearn.metrics.r2_score(y_test,y_pred_lasso),
                   'pearson_r_lasso':scipy.stats.pearsonr(y_test,y_pred_lasso)[0]}

    cv_scores.append({**fold_scores, **best_hpp})

cv_scores = pd.DataFrame(cv_scores)

In [24]:
#save the per-fold scores as a tab-separated file
#NOTE(review): the 'MLM/...' path is fixed and follows neither input_params.model nor input_params.output_dir
cv_scores.to_csv(data_dir + 'MLM/BC3MS_mll.tsv', sep = '\t', index=None)

# Old

In [None]:
#disabled legacy setup: a GroupShuffleSplit with a hyperparameter search on its first split
#gss = sklearn.model_selection.GroupShuffleSplit(n_splits=input_params.N_splits, train_size=.9, random_state = input_params.seed) 

#train_idx, test_idx = next(iter(gss.split(X, y, groups)))

#X_train, X_test, y_train, y_test = X[train_idx,:],X[test_idx,:],y[train_idx],y[test_idx] #first split

#best_hpp = hpp_search(X_train,y_train,groups[train_idx],cv_splits = input_params.N_CVsplits) #get optimal hyperparameters

#best_hpp = {'C': 0.03943153578419499, 'epsilon': 0.0712140417882623, 'gamma': 0.000232694021502066}

#empty dict: SVR(**best_hpp) below is built without any explicit hyperparameters
#NOTE: this also overwrites the best_hpp left behind by the cross-validation loop earlier in the notebook
best_hpp = {}

def apply_SVR(args):

    '''
    Legacy worker: fit StandardScaler + SVR(**best_hpp) on the rows labelled train_idx
    and return the predictions for the rows labelled test_idx as a Series indexed by test_idx

    args is a tuple (train_idx, test_idx) of index labels

    NOTE(review): this re-uses the name apply_SVR, so running this cell replaces the worker
    that hpp_search hands to its pool; it also indexes X and y with .loc, i.e. it expects
    pandas objects rather than the numpy arrays built earlier in the notebook
    '''

    fit_labels, predict_labels = args

    scaler = sklearn.preprocessing.StandardScaler()
    model = sklearn.pipeline.make_pipeline(scaler, sklearn.svm.SVR(**best_hpp))

    model.fit(X.loc[fit_labels], y.loc[fit_labels])

    predictions = model.predict(X.loc[predict_labels])
    y_pred = pd.Series(predictions, index=predict_labels)

    print('done')

    return y_pred
 
def run_pool():

    '''
    Run apply_SVR for every fold in a multiprocessing pool and return the list of per-fold results

    NOTE(review): reads data_df and N_splits from the module scope; data_df is deleted
    earlier in the notebook and no assignment to N_splits is visible - confirm before re-using
    '''

    workers = Pool(processes=input_params.n_jobs,maxtasksperchild=5)

    #lazily yields (train index, test index) for each fold
    fold_splits = ((data_df[data_df.Fold!=fold].index, data_df[data_df.Fold==fold].index) for fold in range(N_splits))

    collected = list(workers.imap(apply_SVR,fold_splits))

    workers.close()
    workers.join()

    return collected

print('running parallel')

#runs the legacy apply_SVR / run_pool pair defined in this cell
all_res = run_pool()

#disabled post-processing of the pooled results
#preds, scores = zip(*all_res)

#cv_res = np.vstack(preds)

#cv_scores = pd.DataFrame({'round':range(N_splits),'scores':scores}|best_hpp)

In [492]:
#mean Pearson correlation of the SVR predictions over all rows of cv_scores
cv_scores.pearson_r_svr.mean()

0.2038695562423924

In [426]:
from sklearn.model_selection import cross_val_score

In [427]:
def pearson_r(estimator, X, y):
    '''
    Scorer with the (estimator, X, y) signature: Pearson correlation between y and estimator.predict(X)

    Two-dimensional predictions are flattened before the correlation is computed
    '''
    predictions = estimator.predict(X)
    #print(estimator[1].alpha_)
    is_two_dimensional = len(predictions.shape) == 2
    flat_predictions = predictions.reshape(-1) if is_two_dimensional else predictions
    correlation = scipy.stats.pearsonr(y, flat_predictions)
    return correlation[0]

In [None]:
#grouped 10-fold cross-validation of a StandardScaler + LassoCV pipeline, scored with pearson_r
#NOTE(review): at module level train_idx and groups are only assigned in commented-out code - define them before running
pipe = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(), sklearn.linear_model.LassoCV(cv=3, alphas=10.**np.arange(-6,0))) 
#limit runtime
#despite its name, r2 holds Pearson r values (scoring=pearson_r)
r2 = cross_val_score(pipe,X[train_idx],y[train_idx],scoring=pearson_r,
                     cv=sklearn.model_selection.GroupKFold(n_splits=10), groups=groups[train_idx], n_jobs=-1)#.mean()

In [472]:
#average of the cross_val_score results
np.mean(r2)

0.496721636952289

In [None]:
os.makedirs(input_params.output_dir, exist_ok=True) #make output dir

cv_scores.to_csv(input_params.output_dir + '/cv_scores.tsv', sep='\t', index=None) #save scores

#NOTE(review): cv_res is only assigned in a commented-out line of the "Old" section, so this raises NameError unless it is defined first
with open(input_params.output_dir + '/cv_res.npy', 'wb') as f:
    np.save(f, cv_res) #save predictions at each round