## Which pathway explains more of the variation in maternal mRNA degradation rates under WT conditions?

Overview:


Conditions:
   + Model type: **linear model**
   + Evaluation metric: $R^2$
   + Cross-validation: **repeated 10-fold CV**

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
import warnings; warnings.simplefilter('ignore')

In [2]:
# Read the analysis table and key it by gene identifier in a single chained
# expression, so `dta` is bound once and never mutated in place.
dta = pd.read_csv("./results_data/analysis_data.csv").set_index('Gene_ID')
dta.head()

Unnamed: 0_level_0,wt_polya,wt_ribo,m6A,MiR430,PLS1,PLS2,m6a_affected_in_mutant
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSDARG00000000018,0.185277,-0.108162,False,0,0.866097,1.623199,False
ENSDARG00000000019,-0.138868,-0.064825,False,2,1.076742,0.877583,False
ENSDARG00000000068,0.040206,-0.132033,False,0,2.158753,1.800105,False
ENSDARG00000000086,0.019231,-0.062527,False,0,1.392857,0.715777,False
ENSDARG00000000103,0.119885,-0.004627,False,0,2.140356,0.917946,False


In [7]:
# Responses: the two columns modelled one at a time further down
# (`wt_polya` and `wt_ribo`).
y_train = dta.loc[:, ['wt_ribo', 'wt_polya']]
# Candidate predictors. Every row of `dta` is kept: the cross-validation
# splitter does the train/test partitioning later.
X_train = dta.loc[:, ['m6a_affected_in_mutant', 'MiR430', 'PLS1', 'PLS2']]

In [8]:
# Pathway name -> list of predictor column(s) that represent that pathway.
# Keys are inserted in the order in which the pathways are evaluated.
pathways = {}
pathways['codon optimality'] = ['PLS1']
pathways['m6a'] = ['m6a_affected_in_mutant']
pathways['miRNA-430'] = ['MiR430']

class PathwaySelector(BaseEstimator, TransformerMixin):
    """Helper transformer that selects the predictor columns of one pathway.

    Parameters
    ----------
    columns : column label or list of column labels
        Passed unchanged to ``X[columns]`` in :meth:`transform`.
    """
    def __init__(self, columns):
        # Stored as given (no copy, no validation) so the estimator's
        # parameters reflect exactly what the caller passed in.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any requested column is absent from ``X``; the message lists
            the missing labels.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "PathwaySelector expects a pandas DataFrame, got %s" % type(X).__name__
            )

        try:
            return X[self.columns]
        except KeyError as err:
            # A single label (e.g. a plain string) must not be iterated
            # element-wise, otherwise the message would list its characters.
            if pd.api.types.is_list_like(self.columns):
                requested = self.columns
            else:
                requested = [self.columns]
            # De-duplicate while keeping the order in which columns were requested.
            cols_error = list(dict.fromkeys(c for c in requested if c not in X.columns))
            raise KeyError(
                "The DataFrame does not include the columns: %s" % cols_error
            ) from err

In [9]:
# Accumulator for the per-pathway, per-sample-condition CV result frames;
# the loop below extends it and it is finally concatenated into one DataFrame.
results = []

def evaluate_pipeline_in_data(y, pipeline, sample_condition, X=None,
                              n_splits=10, n_repeats=10):
    """Cross-validate ``pipeline`` against the response ``y`` and tabulate the scores.

    Args:
        y: response values, aligned row-wise with the predictors.
        pipeline: scikit-learn estimator/pipeline to evaluate.
        sample_condition: label written to the ``sample_condition`` column
            of the returned frame.
        X: predictor DataFrame. Defaults to the notebook-level ``X_train``.
        n_splits: number of folds per repetition (10, i.e. 10-fold CV).
        n_repeats: number of times the K-fold split is repeated.

    returns:
        pd.DataFrame: CV results (``GridSearchCV.cv_results_``) plus a
        ``sample_condition`` column.
    """
    if X is None:
        # Fall back to the notebook-level predictors for existing callers.
        X = X_train

    # The fold count is spelled out: RepeatedKFold's default is n_splits=5,
    # which would give 5-fold rather than the documented 10-fold CV.
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=43)
    # The empty parameter grid means GridSearchCV is used only as a
    # cross-validation runner for the single, fixed pipeline.
    grid = GridSearchCV(estimator=pipeline, param_grid={}, n_jobs=10,
                        scoring=make_scorer(r2_score), cv=cv)
    grid.fit(X, y)
    cv_res = pd.DataFrame(grid.cv_results_)
    cv_res['sample_condition'] = sample_condition
    return cv_res
    

for pathway, predictor_columns in pathways.items():
    print('evaluating {}'.format(pathway))

    # Single-pathway model: pick the pathway's columns, standardise them,
    # then fit an ordinary least-squares regression.
    pipeline = Pipeline(steps=[
        ('pathway', PathwaySelector(predictor_columns)),
        ('scaler', StandardScaler()),
        ('regresor', LinearRegression())
    ])

    # Cross-validate the same pipeline against each response and tag every
    # result frame with the pathway it belongs to.
    r_poly = evaluate_pipeline_in_data(
        y_train.wt_polya, pipeline, 'poly-A'
    ).assign(pathway=pathway)
    r_ribo = evaluate_pipeline_in_data(
        y_train.wt_ribo, pipeline, 'ribo0'
    ).assign(pathway=pathway)

    # ribo0 first, then poly-A, for each pathway.
    results += [r_ribo, r_poly]

# Collapse the list of frames into one long table
# (one row per pathway x sample condition).
results = pd.concat(results)

evaluating codon optimality
evaluating m6a
evaluating miRNA-430


In [10]:
# Persist the concatenated CV results; the row index is not written out.
results.to_csv(
    path_or_buf='results_data/results_lm_10_foldCV.csv',
    index=False,
)