In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, ElasticNetCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

# Silence warnings only while statsmodels is being imported. The standard
# `warnings` module is used directly because `np.warnings` was an accidental
# alias that is not available in current NumPy releases, and the context
# manager restores the previous filters instead of wiping them all.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import statsmodels.api as sm

In [2]:
def getElasticNetBestHyperparams(X, y):
    """Choose the sparsest cross-validated Elastic Net setting with near-minimal MSE.

    One ``ElasticNetCV`` is fitted per candidate ``l1_ratio``. For each fit the
    mean cross-validation MSE at the selected ``alpha_`` and the number of
    non-zero coefficients are recorded. Among the fits that kept at least one
    coefficient, the one with the fewest non-zero coefficients whose MSE is
    within 0.1 of the best MSE seen over all fits is returned.

    Parameters
    ----------
    X : feature matrix passed to ``ElasticNetCV.fit``.
    y : target passed to ``ElasticNetCV.fit``; its ``name`` is used in the log line.

    Returns
    -------
    tuple
        ``(n_nonzero_coefs, mse, l1_ratio, alpha)`` for the chosen setting, or
        ``(0, 0, 0, 0)`` when no fit with non-zero coefficients qualifies.
    """
    l1_ratios = [0.05, .1, .5, .7, .9, 0.92, .95, 0.97, .99, 1]
    mse_tolerance = 0.1
    min_mse = np.inf
    params_list = []
    for ratio in l1_ratios:
        encv = ElasticNetCV(l1_ratio=ratio, n_alphas=10, cv=3, verbose=1,
                            precompute=True, max_iter=2500, n_jobs=-1)
        encv.fit(X, y)
        n_nonzeros = int((encv.coef_ != 0).sum())
        alpha = float(encv.alpha_)
        # mse_path_ holds one row per alpha; average over folds and read the
        # row that belongs to the alpha picked by cross-validation.
        alpha_position = np.where(encv.alphas_ == encv.alpha_)[0][0]
        _mse = float(np.mean(encv.mse_path_, axis=1)[alpha_position])
        if _mse < min_mse:
            min_mse = _mse
        print("ratio(%e) -- n: %d -- alpha: %f -- mse: %f" % (ratio, n_nonzeros, alpha, _mse))
        if n_nonzeros != 0:
            params_list.append((n_nonzeros, _mse, ratio, alpha))
    # select the simplest model whose mean-squared error is 'not so bad'
    for param_tuple in sorted(params_list):
        if param_tuple[1] - min_mse <= mse_tolerance:
            print("for target_gene " + str(y.name) + ", nonzero_coeffs_num, MSE, l1-ratio, alpha:")
            print(param_tuple)
            return param_tuple
    # Sentinel for "nothing selected"; callers test l1_ratio == 0.
    return (0, 0, 0, 0)

In [3]:
def getElasticNetSelectedFeatures(X, y, alpha, l1_ratio):
    """Fit an Elastic Net and return the names of columns with non-zero weight.

    Parameters
    ----------
    X : DataFrame of candidate features; column labels are what is returned.
    y : regression target.
    alpha, l1_ratio : hyperparameters forwarded to ``ElasticNet``.

    Returns
    -------
    list
        Labels from ``X.columns`` whose fitted coefficient is not zero,
        in column order.
    """
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=2000)
    model.fit(X, y)
    kept_positions = [pos for pos, weight in enumerate(model.coef_) if weight != 0]
    return list(X.columns[kept_positions])

In [4]:
def getOlsSelectedFeatures(X, y):
    """Return the columns of ``X`` whose OLS coefficient has p-value below 0.05.

    An intercept is added with ``sm.add_constant`` before fitting. The
    intercept is excluded from the result by name rather than by position:
    when its own p-value is not below 0.05 it is already absent from the
    significant set, so dropping "the first entry" would discard a real
    feature instead.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : regression target.

    Returns
    -------
    list
        Column labels of ``X`` that are significant at the 0.05 level, in the
        order they appear in the fitted model's p-value index.
    """
    feature_names = set(X.columns)
    design_matrix = sm.add_constant(X)
    ols_model_results = sm.OLS(y, design_matrix).fit()
    p_values = ols_model_results.pvalues
    # NaN p-values compare False and are therefore excluded as well.
    significant = p_values[p_values < 0.05]
    return [name for name in significant.index if name in feature_names]

In [5]:
def getRidgeRegressionBestHyperparams(X, y):
    """Cross-validate the ridge penalty and report the fit on the same data.

    ``RidgeCV`` searches 100 evenly spaced alphas between 0.01 and 500 with
    10-fold cross-validation. RMSE and R^2 are then computed on ``X``/``y``
    themselves, i.e. on the data used for fitting.

    Returns
    -------
    tuple
        ``(best_alpha, rmse_train, r2_train)``.
    """
    candidate_alphas = np.linspace(.01, 500, 100)
    searcher = RidgeCV(alphas=candidate_alphas, cv=10)
    searcher.fit(X, y)
    fitted_values = searcher.predict(X)
    train_rmse = np.sqrt(mean_squared_error(y, fitted_values))
    train_r2 = searcher.score(X, y)
    return (searcher.alpha_, train_rmse, train_r2)

In [6]:
def trainRidgeRegressionModel(X, y, random_state=None):
    """Train a ridge model on a random train split and score it on a hold-out.

    About 7.5% of the rows are held out as a test set. If that would leave
    fewer than ten training rows per feature, the hold-out is shrunk so that
    ``10 * n_features`` rows remain for training; the hold-out size never goes
    below zero. With an empty hold-out the model is trained on every row and
    the test metrics are reported as NaN.

    Rows are selected by position, so duplicate index labels in ``X`` do not
    affect the split.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series of targets aligned row-by-row with ``X``.
    random_state : optional seed for the split. ``None`` (the default) draws
        from NumPy's global random state.

    Returns
    -------
    tuple
        ``(alpha, rmse_train, r2_train, rmse_test, r2_test, test_set_size)``.
    """
    # hold-out a small percentage of the samples as test set
    num_total_samples, num_features = X.shape
    test_set_size = int(0.075 * num_total_samples)
    if num_total_samples - test_set_size < 10 * num_features:
        # Clamp: with very few rows the difference is negative, which
        # np.random.choice rejects.
        test_set_size = max(num_total_samples - 10 * num_features, 0)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    test_positions = rng.choice(a=num_total_samples, replace=False, size=test_set_size)
    is_test_row = np.zeros(num_total_samples, dtype=bool)
    is_test_row[test_positions] = True

    X_test = X.iloc[test_positions]
    y_test = y.iloc[test_positions]
    X_train = X.iloc[~is_test_row]
    y_train = y.iloc[~is_test_row]

    alpha, rmse_train, r2_train = getRidgeRegressionBestHyperparams(X_train, y_train)
    ridge_reg_model = Ridge(fit_intercept=True, alpha=alpha)
    ridge_reg_model.fit(X_train, y_train)

    if test_set_size == 0:
        # Nothing to evaluate on; scoring an empty set would raise.
        rmse_test = float("nan")
        r2_test = float("nan")
    else:
        r2_test = ridge_reg_model.score(X_test, y_test)
        y_test_pred = ridge_reg_model.predict(X_test)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    return (alpha, rmse_train, r2_train, rmse_test, r2_test, test_set_size)

In [7]:
def runLinearRegressionPipeline(X, y, output_file_name):
    """Run Elastic Net -> OLS -> Ridge for one target and log results to a file.

    Steps, each written to ``output_file_name`` as ``key:value`` lines:

    1. Elastic Net hyperparameter search; if it reports ``l1_ratio == 0``
       (no usable model) the function stops after logging the two values.
    2. Feature selection with the chosen Elastic Net (``select_features_1``).
    3. A second selection by OLS p-values (``select_features_2``); when this
       leaves nothing, the Elastic Net selection is kept.
    4. Ridge regression on the final feature set, with train/test metrics.

    If any step raises (including ``KeyboardInterrupt``), the partially
    written file is deleted before the exception propagates, so a later run
    that skips existing output files does not mistake it for a finished one.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : target values.
    output_file_name : path of the result file; created or overwritten.
    """
    # create new file for this run
    with open(output_file_name, "w") as fp:
        try:
            # feature selection using Elastic Net
            _, _, l1_ratio, alpha = getElasticNetBestHyperparams(X, y)
            fp.write("l1_ratio:" + str(l1_ratio) + "\n")
            fp.write("alpha:" + str(alpha) + "\n")
            if l1_ratio == 0:
                print("no features selected for " + output_file_name)
                return
            select_features_1 = getElasticNetSelectedFeatures(X, y, alpha, l1_ratio)
            # Each name is followed by a comma (trailing comma included).
            fp.write("select_features_1:" + "".join(f + "," for f in select_features_1) + "\n")
            X_reduced_1 = X[select_features_1]
            # feature selection using OLS
            select_features_2 = getOlsSelectedFeatures(X_reduced_1, y)
            fp.write("select_features_2:" + "".join(f + "," for f in select_features_2) + "\n")
            if not select_features_2:
                X_reduced_2 = X_reduced_1
            else:
                X_reduced_2 = X_reduced_1[select_features_2]
            # train Ridge regression model with final set of features
            alpha, rmse_train, r2_train, rmse_test, r2_test, test_set_size = trainRidgeRegressionModel(X_reduced_2, y)
            fp.write("ridge_alpha:" + str(alpha) + "\n")
            fp.write("rmse_train:" + str(rmse_train) + "\n")
            fp.write("r2_train:" + str(r2_train) + "\n")
            fp.write("test_set_size:" + str(test_set_size) + "\n")
            fp.write("rmse_test:" + str(rmse_test) + "\n")
            fp.write("r2_test:" + str(r2_test) + "\n")
        except BaseException:
            # Close first so the file can be removed on every platform.
            fp.close()
            try:
                os.remove(output_file_name)
            except OSError:
                pass
            raise

In [8]:
def modellingUsingCompleteEmbryo(cells_genes_df, cluster_top_genes):
    """Model every top gene using all rows of ``cells_genes_df``.

    For each target gene listed in ``cluster_top_genes`` the gene's column is
    the regression target and all remaining columns are the features. Results
    go to ``<gene>_complete.txt``; a gene whose output file already exists is
    skipped before any data is copied.

    Parameters
    ----------
    cells_genes_df : DataFrame with one column per gene.
    cluster_top_genes : mapping of cluster id -> iterable of gene names.
    """
    for cluster, top_genes in cluster_top_genes.items():
        for target_gene in top_genes:
            print("modelling using complete empbryo for target gene " + target_gene)
            output_file_name = target_gene + "_complete.txt"
            # check if file already exists, if it does, then continue to next target_gene
            # (done first so skipped genes do not pay for copying the frame)
            if os.path.isfile(output_file_name):
                print(output_file_name + " already exists!")
                continue
            y = cells_genes_df[target_gene].copy(deep=True)
            X = cells_genes_df.drop(columns=[target_gene])
            runLinearRegressionPipeline(X, y, output_file_name)

In [9]:
def modellingPerCluster(X, cell_cluster_labels, cluster_top_genes):
    """Model each cluster's top genes using only the rows of that cluster.

    Rows of ``X`` are assigned to clusters via ``cell_cluster_labels`` (one
    label per row, compared as strings against ``str(cluster)``). For each
    cluster and each of its target genes, the gene's column is the target and
    the remaining columns are the features. Results go to
    ``<gene>_cluster_<cluster>.txt``; existing files are skipped.

    ``X`` itself is not modified.

    Parameters
    ----------
    X : DataFrame with one column per gene.
    cell_cluster_labels : sequence with one cluster label per row of ``X``.
    cluster_top_genes : mapping of cluster id -> iterable of gene names.
    """
    # Keep the labels beside the frame instead of writing a helper column
    # into the caller's DataFrame.
    labels = pd.Series(list(cell_cluster_labels), index=X.index).astype(str)
    for cluster, top_genes in cluster_top_genes.items():
        in_cluster = (labels == str(cluster)).to_numpy()
        X_cluster = X.loc[in_cluster]
        for target_gene in top_genes:
            print("modelling using cluster " + str(cluster) + " for target gene " + target_gene)
            output_file_name = target_gene + "_cluster_" + str(cluster) + ".txt"
            if os.path.isfile(output_file_name):
                print(output_file_name + " already exists!")
                continue
            y = X_cluster[target_gene].copy(deep=True)
            columns_to_drop = [target_gene]
            # A label column may already be present on frames prepared elsewhere;
            # it must never be used as a feature.
            if "cluster_labels" in X_cluster.columns:
                columns_to_drop.append("cluster_labels")
            X_features = X_cluster.drop(columns=columns_to_drop)
            runLinearRegressionPipeline(X_features, y, output_file_name)

In [10]:
def getTopGenesPerCluster(cluster_top_genes_file="cluster_top_genes.csv"):
    """Read the cluster -> top-genes table from a comma-separated text file.

    Each non-blank line has the form ``<cluster_no>, <gene>, <gene>, ...``.
    Whitespace around every field (including the trailing newline on the last
    gene) is stripped, and blank lines and empty fields are ignored.

    Parameters
    ----------
    cluster_top_genes_file : path of the file to read; defaults to
        ``"cluster_top_genes.csv"`` in the working directory.

    Returns
    -------
    dict
        Maps the integer cluster number to the list of gene names on its line.

    Raises
    ------
    ValueError
        If the first field of a non-blank line is not an integer.
    """
    cluster_top_genes = {}
    with open(cluster_top_genes_file, "r") as fp:
        for line in fp:
            fields = [field.strip() for field in line.split(",")]
            if not any(fields):
                continue  # blank line
            cluster_no = int(fields[0])
            cluster_top_genes[cluster_no] = [gene for gene in fields[1:] if gene]
    return cluster_top_genes

In [11]:
def getClusterLabels(X):
    """Extract a cluster label from each row name of ``X``.

    Every index entry is split on ``"_"`` and its second piece is taken as
    the label.

    Returns
    -------
    list of str
        One label per row, in index order.
    """
    return [row_name.split("_")[1] for row_name in X.index]

In [12]:
def main2():
    """Run the whole-embryo modelling for a single hard-coded gene.

    Loads ``dge_normalized.txt`` (tab-separated), transposes it, and calls
    ``modellingUsingCompleteEmbryo`` with cluster 1 mapped to ``Act87E`` only.
    """
    expression_table = pd.read_csv("dge_normalized.txt", sep='\t', header=0)
    cells_by_genes = expression_table.transpose()
    single_gene_request = {1: ["Act87E"]}
    print("calling modellingUsingCompleteEmbryo")
    modellingUsingCompleteEmbryo(cells_by_genes, single_gene_request)

In [13]:
def main():
    """Run both modelling passes for every cluster's top genes.

    Loads ``dge_normalized.txt`` (tab-separated) and transposes it, derives a
    cluster label per row, reads the top genes per cluster, then runs the
    whole-embryo pass followed by the per-cluster pass.
    """
    expression_table = pd.read_csv("dge_normalized.txt", sep='\t', header=0)
    cells_by_genes = expression_table.transpose()
    labels_per_cell = getClusterLabels(cells_by_genes)
    top_genes_by_cluster = getTopGenesPerCluster()
    modellingUsingCompleteEmbryo(cells_by_genes, top_genes_by_cluster)
    modellingPerCluster(cells_by_genes, labels_per_cell, top_genes_by_cluster)

In [14]:
# Entry point: run the whole-embryo pass, then the per-cluster pass.
main()

modelling using complete empbryo for target gene spict
spict_complete.txt already exists!
modelling using complete empbryo for target gene Ada1-1
Ada1-1_complete.txt already exists!
modelling using complete empbryo for target gene CG18130
CG18130_complete.txt already exists!
modelling using complete empbryo for target gene Klp10A


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.2min finished


ratio(5.000000e-02) -- n: 87 -- alpha: 4.525979 -- mse: 2.507438


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.8min finished


ratio(1.000000e-01) -- n: 74 -- alpha: 2.262990 -- mse: 2.505802


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.9min finished


ratio(5.000000e-01) -- n: 51 -- alpha: 0.452598 -- mse: 2.515311


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.0min finished


ratio(7.000000e-01) -- n: 51 -- alpha: 0.323284 -- mse: 2.517231


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.9min finished


ratio(9.000000e-01) -- n: 49 -- alpha: 0.251443 -- mse: 2.518394


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


ratio(9.200000e-01) -- n: 49 -- alpha: 0.245977 -- mse: 2.518485


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.1min finished


ratio(9.500000e-01) -- n: 49 -- alpha: 0.238209 -- mse: 2.518616


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


ratio(9.700000e-01) -- n: 49 -- alpha: 0.233298 -- mse: 2.518700


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


ratio(9.900000e-01) -- n: 49 -- alpha: 0.228585 -- mse: 2.518781


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


ratio(1.000000e+00) -- n: 49 -- alpha: 0.226299 -- mse: 2.518821
for target_gene Klp10A, nonzero_coeffs_num, MSE, l1-ratio, alpha:
(49, 2.5183944215039094, 0.9, 0.25144329924377479)
modelling using complete empbryo for target gene mbl


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.3min finished


ratio(5.000000e-02) -- n: 0 -- alpha: 2.207959 -- mse: 0.377184


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished


ratio(1.000000e-01) -- n: 0 -- alpha: 1.103980 -- mse: 0.377369


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.2min finished


ratio(5.000000e-01) -- n: 0 -- alpha: 0.220796 -- mse: 0.377663


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.3min finished


ratio(7.000000e-01) -- n: 1 -- alpha: 0.157711 -- mse: 0.377693


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


ratio(9.000000e-01) -- n: 0 -- alpha: 0.122664 -- mse: 0.377710


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


ratio(9.200000e-01) -- n: 0 -- alpha: 0.119998 -- mse: 0.377712


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.5min finished


ratio(9.500000e-01) -- n: 0 -- alpha: 0.116208 -- mse: 0.377713


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.5min finished


ratio(9.700000e-01) -- n: 0 -- alpha: 0.113812 -- mse: 0.377715


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


ratio(9.900000e-01) -- n: 0 -- alpha: 0.111513 -- mse: 0.377716


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min finished


ratio(1.000000e+00) -- n: 0 -- alpha: 0.110398 -- mse: 0.377716
for target_gene mbl, nonzero_coeffs_num, MSE, l1-ratio, alpha:
(1, 0.37769290585498805, 0.7, 0.1577113793473004)
modelling using complete empbryo for target gene srl


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.3min finished


ratio(5.000000e-02) -- n: 236 -- alpha: 2.292252 -- mse: 2.133438


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished


ratio(1.000000e-01) -- n: 24 -- alpha: 2.469253 -- mse: 2.142086


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished


ratio(5.000000e-01) -- n: 20 -- alpha: 0.493851 -- mse: 2.144301


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.3min finished


ratio(7.000000e-01) -- n: 20 -- alpha: 0.352750 -- mse: 2.144904


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.3min finished


ratio(9.000000e-01) -- n: 20 -- alpha: 0.274361 -- mse: 2.145285


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.3min finished


ratio(9.200000e-01) -- n: 20 -- alpha: 0.268397 -- mse: 2.145316


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


ratio(9.500000e-01) -- n: 20 -- alpha: 0.259921 -- mse: 2.145361


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.5min finished


ratio(9.700000e-01) -- n: 19 -- alpha: 0.254562 -- mse: 2.145390


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


ratio(9.900000e-01) -- n: 19 -- alpha: 0.249420 -- mse: 2.145418


..............................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.5min finished


ratio(1.000000e+00) -- n: 18 -- alpha: 0.246925 -- mse: 2.145431
for target_gene srl, nonzero_coeffs_num, MSE, l1-ratio, alpha:
(18, 2.1454314468362869, 1, 0.2469253391786364)
modelling using complete empbryo for target gene sktl


.............

KeyboardInterrupt: 

.................