In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import Ridge

In [2]:
def getClusterLabels(X):
    """Derive one cluster label per row of ``X`` from the row's index entry.

    Every index entry is split on underscores and the second piece
    (position 1) is used as that row's label.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose index holds underscore-delimited string names.

    Returns
    -------
    list of str
        One label per row, in index order.

    Raises
    ------
    IndexError
        If an index entry contains no underscore.
    """
    return [row_name.split("_")[1] for row_name in X.index]

In [3]:
def getTargetGenesWithClusters():
    """Return the hard-coded mapping from target gene to its cluster labels.

    Returns
    -------
    dict
        Maps each target gene name (str) to a list of cluster labels given
        as strings. A new dict with new lists is built on every call, and
        the key order below is the iteration order.
    """
    gene_cluster_pairs = (
        ('twe', ['4', '2']),
        ('sna', ['4', '2']),
        ('htl', ['4']),
        ('tin', ['4']),
        ('eve', ['3']),
        ('brk', ['3', '5']),
        ('vnd', ['3']),
        ('rho', ['6', '11']),
        ('sli', ['3', '6']),
        ('pnt', ['3', '5']),
        ('ind', ['3']),
        ('sog', ['3', '8']),
        ('ths', ['3']),
        ('zen', ['6', '10']),
        ('pnr', ['6', '3']),
        ('shn', ['4', '3']),
        ('tup', ['6', '10']),
        ('ush', ['6', '10']),
    )
    return dict(gene_cluster_pairs)

In [4]:
def parseFile(file_name):
    """Read the selected feature genes and the ridge penalty from a text file.

    The file is scanned line by line for ``key:value`` entries. Three keys
    are recognised; every other line is ignored:

    * ``select_features_1`` / ``select_features_2`` -- comma-separated gene
      names. A trailing comma is allowed but not required, and whitespace
      around names is removed.
    * ``ridge_alpha`` -- a value parsed with ``float``.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(feature_genes, ridge_alpha)``. ``feature_genes`` is the
        ``select_features_2`` list when it is non-empty, otherwise the
        ``select_features_1`` list (``[]`` if neither was found).
        ``ridge_alpha`` is ``0.0`` when the file has no ``ridge_alpha`` entry.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the ``ridge_alpha`` value cannot be parsed as a float.
    """
    feature_lists = {"select_features_1": [], "select_features_2": []}
    ridge_alpha = 0.0
    with open(file_name, "r") as fp:
        for line in fp:
            # Split on the first colon only, so a value that itself contains
            # a colon is kept whole and a key without any colon yields "".
            key, _, value = line.strip().partition(":")
            if key in feature_lists:
                # Discard empty pieces instead of blindly dropping the last
                # one: a list without a trailing comma keeps its final gene.
                names = (piece.strip() for piece in value.split(","))
                feature_lists[key] = [name for name in names if name]
            elif key == "ridge_alpha":
                ridge_alpha = float(value)
    if len(feature_lists["select_features_2"]) > 0:
        return (feature_lists["select_features_2"], ridge_alpha)
    return (feature_lists["select_features_1"], ridge_alpha)

In [5]:
def getRidgeRegressionModel(X, y, ridge_alpha):
    """Fit a ridge regression of ``y`` on ``X`` and return the fitted model.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to ``Ridge.fit``.
    y : array-like
        Target values passed unchanged to ``Ridge.fit``.
    ridge_alpha : float
        Regularisation strength handed to ``Ridge`` as ``alpha``.

    Returns
    -------
    sklearn.linear_model.Ridge
        The estimator after fitting (``max_iter`` is fixed at 2000).
    """
    regressor = Ridge(max_iter=2000, alpha=ridge_alpha)
    regressor.fit(X, y)
    return regressor

In [6]:
def plotCoefficients(features, model, plot_title, image_file_name):
    """Save a bar chart of a fitted model's coefficients to an image file.

    Parameters
    ----------
    features : sequence
        Labels for the bars; used as the index of the coefficient series,
        so it must have one entry per value in ``model.coef_``.
    model : object
        Fitted estimator exposing a ``coef_`` attribute.
    plot_title : str
        Title drawn above the chart.
    image_file_name : str
        Destination path handed to ``savefig`` (written at 600 dpi).
    """
    fig = plt.figure()
    axes = fig.gca()
    coefficients = pd.Series(data=model.coef_, index=features)
    coefficients.plot.bar(ax=axes)
    # Very small tick labels so that many feature names fit along the x axis.
    plt.xticks(fontsize=4)
    axes.set_title(plot_title)
    fig.savefig(image_file_name, dpi=600)
    # Clear and close so figures do not pile up across repeated calls.
    fig.clf()
    plt.close(fig)

In [7]:
def plotObservedVsPredicted(X, y, model, plot_title, image_file_name):
    """Save a scatter plot of measured versus predicted target values.

    The predictions come from ``model.predict(X)``. A dashed black diagonal
    from ``(y.min(), y.min())`` to ``(y.max(), y.max())`` marks where
    prediction equals measurement.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``model.predict``.
    y : array-like
        Measured target values; must provide ``min()`` and ``max()``.
    model : object
        Fitted estimator exposing ``predict``.
    plot_title : str
        Title drawn above the plot.
    image_file_name : str
        Destination path handed to ``savefig`` (written at 600 dpi).
    """
    y_pred = model.predict(X)
    # Create exactly one figure, so closing it below releases everything
    # this call opened and nothing accumulates across repeated calls.
    fig, ax = plt.subplots()
    ax.scatter(y, y_pred)
    y_low, y_high = y.min(), y.max()
    ax.plot([y_low, y_high], [y_low, y_high], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(plot_title)
    fig.savefig(image_file_name, dpi=600)
    plt.close(fig)

In [8]:
def _fitAndPlotTargetGene(cells_genes_df, target_gene, params_file_name, plot_title,
                          coeff_file_name, obs_pred_file_name, clusters=None):
    """Fit one ridge model for ``target_gene`` and save its two diagnostic plots.

    Parameters
    ----------
    cells_genes_df : pandas.DataFrame
        Table containing a ``target_gene`` column, the feature columns named
        in the parameter file, and a ``"cluster_labels"`` column.
    target_gene : str
        Column used as the regression target.
    params_file_name : str
        File read with ``parseFile`` to obtain the feature list and alpha.
    plot_title : str
        Title used for both plots.
    coeff_file_name : str
        Output path for the coefficient bar chart.
    obs_pred_file_name : str
        Output path for the measured-vs-predicted scatter plot.
    clusters : list of str, optional
        When given, only rows whose ``"cluster_labels"`` value is in this
        list are used. ``None`` (the default) uses every row.

    Nothing is fitted or written when the parameter file lists no features.
    """
    feature_genes, ridge_alpha = parseFile(params_file_name)
    if len(feature_genes) == 0:
        return

    # slice the rows
    selected_df = cells_genes_df
    if clusters is not None:
        selected_df = selected_df[selected_df["cluster_labels"].isin(clusters)]

    # slice the columns; dropping the target and the label column first means
    # a feature list that names either of them fails loudly with a KeyError.
    y = selected_df[target_gene].copy(deep=True)
    X = selected_df.drop([target_gene, "cluster_labels"], axis=1)
    X = X[feature_genes]

    ridge_model = getRidgeRegressionModel(X, y, ridge_alpha)

    # savefig does not create missing directories, so make sure they exist.
    for image_file_name in (coeff_file_name, obs_pred_file_name):
        output_directory = os.path.dirname(image_file_name)
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)

    plotCoefficients(feature_genes, ridge_model, plot_title, coeff_file_name)
    plotObservedVsPredicted(X, y, ridge_model, plot_title, obs_pred_file_name)


def main():
    """Fit and plot ridge models for every target gene, twice per gene.

    Reads the tab-delimited file ``dge_normalized.txt``, transposes it and
    attaches a ``"cluster_labels"`` column derived from the row names. For
    each gene returned by ``getTargetGenesWithClusters`` it then models the
    gene once using all rows (parameters and plots under
    ``using_embryo_2/``) and once using only the rows in that gene's
    clusters (parameters and plots under ``using_clusters_2/``).
    """
    data_file_name = "dge_normalized.txt"
    gene_sc_df = pd.read_csv(data_file_name, delimiter='\t', header=0)
    cells_genes_df = gene_sc_df.T
    cells_genes_df["cluster_labels"] = getClusterLabels(cells_genes_df)

    embryo_directory = "using_embryo_2/"
    clusters_directory = "using_clusters_2/"
    for target_gene, clusters in getTargetGenesWithClusters().items():
        # modeling using entire embryo
        _fitAndPlotTargetGene(
            cells_genes_df,
            target_gene,
            params_file_name=embryo_directory + target_gene + "_complete.txt",
            plot_title=target_gene + " (using all cells)",
            coeff_file_name=embryo_directory + "coeff_plots/" + target_gene + "_complete_coeff.jpg",
            obs_pred_file_name=embryo_directory + "obs_pred_plots/" + target_gene + "_complete_qq.jpg",
        )
        # modeling using only target_gene's clusters
        _fitAndPlotTargetGene(
            cells_genes_df,
            target_gene,
            params_file_name=clusters_directory + target_gene + "_cluster_" + '_'.join(clusters) + ".txt",
            plot_title=target_gene + " - using cluster(s) " + ",".join(clusters),
            coeff_file_name=clusters_directory + "coeff_plots/" + target_gene + "_clusters_coeff.jpg",
            obs_pred_file_name=clusters_directory + "obs_pred_plots/" + target_gene + "_clusters_qq.jpg",
            clusters=clusters,
        )

In [9]:
# Entry point: run the whole pipeline (fits the models and writes the plot images).
main()



References:  
- https://scikit-learn.org/0.18/auto_examples/plot_cv_predict.html
- https://medium.com/@dhwajraj/learning-python-regression-analysis-part-9-tests-and-validity-for-regression-models-78dcd5cde3a1