In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from matplotlib.ticker import FuncFormatter, MaxNLocator
from scipy.stats import spearmanr, pearsonr
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including solver-convergence and pandas copy warnings. Consider narrowing it.
warnings.filterwarnings('ignore')

# Base pairs per bin: qPCR positions are divided by this value and plot ticks
# are multiplied by it to be labelled in bp.
RESOLUTION = 25
# Number of bins per assay in one input window (columns pos_-100 ... pos_100).
window_size = 201

# Regularisation strength passed as `alpha` to the linear model.
l1_alpha = 0.001

assays = ['DNase', 'H3K36me3', 'H3K27me3', 'H3K27ac', 'H3K4me1', 'H3K4me3', 'H3K9me3']
assay_index = 7  # 1 through 7; this is the assay that's concatenated with H3K27ac in the input features

# Fail early on settings that would otherwise silently select the wrong rows/columns.
assert window_size % 2 == 1, "window_size must be odd so the window is centred on bin 0"
assert 1 <= assay_index <= len(assays), "assay_index must be a 1-based index into `assays`"

In [None]:
# Column names for the header-less table: one signal column per bin offset
# relative to the window centre, followed by the target and the metadata fields.
colnames = [f"pos_{offset}" for offset in range(-(window_size // 2), window_size // 2 + 1)]
colnames += ["log10p1(TPM)", "cell_type", "chrom", "position", "strand", "assay_index"]

# The file carries no header row, so the names above are supplied explicitly.
df = pd.read_csv('../Data/T.2', sep=",", names=colnames, low_memory=False)
df

In [None]:
# Sort so that rows sharing (chrom, position, cell_type) are adjacent and ordered
# by assay_index; the strided selections below depend on this ordering.
df_sorted = df.sort_values(['chrom', 'position', 'cell_type', 'assay_index'])
df_sorted_unique = df_sorted.drop_duplicates()

# Pick every n-th row, starting at the assay's 0-based position inside a group.
n_assays = len(assays)
df_H3K27ac = df_sorted_unique.iloc[assays.index('H3K27ac')::n_assays, :]
df_other_assay = df_sorted_unique.iloc[assay_index - 1::n_assays, :]

# The stride trick is only valid when every group holds exactly one row per
# assay; if that is violated the selections mix assays, so check it explicitly.
assert df_H3K27ac['assay_index'].nunique() <= 1, "H3K27ac selection mixes several assays"
assert df_other_assay['assay_index'].nunique() <= 1, "second-assay selection mixes several assays"

df_merged = pd.merge(
    df_H3K27ac,
    df_other_assay,
    on=['cell_type', 'chrom', 'position', 'strand'],
    suffixes=('_H3K27ac', '_' + assays[assay_index - 1]),
)

# Column layout of df_merged:
#   [0, window_size)                 H3K27ac bins
#   window_size                      H3K27ac copy of log10p1(TPM)
#   window_size + 1 .. + 5           cell_type, chrom, position, strand, assay_index
#   [window_size + 6, 2*window_size + 6)   second-assay bins
#   2*window_size + 6                second-assay copy of log10p1(TPM)
# Keep: H3K27ac bins, second-assay bins, the target, and finally 'chrom'.
n_left_cols = window_size + 6
df_processed = df_merged.iloc[:, np.r_[0:window_size,
                                       n_left_cols:(n_left_cols + window_size + 1),
                                       window_size + 2]]
df_processed

In [None]:
# The first 2 * window_size columns of df_processed are the features; the column
# right after them is the regression target.
n_features = window_size + window_size

# Training set: even-numbered chromosomes chr2 ... chr22.
even_chroms = [f"chr{c}" for c in range(2, 23, 2)]
df_even_chroms = df_processed[df_processed['chrom'].isin(even_chroms)]
xTrain, yTrain = df_even_chroms.iloc[:, :n_features], df_even_chroms.iloc[:, n_features]

# Test set: odd-numbered chromosomes chr1 ... chr21.
odd_chroms = [f"chr{c}" for c in range(1, 23, 2)]
df_odd_chroms = df_processed[df_processed['chrom'].isin(odd_chroms)]
xTest, yTest = df_odd_chroms.iloc[:, :n_features], df_odd_chroms.iloc[:, n_features]

# A random split was used previously instead of the chromosome-based one:
# xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.30, random_state=42)
tuple(part.shape for part in (xTrain, xTest, yTrain, yTest))

In [None]:
# Alternative estimators that were tried in this cell:
# model = linear_model.LinearRegression()
# model = RandomForestRegressor(verbose=0, n_estimators=100, n_jobs=12)

# ElasticNet with l1_ratio=1.0 uses a pure L1 penalty of strength l1_alpha.
model = linear_model.ElasticNet(max_iter=1000, l1_ratio=1.0, alpha=l1_alpha)

# NOTE(review): the last training row is left out of the fit; the reason is not
# visible in this notebook — confirm whether this is intentional.
fit_rows = slice(None, -1)
model.fit(xTrain.iloc[fit_rows, :], yTrain.iloc[fit_rows])

In [None]:
def _regression_scores(y_true, y_pred):
    """Return (mean squared error, Pearson r, Spearman rho) for a set of predictions."""
    pearson_r, _ = pearsonr(y_true, y_pred)
    spearman_rho, _ = spearmanr(y_true, y_pred)
    return mean_squared_error(y_true, y_pred), pearson_r, spearman_rho


# Predict on both splits, then score each split with the same three metrics.
yTrain_Pred = model.predict(xTrain)
yTest_Pred = model.predict(xTest)
mse_Train, pc_Train, sc_Train = _regression_scores(yTrain, yTrain_Pred)
mse_Test, pc_Test, sc_Test = _regression_scores(yTest, yTest_Pred)

print("Train: MSE = ", round(mse_Train, 3), "Pearson =", round(pc_Train, 3), "Spearman =", round(sc_Train, 3))
print("Test: MSE = ", round(mse_Test, 3), "Pearson =", round(pc_Test, 3), "Spearman =", round(sc_Test, 3))

In [None]:
# Scatter of held-out targets against model predictions, titled with the test
# correlations computed in the evaluation cell.
plt.rcParams["figure.figsize"] = (20, 12)
fig, ax = plt.subplots()
ax.plot(yTest, yTest_Pred, 'o', markersize=4, color='green')
ax.set_xlabel("True Normalized TPM", size=40)
ax.set_ylabel("Predicted Normalized TPM", size=40)
ax.set_xlim(-0.5, 4)
ax.set_ylim(-0.5, 2)
ax.set_title("Pearson = %s Spearman = %s" % (round(pc_Test, 3), round(sc_Test, 3)), size=50)
plt.show()

In [None]:
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(xTrain.iloc[:10, :])
# shap.summary_plot(shap_values, xTrain.iloc[:10, :])
# shap.plots.beeswarm(shap_values, xTest.iloc[:10, :])

In [None]:
# Define in-silico epigenesis for a pair of input assays where the first assay is H3K27ac
def ise(gene, trained_model, assay_index, inserted_minuslog10p_value = 3, peak_width = 2):
    """Predict the expression fold change caused by inserting an H3K27ac peak at each bin.

    The feature array is read from ``gene + ".npy"``. For every bin position a
    copy of the array is made and ``inserted_minuslog10p_value`` is added to the
    H3K27ac channel (index 3) on the bins ``pos - peak_width // 2`` through
    ``pos + peak_width // 2``, clipped to the window. Bins whose H3K27me3
    channel (index 2) exceeds 1.1 are left untouched. The model is then queried
    on the H3K27ac channel concatenated with channel ``assay_index``.

    Args:
        gene: Path prefix of the feature file (``".npy"`` is appended). The array
            is indexed as (sample, bin, assay); the notebook squeezes axis 0 of
            the same file elsewhere, so a leading axis of length 1 is expected.
        trained_model: Object exposing ``predict(X)`` on a
            (sample, 2 * bins) array; only the first prediction is used.
        assay_index: 0-based channel of the second assay fed to the model.
        inserted_minuslog10p_value: Amount added to the H3K27ac signal.
        peak_width: Width parameter of the inserted peak, in bins.

    Returns:
        1-D ``np.ndarray`` with one value per bin:
        ``(10**y_modified - 1) / (10**y_unmodified - 1)``. The denominator is
        zero when the unmodified prediction is exactly 0.
    """
    h3k27me3_channel, h3k27ac_channel = 2, 3
    h3k27me3_peak_threshold = 1.1

    X = np.load(gene + ".npy")
    # Take the window length from the data itself rather than from a global.
    n_bins = X.shape[1]
    half_width = peak_width // 2

    def to_model_input(features):
        # Simple-model input: H3K27ac bins followed by the second assay's bins.
        return np.concatenate([features[:, :, h3k27ac_channel], features[:, :, assay_index]], axis=1)

    # Perform inference by introducing p-value changes with a peak width
    yPred = []
    for pos in range(n_bins):
        X_modified = np.copy(X)
        # TODO: Calculate number of bins exactly
        # Clip the peak to the window; the last bin (n_bins - 1) is a valid
        # target, which the previous `p < max(positions)` test excluded.
        first_bin = max(pos - half_width, 0)
        last_bin = min(pos + half_width, n_bins - 1)
        for p in range(first_bin, last_bin + 1):
            # If H3K27me3 peak exists, then p300 doesn't work
            blocked = X_modified[:, p, h3k27me3_channel] > h3k27me3_peak_threshold
            X_modified[~blocked, p, h3k27ac_channel] += inserted_minuslog10p_value

        yPred.append(trained_model.predict(to_model_input(X_modified))[0])

    # Instead of scaling, divide by the prediction for the unmodified input
    yPred_value = trained_model.predict(to_model_input(X))[0]
    yPred = (np.power(10, yPred) - 1) / (np.power(10, yPred_value) - 1)

    return yPred


def p_value_mapping(inserted_minuslog10p_value):
    """Undo the log1p transform of an inserted signal value.

    Despite the name, the result is the -log10(p-value) itself, i.e.
    ``expm1(inserted_minuslog10p_value)`` rounded to two decimals; the p-value
    is not computed or returned.
    """
    return round(np.expm1(inserted_minuslog10p_value), 2)


def assay_name(assay):
    """Return the entry of the module-level ``assays`` list at index ``assay``."""
    label = assays[assay]
    return label


def _load_p300_qpcr(tsv_path):
    """Read one p300 qPCR table and derive per-position means.

    Args:
        tsv_path: Tab-separated file with at least a 'Position_wrt_TSS' column.

    Returns:
        (raw, values, means) where `raw` is the file as read, `values` is `raw`
        without the rows whose position is "Control" and with the position
        divided by RESOLUTION, and `means` holds the mean of every numeric
        column per position, with 'Position_wrt_TSS' as a regular column.
    """
    raw = pd.read_csv(tsv_path, sep="\t")
    # .copy() so the column assignment below does not write into a slice of `raw`.
    values = raw[raw['Position_wrt_TSS'] != "Control"].copy()
    values["Position_wrt_TSS"] = pd.to_numeric(values["Position_wrt_TSS"], errors='coerce') / RESOLUTION
    # numeric_only=True keeps this working on pandas >= 2 should the table
    # contain non-numeric columns; groupby already names the index.
    means = values.groupby('Position_wrt_TSS').mean(numeric_only=True).reset_index()
    return raw, values, means


# Load Alan's H3K27ac p300 dataset
df_CXCR4, df_CXCR4_values, df_CXCR4_means = _load_p300_qpcr("../Data/CXCR4.p300.tsv")

df_TGFBR1, df_TGFBR1_values, df_TGFBR1_means = _load_p300_qpcr("../Data/TGFBR1.p300.tsv")


#Perform in-silico epigenesis
genes = ["CXCR4", "TGFBR1"]  # , "High", "Medium", "Low"]
assay_color = ['black', 'red', 'green', 'blue', 'cyan', 'pink', 'brown']
xticklabels = range(-(window_size // 2), (window_size // 2) + 1)

# Settings shared by every gene; they do not change inside the loop.
inserted_minuslog10p_value = 2.5
peak_width = 4
plt.rcParams["figure.figsize"] = (20, 10)

# qPCR measurements per gene: (individual measurements, per-position means).
qpcr_by_gene = {
    "CXCR4": (df_CXCR4_values, df_CXCR4_means),
    "TGFBR1": (df_TGFBR1_values, df_TGFBR1_means),
}
# Genes without qPCR data get empty frames that still have the expected columns.
qpcr_missing = (df_CXCR4_values.iloc[0:0, :].copy(), df_CXCR4_means.iloc[0:0, :].copy())

# Per-gene display settings: the model curve is drawn as scale * prediction + offset.
cxcr4_plot = dict(scale=1.0, offset=0.0, ylim=(-0.9, 8.5),
                  prediction_label="Model predictions ", feature_label="Epigenetic features ")
default_plot = dict(scale=0.5, offset=0.5, ylim=(-0.5, 3),
                    prediction_label="Model Predictions ", feature_label="Epigenetic Features ")

h3k27ac = assays.index('H3K27ac')

for gene in genes:

    gene_features = np.squeeze(np.load(gene + ".npy"), axis=0)

    # Use dedicated names here: rebinding `df` would clobber the training table
    # loaded at the top of the notebook and break re-running earlier cells.
    df_qpcr_values, df_means = qpcr_by_gene.get(gene, qpcr_missing)
    plot_cfg = cxcr4_plot if gene == "CXCR4" else default_plot

    gene_ise = ise(gene, model, assay_index - 1, inserted_minuslog10p_value, peak_width)

    fig = plt.figure(1)
    ax = fig.gca()

    ax.set_xlim(-102, 102)
    ax.plot(xticklabels, plot_cfg["scale"] * gene_ise + plot_cfg["offset"], 'o-',
            color=assay_color[h3k27ac], markersize=3,
            label=plot_cfg["prediction_label"] + assays[h3k27ac])
    ax.plot(xticklabels, gene_features[:, h3k27ac], 'o-',
            color='darkorange', markersize=3,
            label=plot_cfg["feature_label"] + assays[h3k27ac])
    ax.set_ylim(*plot_cfg["ylim"])

    # Bars start at a fold change of 1 and extend to the experimental mean.
    ax.bar(df_means['Position_wrt_TSS'], 1.0 * df_means['Fold_Change'] - 1, color='deepskyblue', bottom=1, width=2, label="Experimental mean from qPCR")
    ax.plot(df_qpcr_values['Position_wrt_TSS'], df_qpcr_values['Fold_Change'], 'o', color='darkgray', label="Experimental data from qPCR", markersize=10)

    # Label the x axis in bp instead of bins. A formatter keeps labels tied to
    # the tick positions, unlike set_xticklabels on automatically placed ticks.
    ax.xaxis.set_major_formatter(FuncFormatter(lambda bin_offset, _pos: '{:3.0f}'.format(bin_offset * RESOLUTION)))
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis='both', which='major', labelsize=30)
    ax.tick_params(axis='both', which='minor', labelsize=30)
    ax.set_xlabel("Peak Position (in bp) w.r.t TSS", size=50)
    ax.set_ylabel("Gene expression fold change", size=40)
    ax.legend(loc='upper center', prop={'size': 25}, ncol=2)
    ax.set_title("Gene "+gene+": inserting " + str(peak_width * RESOLUTION) + "bp peaks of -log10(p_value)="+str(p_value_mapping(inserted_minuslog10p_value)), size=35)
    plt.show()
    plt.close(fig)

In [None]:
# Fitted coefficients in feature order: the first window_size entries belong to
# the H3K27ac bins, the remaining ones to the second assay's bins.
fig, ax = plt.subplots()
ax.plot(model.coef_)
# Dashed line marks the boundary between the two assays' coefficients.
ax.axvline(window_size - 0.5, color='gray', linestyle='--')
ax.set_xlabel("Feature index (H3K27ac bins, then " + assays[assay_index - 1] + " bins)", size=30)
ax.set_ylabel("Coefficient", size=30)
ax.set_title("Model coefficients", size=35)
plt.show()