To support the impact score for proteins on pathways, we also analyzed a scRNA-seq dataset downloaded from the tabular project. We performed a systematic correlation analysis using BigScale, an R package. The generated correlation data were then filtered by choosing the top 0.1% of correlations, as described in the original BigScale correlation paper (https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1713-4). Java code is used to choose these correlations and then perform pathway enrichment analysis, as shown here: https://github.com/reactome-idg/fi-network-ml/blob/99022e85c2079b445913cdafc637be5ec2c39509/src/main/java/org/reactome/idg/bn/BigScaleCorrelationAnalyzer.java#L51. The basic steps are similar to those of the NLP analysis. Therefore, this notebook is placed in this folder for easy maintenance.

In [None]:
from IPython.display import display, HTML
# Stretch the ".container" element to 100% width so wide tables and figures
# can use the whole screen.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [None]:
# Reuse functions used for NLP
# import sys
# sys.path.insert(1, '.')
import TopicModelingResultAnalyzer as result_analyzer
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Folder that the scRNA-seq input and output paths in this notebook are built from.
# Make sure this folder is right for the machine running the notebook.
dir_name = '/Volumes/ssd/results/reactome-idg/fi-network-ml/impact_analysis/scRNASeq'

In [None]:
# The correlation is pair-wise, so the file is big and loading it is quite slow.
# Avoid re-running this cell once cor_df is in memory!
# The first CSV column is used as the row index.
cor_file_name = f'{dir_name}/tabula_blood_corr_recursive.csv'
cor_df = pd.read_csv(cor_file_name, index_col=0)

In [None]:
# Report the (rows, columns) of the correlation matrix, then preview its first rows.
n_rows, n_cols = cor_df.shape
print((n_rows, n_cols))
cor_df.head(n=5)

In [None]:
# Need to fix the column names (an issue from R): every '.' in a column
# name is replaced with '-'.
cor_df.columns = [col_name.replace('.', '-') for col_name in cor_df.columns]
cor_df

In [None]:
# Histogram (1000 bins) of the stacked correlation values.
# The distribution is quite positive with the peak around 0.50. This is a little bit unexpected!
stacked_correlations = cor_df.stack()
stacked_correlations.plot.hist(bins=1000)

In [None]:
# Tab-delimited impact analysis results, located one level above dir_name.
file_name = f"{dir_name}/../impact_analysis_092121_with_enrichment_092921_with_scRNASeq_082322.txt"
impact_result_df_with_bigscale = pd.read_csv(file_name, sep='\t')
impact_result_df_with_bigscale.head()

In [None]:
# Note: This is a very long process, taking about 17 minutes
# NOTE(review): calculate_correlation is called unqualified and is not defined in
# the visible cells -- confirm it is in scope before running this cell.
# Add -log10(BigScale_FDR) as the score column used for the correlation.
bigscale_fdr = impact_result_df_with_bigscale['BigScale_FDR']
impact_result_df_with_bigscale['BigScale_FDR_Log10'] = -np.log10(bigscale_fdr)
print(impact_result_df_with_bigscale.head())
df_cor_fdr_bigscale = calculate_correlation(
    impact_result_df_selected=impact_result_df_with_bigscale,
    cor_col_name='BigScale_FDR_Log10')
# file_name = dir_name + '/df_cor_fdr_bigscale_082322.csv'
file_name = f'{dir_name}/df_cor_fdr_bigscale_113022.csv'
df_cor_fdr_bigscale.to_csv(file_name)

In [None]:
# This cell runs for about 30 minutes
# NOTE(review): calculate_correlation is called unqualified and is not defined in
# the visible cells -- confirm it is in scope before running this cell.

# Average_Activation against the BigScale score
df_cor_average_activation_bigscale = calculate_correlation(
    impact_result_df_selected=impact_result_df_with_bigscale,
    cor_col_name='BigScale_FDR_Log10',
    impact_col_name='Average_Activation')
# file_name = dir_name + '/df_cor_average_activation_bigscale_082322.csv'
file_name = f'{dir_name}/df_cor_average_activation_bigscale_113022.csv'
df_cor_average_activation_bigscale.to_csv(file_name)

# Average_Inhibition against the BigScale score
df_cor_average_inhibition_bigscale = calculate_correlation(
    impact_result_df_selected=impact_result_df_with_bigscale,
    cor_col_name='BigScale_FDR_Log10',
    impact_col_name='Average_Inhibition')
# NOTE(review): this file is stamped 110322 while the others use 113022; the same
# name is used where the file is read back, so it is kept unchanged here.
# file_name = dir_name + '/df_cor_average_inhibition_bigscale_082322.csv'
file_name = f'{dir_name}/df_cor_average_inhibition_bigscale_110322.csv'
df_cor_average_inhibition_bigscale.to_csv(file_name)

In [None]:
# Load these results without re-calculation, which is very long
file_name = f'{dir_name}/df_cor_fdr_bigscale_113022.csv'
df_cor_fdr_bigscale = pd.read_csv(file_name)
file_name = f'{dir_name}/df_cor_average_activation_bigscale_113022.csv'
df_cor_average_activation_bigscale = pd.read_csv(file_name)
file_name = f'{dir_name}/df_cor_average_inhibition_bigscale_110322.csv'
df_cor_average_inhibition_bigscale = pd.read_csv(file_name)

# Plots: one panel per result table, 30 x 10 inches in total
fig, axes = plt.subplots(nrows=1, ncols=3)
fig.tight_layout(pad=.01)
plt.subplots_adjust(bottom=0.2, top=0.95, hspace=0.0)
plt.rcParams.update({'font.size': 20})  # This apply to titles only
fig.set_figwidth(30)
fig.set_figheight(10)

# (panel title, result table), drawn left to right
scatter_panels = (
    ('FDR', df_cor_fdr_bigscale),
    ('Average_Activation', df_cor_average_activation_bigscale),
    ('Average_Inhibition', df_cor_average_inhibition_bigscale),
)
for panel_ax, (panel_title, panel_df) in zip(axes, scatter_panels):
    # Pearson correlation on x, -log10 of its p-value on y
    # ('Peason_P_Value' is the column's actual spelling)
    g = sns.scatterplot(x=panel_df.Pearson_Cor,
                        y=-np.log10(panel_df.Peason_P_Value),
                        ax=panel_ax)
    g.set_title(panel_title)
fig.savefig(f'{dir_name}/df_cor_bigscale_113022_scatter_plot.pdf')

In [None]:
# For annotation
# Following tutorial: https://levelup.gitconnected.com/statistics-on-seaborn-plots-with-statannotations-2bfce0394c00
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
def annotate_plot(x,
                  y,
                  ax,
                  pairs,
                  pval = None):
    """Annotate the plot on ``ax`` with one p-value for the groups in ``pairs``.

    Parameters
    ----------
    x : group label of every observation; compared with ``==`` against the
        two labels in ``pairs[0]``.
    y : values of the observations; indexed with the boolean mask
        ``x == label``.
    ax : axes holding the plot to annotate.
    pairs : list of label pairs handed to ``Annotator``. Only one p-value is
        set, computed from ``pairs[0]``, so a single pair is expected.
    pval : pre-computed p-value. When ``None``, a Mann-Whitney U test is run
        on the ``y`` values of the two groups in ``pairs[0]``.

    The direction of the Mann-Whitney test follows the first label of
    ``pairs[0]``: ``'True'``/``True`` tests "greater", ``'False'``/``False``
    tests "less", and any other label falls back to a two-sided test.
    """
    annotator = Annotator(ax,
                          pairs,
                          x=x,
                          y=y)
    if pval is None:
        first_label, second_label = pairs[0][0], pairs[0][1]
        # Compare on the string form: a plain truthiness check would treat the
        # non-empty string 'False' as true and silently pick 'greater'.
        alternative_by_label = {'True': 'greater', 'False': 'less'}
        alternative = alternative_by_label.get(str(first_label), 'two-sided')
        values_1 = y[x == first_label]
        values_2 = y[x == second_label]
        # print('Alternative: {}'.format(alternative))
        test_results = mannwhitneyu(values_1, values_2, alternative=alternative)
        pval = test_results.pvalue
    annotator.set_pvalues([pval])
    annotator.annotate()

def set_up_subplots(nrow=1, ncols=4):
    """Create a 36 x 12 inch figure with an ``nrow`` x ``ncols`` grid of axes.

    Also sets the global matplotlib ``font.size`` to 20.

    Returns the ``(fig, axes)`` pair produced by ``plt.subplots``.
    """
    fig, axes = plt.subplots(nrows=nrow, ncols=ncols)
    fig.tight_layout(pad=0.05)
    margins = dict(left=0.03, right=0.99, bottom=0.1, top=0.95)
    gaps = dict(hspace=0.15, wspace=0.2)
    plt.subplots_adjust(**margins, **gaps)
    plt.rcParams['font.size'] = 20  # This apply to titles only
    fig.set_size_inches(36, 12)
    return fig, axes

def plot_correlation_distribution(cor_df,
                                  out_file_name):
    """Draw a four-panel summary of the correlations in ``cor_df`` and save it.

    Panels, left to right:
      1. scatter of ``Pearson_Cor`` against -log10(``Peason_P_Value``);
      2. bar plot of how many correlations are > 0 versus not, annotated with
         the p-value of a proportion z-test against 0.50;
      3. box plot of -log10(p-value) split by correlation sign;
      4. box plot of the absolute correlation split by correlation sign.
    Panels 3 and 4 are annotated through ``annotate_plot``.

    ``cor_df`` is modified in place: an ``Is_Cor_Positive`` column holding the
    strings "True"/"False" is added and the rows are sorted by it. The figure
    is written to ``out_file_name``.
    """
    # Plot individual score correlations for the manuscript
    sign_col = 'Is_Cor_Positive'
    sign_axis_label = 'Pearson Correlation > 0'
    log_p_axis_label = '-Log10(pValue)'
    positive_vs_negative = ('True', 'False')

    fig, axes = set_up_subplots(nrow=1, ncols=4)
    scatter_ax, count_ax, pvalue_ax, abs_cor_ax = axes

    # Panel 1: correlation against significance
    # ('Peason_P_Value' is the column's actual spelling)
    panel = sns.scatterplot(data=cor_df,
                            x='Pearson_Cor',
                            y=-np.log10(cor_df.Peason_P_Value),
                            # hue = 'Annotated',
                            ax=scatter_ax)
    panel.set_xlabel('Pearson Correlation')
    panel.set_ylabel(log_p_axis_label)

    # Left and right dots: label every row by the sign of its correlation
    is_positive = cor_df['Pearson_Cor'] > 0.0
    cor_df[sign_col] = is_positive.map({True: "True", False: "False"})
    cor_df.sort_values(by=sign_col, ascending=True, inplace=True)

    # Panel 2: counts per sign
    is_cor_positive_count = cor_df.groupby(sign_col).count()
    print("Plot positive and negative counts:")
    panel = sns.barplot(data=is_cor_positive_count,
                        x=is_cor_positive_count.index,
                        y='Pearson_Cor',
                        ax=count_ax)
    panel.set_xlabel(sign_axis_label)
    panel.set_ylabel('Counts')
    # Need to do proportion: test the share of positive correlations against 0.50
    from statsmodels.stats.proportion import proportions_ztest
    sign_counts = is_cor_positive_count['Pearson_Cor']
    true_count = sign_counts['True']
    false_count = sign_counts['False']
    print(sign_counts)
    _, pval = proportions_ztest(true_count, true_count + false_count, 0.50)
    annotate_plot(x=is_cor_positive_count.index,
                  y=is_cor_positive_count.Pearson_Cor,
                  ax=count_ax,
                  pairs=[positive_vs_negative],
                  pval=pval)

    # Panel 3: p-values per sign
    print("Plot p-values:")
    neg_log_p = -np.log10(cor_df.Peason_P_Value)
    panel = sns.boxplot(data=cor_df,
                        x=sign_col,
                        y=neg_log_p,
                        ax=pvalue_ax)
    panel.set_xlabel(sign_axis_label)
    panel.set_ylabel(log_p_axis_label)
    annotate_plot(x=cor_df[sign_col],
                  y=neg_log_p,
                  ax=pvalue_ax,
                  pairs=[positive_vs_negative])

    # Panel 4: absolute correlations per sign
    print("Plot absolute correlations:")
    abs_cor = np.abs(cor_df['Pearson_Cor'])
    panel = sns.boxplot(data=cor_df,
                        x=sign_col,
                        y=abs_cor,
                        ax=abs_cor_ax)
    panel.set_xlabel(sign_axis_label)
    panel.set_ylabel('Absolute Correlation')
    annotate_plot(x=cor_df[sign_col],
                  y=abs_cor,
                  ax=abs_cor_ax,
                  pairs=[positive_vs_negative])

    fig.savefig(out_file_name)

def plot_correlation_comparison(cor_df,
                                fig_out_file_name):
    """Compare correlation results across ``Annotated`` / ``Tdark`` groups.

    Draws one box plot each for ``Pearson_Cor``, -log10(``Peason_P_Value``)
    and ``length``, grouped by ``Annotated`` with ``Tdark`` as hue, annotates
    the group pairs with Mann-Whitney tests and saves the figure to
    ``fig_out_file_name``.

    ``cor_df`` is modified in place: it is re-indexed by its ``Gene`` column
    (the column is kept) and then handed to
    ``result_analyzer._attach_devlevel_annotated``.
    NOTE(review): that helper presumably adds the ``Annotated`` and ``Tdark``
    columns used below -- confirm against its implementation.
    """
    cor_df.set_index('Gene', drop=False, inplace=True)
    result_analyzer._attach_devlevel_annotated(cor_df)

    # (column to plot, y-axis label); 'Peason_P_Value' is the column's actual spelling
    panels = (('Pearson_Cor', 'Pearson Correlation'),
              ('Peason_P_Value', '-Log10(pValue)'),
              ('length', 'Number of Pathways'))
    # Group pairs to test; with x='Annotated' and hue='Tdark' each group is
    # written as (Annotated value, Tdark value)
    group_pairs = (((True, False), (True, True)),
                   ((False, False), (False, True)),
                   ((False, False), (True, False)),
                   ((False, True), (True, True)))

    fig, axes = set_up_subplots(nrow=1, ncols=len(panels))
    plt.subplots_adjust(left=0.035)
    for panel_ax, (col_name, axis_label) in zip(axes, panels):
        print('Plot: ' + col_name)
        # P-value columns are plotted as -log10; the others are passed by name
        if col_name.endswith('P_Value'):
            y = -np.log10(cor_df[col_name])
        else:
            y = col_name
        box_ax = sns.boxplot(data=cor_df,
                             x='Annotated',
                             y=y,
                             hue='Tdark',
                             ax=panel_ax)
        box_ax.set_ylabel(axis_label)
        annotator = Annotator(ax=panel_ax,
                              pairs=list(group_pairs),
                              data=cor_df,
                              x='Annotated',
                              y=y,
                              hue='Tdark')
        annotator.configure(test='Mann-Whitney', verbose=True)
        annotator.apply_and_annotate()
    fig.savefig(fig_out_file_name)

In [None]:
# Plot enrichment score: distribution panels for the FDR-based correlations
fig_file_name = f"{dir_name}/df_cor_fdr_bigscale_113022_dist.pdf"
plot_correlation_distribution(cor_df=df_cor_fdr_bigscale,
                              out_file_name=fig_file_name)

In [None]:
# Group comparison panels for the FDR-based correlations
fig_out_file_name = f'{dir_name}/df_cor_fdr_bigscale_113022_comparison.pdf'
plot_correlation_comparison(cor_df=df_cor_fdr_bigscale,
                            fig_out_file_name=fig_out_file_name)

In [None]:
# Plot for average_activation: distribution panels first, then the group comparison
fig_out_file_name = dir_name + '/df_cor_average_activation_bigscale_113022_dist.pdf'
plot_correlation_distribution(df_cor_average_activation_bigscale, fig_out_file_name)
# The suffix is spelled "comparison" (was "comparson") so the output follows the
# same naming as the FDR comparison figure.
fig_out_file_name = dir_name + '/df_cor_average_activation_bigscale_113022_comparison.pdf'
plot_correlation_comparison(df_cor_average_activation_bigscale, fig_out_file_name)

In [None]:
# Plot for average_inhibition: distribution panels first, then the group comparison
fig_out_file_name = dir_name + '/df_cor_average_inhibition_bigscale_113022_dist.pdf'
plot_correlation_distribution(df_cor_average_inhibition_bigscale, fig_out_file_name)
# The suffix is spelled "comparison" (was "comparson") so the output follows the
# same naming as the FDR comparison figure.
fig_out_file_name = dir_name + '/df_cor_average_inhibition_bigscale_113022_comparison.pdf'
plot_correlation_comparison(df_cor_average_inhibition_bigscale, fig_out_file_name)

In [None]:
# Plot the similar results for NLP
# Load the dataframe
def load_nlp_cor_df(file_name):
    """Load a tab-delimited NLP correlation file and add plotting aliases.

    Three columns are copied under the names the plotting functions read:
    ``Impacted_Pathways`` -> ``length``, ``Pearson_PValue`` ->
    ``Peason_P_Value`` (the misspelling is intentional, it is the name used
    by the plots) and ``Pearson`` -> ``Pearson_Cor``.

    Returns the loaded DataFrame with the three extra columns.
    """
    # alias -> source column, added in this order
    column_aliases = {
        'length': 'Impacted_Pathways',
        'Peason_P_Value': 'Pearson_PValue',
        'Pearson_Cor': 'Pearson',
    }
    nlp_cor_df = pd.read_csv(file_name, sep='\t')
    for alias, source_col in column_aliases.items():
        nlp_cor_df[alias] = nlp_cor_df[source_col]
    return nlp_cor_df
    
# FDR-based NLP correlations: the input file and both figures share one path prefix
nlp_fdr_prefix = f'{dir_name}/../nlp_files/FDR_impact_pubmed_score_cor_04272022_0'
file_name = nlp_fdr_prefix + '.txt'
df_cor_nlp_fdr = load_nlp_cor_df(file_name)
fig_out_file_name = nlp_fdr_prefix + '_dist.pdf'
plot_correlation_distribution(df_cor_nlp_fdr, fig_out_file_name)
fig_out_file_name = nlp_fdr_prefix + '_comparison.pdf'
plot_correlation_comparison(df_cor_nlp_fdr, fig_out_file_name)

In [None]:
# Analyze the average activation for nlp; the input file and both figures
# share one path prefix
nlp_activation_prefix = f'{dir_name}/../nlp_files/Average_Activation_impact_pubmed_score_cor_04272022_0'
file_name = nlp_activation_prefix + '.txt'
df_cor_nlp_average_activation = load_nlp_cor_df(file_name)
fig_out_file_name = nlp_activation_prefix + '_dist.pdf'
plot_correlation_distribution(df_cor_nlp_average_activation, fig_out_file_name)
fig_out_file_name = nlp_activation_prefix + '_comparison.pdf'
plot_correlation_comparison(df_cor_nlp_average_activation, fig_out_file_name)

In [None]:
# Analyze the average inhibition for nlp; the input file and both figures
# share one path prefix
nlp_inhibition_prefix = f'{dir_name}/../nlp_files/Average_Inhibition_impact_pubmed_score_cor_04272022_0'
file_name = nlp_inhibition_prefix + '.txt'
df_cor_nlp_average_inhibition = load_nlp_cor_df(file_name)
fig_out_file_name = nlp_inhibition_prefix + '_dist.pdf'
plot_correlation_distribution(df_cor_nlp_average_inhibition, fig_out_file_name)
fig_out_file_name = nlp_inhibition_prefix + '_comparison.pdf'
plot_correlation_comparison(df_cor_nlp_average_inhibition, fig_out_file_name)

In [None]:
# Plot out the results using plotly into files.
# The following code can be run without running the above cells
import importlib
import pandas as pd
import TopicModelingResultAnalyzer as analyzer
# Reload the analyzer module before using it
importlib.reload(analyzer)
dir_name = '/Volumes/ssd/results/reactome-idg/fi-network-ml/impact_analysis/scRNASeq'
color_col_names = ['Tdark', 'Annotated']
# file_names = ['df_cor_fdr_bigscale_082322.csv', 'df_cor_average_activation_bigscale_082322.csv', 'df_cor_average_inhibition_bigscale_082322.csv']
file_names = [
    'df_cor_fdr_bigscale_113022.csv',
    'df_cor_average_activation_bigscale_113022.csv',
    'df_cor_average_inhibition_bigscale_110322.csv',
]
# Run the batch plot once for every (coloring column, result file) combination
for color_col_name in color_col_names:
    print(f'color_col_name: {color_col_name}')
    for file_name in file_names:
        print(f'\tfile_name: {file_name}')
        result_path = f'{dir_name}/{file_name}'
        analyzer.plot_cor_batch_results(file_name=result_path,
                                        sep=',',
                                        color_col_name=color_col_name,
                                        need_violin_plot=True,
                                        mannwhitneyu_test=True)