In [1]:
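# One-time JupyterLab setup so the ipywidgets controls below can render.
# Run these in a terminal, not in the notebook:
# conda install -c conda-forge nodejs
# jupyter labextension install @jupyter-widgets/jupyterlab-manager
# jupyter labextension list   (lists installed extensions to confirm the install)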

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt
from icecream import ic
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display



In [3]:
# Holds the table produced by the most recent "Generate List" click in
# create_volcano_plot; it stays None until a button has been pressed.
# (A `global` statement at module level is a no-op, so the name is simply bound here.)
significant_proteins_df = None

In [4]:
def merge_data(df, ttest):
    """Join a t-test result table onto an intensity table.

    Rows are matched on the 'Protein.Group' column with pandas' default
    (inner) join, so only proteins present in both frames are returned.
    """
    return df.merge(ttest, on='Protein.Group')

In [5]:
def ttest_clustermap(df, path, title):
    """Draw and save a row-z-scored clustermap of the proteins with pval < 0.01.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Protein.Group' and 'pval'. After removing any volcano
        helper columns, the last two columns are dropped by position
        (presumably 'pval' and 'log2fc' - verify against the t-test CSVs) and
        every remaining column except 'Protein.Group' is treated as numeric
        intensity data.
    path : str
        Output prefix; it is concatenated as-is, so it must already end with a
        path separator.
    title : str
        Used in the output file name ``{path}cluster_{title}.pdf``.

    Returns
    -------
    None
        The figure is written to disk and shown. Nothing is drawn when fewer
        than two proteins pass the p-value filter (clustering needs two rows).
    """
    # Helper columns that the volcano-plot code adds to its working frame.
    # Remove them if present so the positional slice below still strips the
    # two t-test columns and not the helpers.
    helper_columns = ['-log10(p_value)', 'color', 'is_significant', 'labels']
    significant = df.loc[df['pval'] < 0.01].drop(columns=helper_columns, errors='ignore')

    # Drop the two trailing t-test columns and index by protein.
    intensities = significant.iloc[:, :-2].set_index('Protein.Group')

    if len(intensities) < 2:
        print(f'{title}: fewer than two proteins with pval < 0.01; clustermap skipped')
        return

    # Z-score each protein (row) across samples on the numeric matrix itself.
    # This avoids assigning into a filtered slice and the row-wise
    # DataFrame.apply, which does not reliably return a frame for ndarray results.
    scaled = pd.DataFrame(
        zscore(intensities.to_numpy(dtype=float), axis=1),
        index=intensities.index,
        columns=intensities.columns,
    )

    # 0.3 inch per protein keeps the y-axis labels legible; the floor stops
    # matplotlib from being asked for a near-zero-height figure.
    height = max(4, len(scaled.index) * 0.3)
    width = 10

    g = sns.clustermap(scaled, figsize=(width, height), method='average', metric='euclidean', cmap='viridis')

    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=-45, fontsize=6)
    g.savefig(f'{path}cluster_{title}.pdf')
    plt.show()

In [6]:
def create_volcano_plot(df, title):
    """Show an interactive volcano plot with threshold controls and a hit-list button.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Protein.Group', 'pval' and 'log2fc'.
        The frame is copied; the caller's object is never modified.
    title : str
        Figure title.

    Returns
    -------
    None
        The controls, the plot and the hit-list area are displayed as widgets.

    Side effects
    ------------
    Clicking "Generate List" rebinds the module-level ``significant_proteins_df``
    to a frame with the columns 'Proteins', 'Upregulated' and 'Downregulated'
    for the current slider values. Every plot shares that one global, so it
    always reflects the last button pressed.
    """
    # Private working copy: the helper columns added below must not leak into
    # the caller's frame (other code slices its columns by position).
    plot_df = df.copy()
    # Computed once up front so the hit list never depends on volcano() having run.
    plot_df['-log10(p_value)'] = -np.log10(plot_df['pval'])

    def passes_thresholds(significance_level, fold_change):
        """Boolean mask of rows at or beyond both the p-value and fold-change cut-offs."""
        significant = plot_df['-log10(p_value)'] >= -np.log10(significance_level)
        high_fold_change = plot_df['log2fc'].abs() >= fold_change
        return significant & high_fold_change

    def volcano(significance_level=0.05, fold_change=1.0, show_labels=False):
        """Build the plotly figure for the given cut-offs."""
        plot_df['is_significant'] = passes_thresholds(significance_level, fold_change)
        # Hits are red, everything else blue.
        plot_df['color'] = np.where(plot_df['is_significant'], 'red', 'blue')
        # Only hits get a text label, and only when labels are switched on.
        plot_df['labels'] = (
            np.where(plot_df['is_significant'], plot_df['Protein.Group'], None)
            if show_labels else None
        )

        fig = px.scatter(plot_df, x='log2fc', y='-log10(p_value)', hover_name='Protein.Group', color='color',
                         title=title,
                         labels={'log2fc': 'Log2 Fold Change', '-log10(p_value)': '-log10(p-value)'},
                         color_discrete_map={'red': 'red', 'blue': 'blue'},
                         text='labels')

        # Horizontal line at the significance cut-off.
        fig.add_hline(y=-np.log10(significance_level), line_dash="dash", line_color="red")

        # Vertical lines at +/- the fold-change cut-off.
        fig.add_vline(x=fold_change, line_dash="dash", line_color="blue")
        fig.add_vline(x=-fold_change, line_dash="dash", line_color="blue")

        fig.update_layout(height=600, width=800)
        fig.update_traces(textposition='top center')
        return fig

    def generate_significant_proteins_list():
        """Tabulate the proteins passing the current slider cut-offs and their direction."""
        hits = plot_df.loc[passes_thresholds(sig_slider.value, fc_slider.value)]
        # Build a fresh frame instead of adding columns to a filtered slice.
        return pd.DataFrame({
            'Proteins': hits['Protein.Group'],
            'Upregulated': hits['log2fc'] > 0,
            'Downregulated': hits['log2fc'] < 0,
        })

    def update_plot(change):
        """Redraw the figure whenever a slider or the label toggle changes."""
        with output:
            output.clear_output(wait=True)
            display(volcano(sig_slider.value, fc_slider.value, label_button.value))

    def on_button_clicked(b):
        """Publish the current hit list to the global and show it under the plot."""
        global significant_proteins_df
        significant_proteins_df = generate_significant_proteins_list()
        with output_df:
            output_df.clear_output(wait=True)
            display(significant_proteins_df)

    # Sliders and Buttons
    # step=0.001 keeps the default 0.05 (and 0.01) on the slider grid that starts
    # at 0.001; the three-decimal readout shows the value that is actually applied.
    sig_slider = widgets.FloatSlider(value=0.05, min=0.001, max=0.1, step=0.001, readout_format='.3f',
                                     description='Significance Level:', continuous_update=False)
    fc_slider = widgets.FloatSlider(value=1.0, min=0.1, max=5.0, step=0.1, description='Fold Change:', continuous_update=False)
    label_button = widgets.ToggleButton(value=False, description='Show Labels', icon='eye')
    generate_button = widgets.Button(description="Generate List")

    # Output Widgets
    output = widgets.Output()
    output_df = widgets.Output()

    # Observers
    sig_slider.observe(update_plot, names='value')
    fc_slider.observe(update_plot, names='value')
    label_button.observe(update_plot, names='value')
    generate_button.on_click(on_button_clicked)

    # Display Widgets
    with output:
        display(volcano(sig_slider.value, fc_slider.value, label_button.value))

    control_widgets = widgets.VBox([sig_slider, fc_slider, label_button, generate_button])
    display(control_widgets, output, output_df)




In [7]:
def reduce_df_size(df, top_n):
    """Return the ``top_n`` rows of ``df`` with the smallest 'pval' values.

    The rows come back ordered by ascending p-value and keep their original
    index labels; ``df`` itself is not modified.
    """
    ranked_by_pval = df.sort_values('pval')
    return ranked_by_pval.head(top_n)

In [8]:
# Directory prefix for every CSV read below (imputed intensities and t-test tables).
path = "G:/My Drive/Data/data/eIF4F optimization/imputed intensities 8hr/"

# Imputed intensity tables, one per channel.
light, total, nsp = (
    pd.read_csv(f'{path}{channel}.csv', sep=',')
    for channel in ('light', 'total', 'nsp')
)

# t-test tables: one file per comparison prefix (e, g1, g2, g3) and channel.
e_total, g1_total, g2_total, g3_total = (
    pd.read_csv(f'{path}{prefix}_total.csv', sep=',')
    for prefix in ('e', 'g1', 'g2', 'g3')
)

e_light, g1_light, g2_light, g3_light = (
    pd.read_csv(f'{path}{prefix}_light.csv', sep=',')
    for prefix in ('e', 'g1', 'g2', 'g3')
)

e_nsp, g1_nsp, g2_nsp, g3_nsp = (
    pd.read_csv(f'{path}{prefix}_nsp.csv', sep=',')
    for prefix in ('e', 'g1', 'g2', 'g3')
)

In [9]:
# Join each t-test table onto the four matching intensity columns of its channel.
# NOTE: the t-test names are rebound to the merged frames, so re-running this
# cell without re-running the load cell would merge already-merged frames.
sample_columns = {
    factor: ['Protein.Group', f'{factor}- 8h 1', f'{factor}- 8h 2', f'{factor}+ 8h 1', f'{factor}+ 8h 2']
    for factor in ('eIF4E', 'eIF4G1', 'eIF4G2', 'eIF4G3')
}

# total channel
e_total = merge_data(total[sample_columns['eIF4E']], e_total)
g1_total = merge_data(total[sample_columns['eIF4G1']], g1_total)
g2_total = merge_data(total[sample_columns['eIF4G2']], g2_total)
g3_total = merge_data(total[sample_columns['eIF4G3']], g3_total)

# light channel
e_light = merge_data(light[sample_columns['eIF4E']], e_light)
g1_light = merge_data(light[sample_columns['eIF4G1']], g1_light)
g2_light = merge_data(light[sample_columns['eIF4G2']], g2_light)
g3_light = merge_data(light[sample_columns['eIF4G3']], g3_light)

# nsp channel
e_nsp = merge_data(nsp[sample_columns['eIF4E']], e_nsp)
g1_nsp = merge_data(nsp[sample_columns['eIF4G1']], g1_nsp)
g2_nsp = merge_data(nsp[sample_columns['eIF4G2']], g2_nsp)
g3_nsp = merge_data(nsp[sample_columns['eIF4G3']], g3_nsp)


In [10]:
# Shrink the light and total frames to their 1000 lowest-p-value rows before
# they are handed to the plotting function.
# NOTE(review): the nsp frames are not trimmed here - confirm that is intentional.
e_light, g1_light, g2_light, g3_light = (
    reduce_df_size(frame, 1000)
    for frame in (e_light, g1_light, g2_light, g3_light)
)

e_total, g1_total, g2_total, g3_total = (
    reduce_df_size(frame, 1000)
    for frame in (e_total, g1_total, g2_total, g3_total)
)

In [11]:
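# Clustermaps (currently disabled): uncomment a call to draw the heatmap for that
# table and save it as {path}cluster_<title>.pdf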
#ttest_clustermap(e_nsp, path, 'e_nsp')
#ttest_clustermap(g1_nsp, path, 'g1_nsp')
#ttest_clustermap(g2_nsp, path, 'g2_nsp')
#ttest_clustermap(g3_nsp, path, 'g3_nsp')

#ttest_clustermap(e_light, path, 'e_light')
#ttest_clustermap(g1_light, path, 'g1_light')
#ttest_clustermap(g2_light, path, 'g2_light')
#ttest_clustermap(g3_light, path, 'g3_light')

#ttest_clustermap(e_total, path, 'e_total')
#ttest_clustermap(g1_total, path, 'g1_total')
#ttest_clustermap(g2_total, path, 'g2_total')
#ttest_clustermap(g3_total, path, 'g3_total')

In [12]:
# Interactive volcano plot for e_light (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(e_light, 'Light E')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [13]:
# Interactive volcano plot for g1_light (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g1_light, 'Light G1')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [14]:
# Interactive volcano plot for g2_light (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g2_light, 'Light G2')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [15]:
# Interactive volcano plot for g3_light (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g3_light, 'Light G3')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [16]:
# Interactive volcano plot for e_total (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(e_total, 'Total E')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [17]:
# Interactive volcano plot for g1_total (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g1_total, 'Total G1')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [18]:
# Interactive volcano plot for g2_total (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g2_total, 'Total G2')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [19]:
# Interactive volcano plot for g3_total (trimmed earlier to its 1000 lowest-p-value rows).
create_volcano_plot(g3_total, 'Total G3')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [20]:
# Interactive volcano plot for e_nsp (not passed through reduce_df_size, so every merged row is plotted).
create_volcano_plot(e_nsp, 'NSP E')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [21]:
# Interactive volcano plot for g1_nsp (not passed through reduce_df_size, so every merged row is plotted).
create_volcano_plot(g1_nsp, 'NSP G1')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [22]:
# Interactive volcano plot for g2_nsp (not passed through reduce_df_size, so every merged row is plotted).
create_volcano_plot(g2_nsp, 'NSP G2')

VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()

In [23]:
# Interactive volcano plot for g3_nsp (not passed through reduce_df_size, so every merged row is plotted).
create_volcano_plot(g3_nsp, 'NSP G3')


VBox(children=(FloatSlider(value=0.05, continuous_update=False, description='Significance Level:', max=0.1, mi…

Output()

Output()