## RNA-seq data analysis visualisation - HeLa +/- enDR3

Visualisation of the results obtained from DESeq2 analysis


Imports

In [None]:
from pathlib import Path
import pandas as pd
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr, ttest_ind, shapiro, anderson

# Global matplotlib text settings:
#   - use the sans-serif family, with Arial as the preferred sans-serif font
#   - keep SVG text as real text (fonts) instead of converting it to curves
mpl.rcParams.update({
    'font.sans-serif': ['Arial'],
    'font.family': 'sans-serif',
    'svg.fonttype': 'none',
})

%matplotlib inline

Constants

In [None]:
# Master switch read by the `if SAVE_FIGS:` cells below: figures are written
# to save_dir_path only when this is True
SAVE_FIGS = False

Paths

In [None]:
# Input directory (read from when loading the DESeq2 results CSV) and output
# directory (written to when SAVE_FIGS is True).
# These are placeholder locations - replace them with real directories.
# Forward slashes are used on purpose: pathlib splits on '/' on every
# platform, whereas '\' is only treated as a separator on Windows (elsewhere
# the whole string would become a single path component).
data_dir_path = Path('path/to/input/data')
save_dir_path = Path('path/to/output/data')

Load in RNA-seq results from csv file - data generated using DESeq2

In [None]:
# Load the DESeq2 differential-expression results table (comma-separated file)
de_results = pd.read_csv(
    data_dir_path.joinpath('HeLa_enDR3_DE_genes.csv'),
    delimiter=',',
)

In [None]:
# Preview the first five rows to check that the table loaded as expected
de_results.head(n=5)

Generate volcano plots

In [None]:
# Volcano plot inputs: effect size on x, significance on y
x_axis = de_results['log2FoldChange']

# -log10 transform the adjusted p-values (vectorised).
# An adjusted p-value of exactly 0 (p-value underflow) has no finite log and
# made math.log10 raise ValueError, so zeros are floored at the smallest
# positive normal float before the transform. Missing (NaN) values stay NaN.
y_axis = -np.log10(de_results['padj'].clip(lower=np.finfo(float).tiny))

# Red for genes with adjusted p-value < 0.05, grey otherwise.
# NaN compares as False, so genes without an adjusted p-value are grey.
de_data_colors = np.where(de_results['padj'] < 0.05, 'red', 'grey')

In [None]:
# Volcano plot: one point per row of the results table, coloured by significance
vp_fig = plt.figure(figsize=(4.25, 4))
vp_ax = vp_fig.gca()

vp_ax.scatter(x_axis, y_axis, c=de_data_colors, s=8)

# Dashed guide lines at log2 fold change of -1 / +1 and at adjusted p-value 0.05
for fold_change_cutoff in (-1, 1):
    vp_ax.axvline(x=fold_change_cutoff, color='green', linewidth=0.5, linestyle='--')
vp_ax.axhline(y=-math.log10(0.05), color='r', linewidth=0.5, linestyle='--')

vp_ax.set_xlabel('log2(fold change)', fontsize=12)
vp_ax.set_ylabel('-log10(adj p-value)', fontsize=12)
# Assign to `_` so the returned Text object is not echoed as cell output
_ = vp_ax.set_title('HeLa +/- enDR3', fontsize=12)

In [None]:
if SAVE_FIGS:
    # Export the volcano plot twice: raster (PNG) and vector (PDF)
    vp_outputs = (
        ('Vulcano_plot_all_genes.png', 'png'),
        ('Vulcano_all_genes.pdf', 'pdf'),
    )
    for vp_file_name, vp_file_format in vp_outputs:
        vp_fig.savefig(save_dir_path / vp_file_name, dpi=600, format=vp_file_format, bbox_inches='tight')

### Generate scatter plots and estimate correlation between samples

Log10 transform data

In [None]:
# log10(count + 1) for each enDR3 replicate column; the +1 pseudocount keeps
# zero counts finite after the log transform
enDR3_rep_1, enDR3_rep_2, enDR3_rep_3 = (
    np.log10(de_results[column] + 1)
    for column in ('enDR3_1', 'enDR3_2', 'enDR3_3')
)

In [None]:
# log10(count + 1) for each HeLa replicate column; the +1 pseudocount keeps
# zero counts finite after the log transform
HeLa_rep_1, HeLa_rep_2, HeLa_rep_3 = (
    np.log10(de_results[column] + 1)
    for column in ('Hela_1', 'Hela_2', 'Hela_3')
)

#### Calculate Pearson correlations

HeLa enDR3 samples

In [None]:
# Pairwise Pearson correlation between the log-transformed enDR3 replicates;
# each call yields (correlation coefficient, p-value)
(enDR3_rep_1_vs_2_corr, enDR3_rep_1_vs_2_pvalue) = pearsonr(enDR3_rep_1, enDR3_rep_2)
(enDR3_rep_1_vs_3_corr, enDR3_rep_1_vs_3_pvalue) = pearsonr(enDR3_rep_1, enDR3_rep_3)
(enDR3_rep_2_vs_3_corr, enDR3_rep_2_vs_3_pvalue) = pearsonr(enDR3_rep_2, enDR3_rep_3)

# One coefficient per line, in the order 1 vs 2, 1 vs 3, 2 vs 3
print(
    enDR3_rep_1_vs_2_corr,
    enDR3_rep_1_vs_3_corr,
    enDR3_rep_2_vs_3_corr,
    sep='\n',
)

In [None]:
# Degree-1 least-squares fit (slope m, intercept b) for each enDR3 replicate pair
enDR3_values_1, enDR3_values_2, enDR3_values_3 = (
    np.asarray(replicate) for replicate in (enDR3_rep_1, enDR3_rep_2, enDR3_rep_3)
)

m1, b1 = np.polyfit(enDR3_values_1, enDR3_values_2, deg=1)  # replicate 1 vs 2
m2, b2 = np.polyfit(enDR3_values_1, enDR3_values_3, deg=1)  # replicate 1 vs 3
m3, b3 = np.polyfit(enDR3_values_2, enDR3_values_3, deg=1)  # replicate 2 vs 3

In [None]:
# Pairwise replicate scatter plots for the enDR3 samples, laid out on a 2x2
# grid (positions 1, 3 and 4 are used; position 2 stays empty).
# Each panel shows the points, a dashed y = x reference line, the fitted
# regression line in red, and the Pearson r / p-value as text.
fig1 = plt.figure(figsize=(7, 7))
fig1.suptitle('HeLa enDR3 replicates', fontsize=12)
fig1.subplots_adjust(wspace=.2)

# --- Replicate 1 vs replicate 2 ---
ax1 = fig1.add_subplot(2, 2, 1)
ax1.scatter(enDR3_rep_1, enDR3_rep_2, s=1)
diag_end_1 = ax1.get_xlim()[1]  # the reference line runs from 0 to the current upper x-limit
ax1.plot([0, diag_end_1], [0, diag_end_1], linestyle='--', color='black', linewidth=1)
fit_1_vs_2 = m1*np.array(enDR3_rep_1) + b1
ax1.plot(enDR3_rep_1, fit_1_vs_2, color='red', linewidth=1)
ax1.set_ylabel('Replicate 2\nlog$_{10}$(Norm. counts)', fontsize=10)
label_1_vs_2 = f'r = {enDR3_rep_1_vs_2_corr:.5f}\np-value = {enDR3_rep_1_vs_2_pvalue:.2E}'
ax1.text(0, 4.2, label_1_vs_2, fontsize=9)

# --- Replicate 1 vs replicate 3 ---
ax2 = fig1.add_subplot(2, 2, 3)
ax2.scatter(enDR3_rep_1, enDR3_rep_3, s=1)
diag_end_2 = ax2.get_xlim()[1]
ax2.plot([0, diag_end_2], [0, diag_end_2], linestyle='--', color='black', linewidth=1)
fit_1_vs_3 = m2*np.array(enDR3_rep_1) + b2
ax2.plot(enDR3_rep_1, fit_1_vs_3, color='red', linewidth=1)
ax2.set_ylabel('Replicate 3\nlog$_{10}$(Norm. counts)', fontsize=10)
ax2.set_xlabel('Replicate 1\nlog$_{10}$(Norm. counts)', fontsize=10)
label_1_vs_3 = f'r = {enDR3_rep_1_vs_3_corr:.5f}\np-value = {enDR3_rep_1_vs_3_pvalue:.2E}'
ax2.text(0, 4.2, label_1_vs_3, fontsize=9)

# --- Replicate 2 vs replicate 3 ---
ax3 = fig1.add_subplot(2, 2, 4)
ax3.scatter(enDR3_rep_2, enDR3_rep_3, s=1)
diag_end_3 = ax3.get_xlim()[1]
ax3.plot([0, diag_end_3], [0, diag_end_3], linestyle='--', color='black', linewidth=1)
fit_2_vs_3 = m3*np.array(enDR3_rep_2) + b3
ax3.plot(enDR3_rep_2, fit_2_vs_3, color='red', linewidth=1)
ax3.set_xlabel('Replicate 2\nlog$_{10}$(Norm. counts)', fontsize=10)
label_2_vs_3 = f'r = {enDR3_rep_2_vs_3_corr:.5f}\np-value = {enDR3_rep_2_vs_3_pvalue:.2E}'
# Assign to `_` so the returned Text object is not echoed as cell output
_ = ax3.text(0, 4.2, label_2_vs_3, fontsize=9)

fig1.tight_layout()

In [None]:
if SAVE_FIGS:
    # Export the enDR3 replicate scatter figure as JPG (raster) and PDF (vector)
    for fig1_file_name, fig1_file_format in (
        ('HeLa_enDR3_corr_scatters.jpg', 'jpg'),
        ('HeLa_enDR3_corr_scatters.pdf', 'pdf'),
    ):
        fig1.savefig(save_dir_path / fig1_file_name, dpi=600, format=fig1_file_format)

HeLa "wild-type" samples

In [None]:
# Pairwise Pearson correlation between the log-transformed HeLa replicates;
# each call yields (correlation coefficient, p-value)
(HeLa_rep_1_vs_2_corr, HeLa_rep_1_vs_2_pvalue) = pearsonr(HeLa_rep_1, HeLa_rep_2)
(HeLa_rep_1_vs_3_corr, HeLa_rep_1_vs_3_pvalue) = pearsonr(HeLa_rep_1, HeLa_rep_3)
(HeLa_rep_2_vs_3_corr, HeLa_rep_2_vs_3_pvalue) = pearsonr(HeLa_rep_2, HeLa_rep_3)

# One coefficient per line, in the order 1 vs 2, 1 vs 3, 2 vs 3
print(
    HeLa_rep_1_vs_2_corr,
    HeLa_rep_1_vs_3_corr,
    HeLa_rep_2_vs_3_corr,
    sep='\n',
)

In [None]:
# Degree-1 least-squares fit (slope WT_m, intercept WT_b) for each HeLa replicate pair
HeLa_values_1, HeLa_values_2, HeLa_values_3 = (
    np.asarray(replicate) for replicate in (HeLa_rep_1, HeLa_rep_2, HeLa_rep_3)
)

WT_m1, WT_b1 = np.polyfit(HeLa_values_1, HeLa_values_2, deg=1)  # replicate 1 vs 2
WT_m2, WT_b2 = np.polyfit(HeLa_values_1, HeLa_values_3, deg=1)  # replicate 1 vs 3
WT_m3, WT_b3 = np.polyfit(HeLa_values_2, HeLa_values_3, deg=1)  # replicate 2 vs 3

In [None]:
# Pairwise replicate scatter plots for the wild-type HeLa samples, laid out on
# a 2x2 grid (positions 1, 3 and 4 are used; position 2 stays empty).
# Each panel shows the points, a dashed y = x reference line, the fitted
# regression line in red, and the Pearson r / p-value as text.
fig2 = plt.figure(figsize=(7, 7))
fig2.suptitle('Wild-type HeLa replicates', fontsize=12)
fig2.subplots_adjust(wspace=.2)

# --- Replicate 1 vs replicate 2 ---
ax1 = fig2.add_subplot(2, 2, 1)
ax1.scatter(HeLa_rep_1, HeLa_rep_2, s=1)
WT_diag_end_1 = ax1.get_xlim()[1]  # the reference line runs from 0 to the current upper x-limit
ax1.plot([0, WT_diag_end_1], [0, WT_diag_end_1], linestyle='--', color='black', linewidth=1)
WT_fit_1_vs_2 = WT_m1*np.array(HeLa_rep_1) + WT_b1
ax1.plot(HeLa_rep_1, WT_fit_1_vs_2, color='red', linewidth=1)
ax1.set_ylabel('Replicate 2\nlog$_{10}$(Norm. counts)', fontsize=12)
WT_label_1_vs_2 = f'r = {HeLa_rep_1_vs_2_corr:.5f}\np-value = {HeLa_rep_1_vs_2_pvalue:.2E}'
ax1.text(0, 4.2, WT_label_1_vs_2, fontsize=9)

# --- Replicate 1 vs replicate 3 ---
ax2 = fig2.add_subplot(2, 2, 3)
ax2.scatter(HeLa_rep_1, HeLa_rep_3, s=1)
WT_diag_end_2 = ax2.get_xlim()[1]
ax2.plot([0, WT_diag_end_2], [0, WT_diag_end_2], linestyle='--', color='black', linewidth=1)
WT_fit_1_vs_3 = WT_m2*np.array(HeLa_rep_1) + WT_b2
ax2.plot(HeLa_rep_1, WT_fit_1_vs_3, color='red', linewidth=1)
ax2.set_ylabel('Replicate 3\nlog$_{10}$(Norm. counts)', fontsize=12)
ax2.set_xlabel('Replicate 1\nlog$_{10}$(Norm. counts)', fontsize=12)
WT_label_1_vs_3 = f'r = {HeLa_rep_1_vs_3_corr:.5f}\np-value = {HeLa_rep_1_vs_3_pvalue:.2E}'
ax2.text(0, 4.2, WT_label_1_vs_3, fontsize=9)

# --- Replicate 2 vs replicate 3 ---
ax3 = fig2.add_subplot(2, 2, 4)
ax3.scatter(HeLa_rep_2, HeLa_rep_3, s=1)
WT_diag_end_3 = ax3.get_xlim()[1]
ax3.plot([0, WT_diag_end_3], [0, WT_diag_end_3], linestyle='--', color='black', linewidth=1)
WT_fit_2_vs_3 = WT_m3*np.array(HeLa_rep_2) + WT_b3
ax3.plot(HeLa_rep_2, WT_fit_2_vs_3, color='red', linewidth=1)
ax3.set_xlabel('Replicate 2\nlog$_{10}$(Norm. counts)', fontsize=12)
WT_label_2_vs_3 = f'r = {HeLa_rep_2_vs_3_corr:.5f}\np-value = {HeLa_rep_2_vs_3_pvalue:.2E}'
# Assign to `_` so the returned Text object is not echoed as cell output
_ = ax3.text(0, 4.2, WT_label_2_vs_3, fontsize=9)

fig2.tight_layout()

In [None]:
if SAVE_FIGS:
    # Export the wild-type HeLa replicate scatter figure as JPG (raster) and PDF (vector)
    for fig2_file_name, fig2_file_format in (
        ('HeLa_WT_corr_scatters.jpg', 'jpg'),
        ('HeLa_WT_corr_scatters.pdf', 'pdf'),
    ):
        fig2.savefig(save_dir_path / fig2_file_name, dpi=600, format=fig2_file_format)

### Generate MA plots

In [None]:
# MA plot inputs: log10 mean expression on x, log2 fold change on y.

# Log10 transform baseMean expression (vectorised).
# math.log10 raised ValueError on a baseMean of exactly 0; np.log10 returns
# -inf for such rows instead, and the divide-by-zero warning this would
# trigger is silenced because the -inf is intended here.
with np.errstate(divide='ignore'):
    ma_x_axis_all = np.log10(de_results['baseMean'])

# Fold change is already log2 transformed
ma_y_axis_all = de_results['log2FoldChange']

# Red for genes with adjusted p-value < 0.05, grey otherwise.
# NaN compares as False, so genes without an adjusted p-value are grey.
ma_all_de_data_colors = np.where(de_results['padj'] < 0.05, 'red', 'grey')

In [None]:
# MA plot: log10 mean expression (x) against log2 fold change (y), coloured by significance
ma_fig = plt.figure(figsize=(4.25, 4))
ax2 = ma_fig.add_subplot(1, 1, 1)

ax2.scatter(ma_x_axis_all, ma_y_axis_all, c=ma_all_de_data_colors, s=6)

# Solid red line at no change (log2 fold change = 0)
ax2.axhline(y=0, color='r', linewidth=1)

# Dotted green guides at a two-fold change in either direction
two_fold_log2 = math.log2(2)
for fold_change_guide in (two_fold_log2, -two_fold_log2):
    ax2.axhline(y=fold_change_guide, color='green', linestyle=':', linewidth=1)

# Dotted blue guide at a mean expression of 10 (log10 scale)
ax2.axvline(x=math.log10(10), color='blue', linestyle=':', linewidth=1)

ax2.set_xlabel('log10(Mean expression)', fontsize=12)
ax2.set_ylabel('log2(Fold change)', fontsize=12)
# Assign to `_` so the returned Text object is not echoed as cell output
_ = ax2.set_title('HeLa +/- enDR3', fontsize=12)

In [None]:
if SAVE_FIGS:
    # Export the MA plot twice: raster (PNG) and vector (PDF)
    for ma_file_name, ma_file_format in (
        ('MA_plot_all_genes.png', 'png'),
        ('MA_plot_all_genes.pdf', 'pdf'),
    ):
        ma_fig.savefig(save_dir_path / ma_file_name, dpi=600, format=ma_file_format, bbox_inches='tight')