In [1]:
#importing necessary python packages
import pandas as pd
import numpy as np
import os

In [2]:
# Switch the working directory so the relative file names used below
# (input workbook, saved figures) resolve against this folder.
# NOTE(review): this is a machine-specific absolute Windows path; the notebook
# will fail on any other machine. Consider a configurable data directory.
os.chdir('C:\\Users\\U0033207PC\\Desktop\\GENE Enrichment\\FINAL VISUALISATION\\SUPPLENMENTARY')

In [3]:
# Load the feature table from the workbook; the first spreadsheet column
# becomes the row index of the resulting DataFrame.
dt = pd.read_excel(
    io='FEATURES_MEAN_PATHWAYS.xlsx',
    index_col=0,
)

In [4]:
# Preview the first two rows to sanity-check the loaded table.
dt.head(n=2)

Unnamed: 0_level_0,Fatty acid activation (cytosolic),Fatty acid oxidation,Folate metabolism,Glycolysis / Gluconeogenesis
HUGO_SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PAT1,249.678953,203.645436,174.799119,975.407107
PAT2,159.84334,192.578976,162.708229,596.051379


In [5]:
# Standardize every column of `dt` before PCA; the result `dt1` is a plain
# NumPy array (row/column labels of `dt` are not carried over).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
dt1 = scaler.fit_transform(dt)

In [6]:
# Re-attach the original column names to the scaled array and preview it.
scaled_preview = pd.DataFrame(data=dt1, columns=dt.columns)
scaled_preview.head()

Unnamed: 0,Fatty acid activation (cytosolic),Fatty acid oxidation,Folate metabolism,Glycolysis / Gluconeogenesis
0,0.916753,-0.417542,0.65475,0.916961
1,-1.023971,-0.777515,0.350597,-0.806307
2,0.666127,-1.230996,0.274819,0.08706
3,-0.248295,-0.61274,0.616088,0.240792
4,-0.31795,0.7857,0.07132,-0.623378


In [7]:
# Fit a PCA that keeps all components on the standardized data.
from sklearn.decomposition import PCA

pca_out = PCA()
pca_out.fit(dt1)

In [8]:
# Proportion of the total variance explained by each principal component
# (one entry per fitted component, ordered from PC1 onwards).
pca_out.explained_variance_ratio_

array([0.41650881, 0.3078531 , 0.16090503, 0.11473307])

In [9]:
# Running total of the explained-variance proportions (PC1, PC1+PC2, ...).
pca_out.explained_variance_ratio_.cumsum()

array([0.41650881, 0.72436191, 0.88526693, 1.        ])

In [10]:
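# get component loadings: each row of `components_` is a principal axis (a unit-length eigenvector),
# so the squared weights within each PC always sum to 1. To read them as correlation coefficients between
# the standardized variables and a component, scale each row by the square root of that component's explained variance.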

In [11]:
# Principal axes of the fitted PCA; indexed below as loadings[0] for PC1
# and loadings[1] for PC2 (one row per component).
loadings = pca_out.components_

In [12]:
# Number of principal components actually fitted. `n_components_` is used
# instead of `n_features_`: the latter counts input features (which can exceed
# the number of components when there are fewer samples than features) and has
# been deprecated and removed in recent scikit-learn releases.
num_pc = pca_out.n_components_

In [13]:
# Label the components PC1..PCn, then build a variables-by-components table
# whose index (named 'variable') holds the original column names.
pc_list = [f"PC{k}" for k in range(1, num_pc + 1)]
loadings_df = (
    pd.DataFrame(dict(zip(pc_list, loadings)))
    .assign(variable=dt.columns.values)
    .set_index('variable')
)
loadings_df

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fatty acid activation (cytosolic),0.650045,0.013443,-0.408748,0.640458
Fatty acid oxidation,0.146812,-0.7658,0.579535,0.23693
Folate metabolism,0.65189,-0.183041,-0.109509,-0.727697
Glycolysis / Gluconeogenesis,0.361838,0.616332,0.696471,0.064305


In [14]:
# positive and negative values in the component loadings reflect the positive and negative correlation of the variables
# with the PCs.

In [15]:
# Annotated heatmap of the PCA loadings (variables x components), saved as PNG.
import seaborn as sns
import matplotlib.pyplot as plt

# Explicit figure/axes so the heatmap never lands on a leftover current axes.
fig, ax = plt.subplots()
sns.heatmap(loadings_df, annot=True, cmap='Spectral', ax=ax)
# bbox_inches='tight' keeps the long pathway names on the y-axis from being
# clipped in the saved file (savefig does not expand the canvas by default).
fig.savefig("Correlation of Flux Pathway Analysis.png",
            format='png',
            dpi=300,
            bbox_inches='tight')

In [16]:
# `corr` is an alias of the loadings table (same object, not a copy).
corr = loadings_df
# Colour each column with a coolwarm gradient and show two decimals.
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0;
# `Styler.format(precision=...)` is the supported replacement.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fatty acid activation (cytosolic),0.65,0.01,-0.41,0.64
Fatty acid oxidation,0.15,-0.77,0.58,0.24
Folate metabolism,0.65,-0.18,-0.11,-0.73
Glycolysis / Gluconeogenesis,0.36,0.62,0.7,0.06


In [17]:
# Same table, but with axis=None the colour scale is shared across the whole
# table instead of being computed per column.
corr.style.background_gradient(axis=None, cmap='coolwarm')

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fatty acid activation (cytosolic),0.650045,0.013443,-0.408748,0.640458
Fatty acid oxidation,0.146812,-0.7658,0.579535,0.23693
Folate metabolism,0.65189,-0.183041,-0.109509,-0.727697
Glycolysis / Gluconeogenesis,0.361838,0.616332,0.696471,0.064305


In [18]:
# Eigenvalues: the variance explained by each principal component
# (one entry per fitted component, ordered from PC1 onwards).
pca_out.explained_variance_

array([1.669264  , 1.23379884, 0.64486743, 0.45982167])

In [19]:
# Scree plot (for the scree / elbow test) of the variance explained per PC.
# The plot is saved in the working directory under the name screeplot.png.
from bioinfokit.visuz import cluster

scree_input = [pc_list, pca_out.explained_variance_ratio_]
cluster.screeplot(obj=scree_input)

In [20]:
# 2D PCA loadings plot: each variable's weight on PC1 against its weight on
# PC2, with the axes annotated by the percentage of variance explained.
# (Only the 2D plot is produced in this cell.)
pc1_pct = round(pca_out.explained_variance_ratio_[0]*100, 2)
pc2_pct = round(pca_out.explained_variance_ratio_[1]*100, 2)

cluster.pcaplot(
    x=loadings[0],
    y=loadings[1],
    labels=dt.columns.values,
    var1=pc1_pct,
    var2=pc2_pct,
)

In [21]:
# PC scores: project the standardized data onto the components of the PCA
# that was already fitted. Reusing `pca_out` (rather than fitting a second,
# independent PCA) guarantees the scores belong to the same model as
# `loadings` and the explained-variance figures, and avoids a redundant fit.
pca_scores = pca_out.transform(dt1)

In [22]:
# 2D biplot: sample scores overlaid with the variable loadings, with the axes
# annotated by the percentage of variance explained by PC1 and PC2.
pc1_pct = round(pca_out.explained_variance_ratio_[0]*100, 2)
pc2_pct = round(pca_out.explained_variance_ratio_[1]*100, 2)

cluster.biplot(
    cscore=pca_scores,
    loadings=loadings,
    labels=dt.columns.values,
    var1=pc1_pct,
    var2=pc2_pct,
)