In [None]:
# Script for reducing dimension using PCA
# Programmer: Abdullah Al Mamun
# Date: March 18, 2019
# Ref: https://medium.com/@kyasar.mail/pca-principal-component-analysis-729068e28ec8 
# sklearn has built-in iris dataset
# load and store it as numpy array for further matrix calculations in PCA
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
R = np.array(iris.data)

# Compute the covariance matrix using y-axes only
R_cov = np.cov(R, rowvar=False)


# Print the cov mat
# print(R_cov)

# Compute the eigenvalues and eigenvectors of a square array using numpy linear algebra lib
eig_values, eig_vectors = np.linalg.eig(R_cov)

# print values and vectors
# print(eig_values)
# print(eig_vectors)

# pick 2 eigenvectors whose eigenvalues are highest
featureVector = eig_vectors[:,:2]
# print(featureVector)

featureVector_t = np.transpose(featureVector)
R_t = np.transpose(R)

# Transpose to adjust new features
newDB_t = np.matmul(featureVector_t, R_t)
newDB = np.transpose(newDB_t)
newDB.shape

# for plotting 
import pandas as pd
import seaborn as sns
%matplotlib inline 

df = pd.DataFrame(data=newDB, columns=['PC1','PC2'])
y=pd.Series(iris.target)

# make 3 classes with 3 colors
y=y.replace(0, 's')
y=y.replace(1, 've')
y=y.replace(2, 'vi')
df['Target'] = y

sns.lmplot(x='PC1', y='PC2', data=df, hue='Target', fit_reg=False, legend=True)

In [None]:
# PCA in 2D
import numpy as np
import pandas as pd
cancer_name = ['BLCA', 'BRCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC','PRAD', 'STAD', 'THCA'] 
df1 = pd.read_csv('lncRNA/DeepCC/DeepCC_v2.0/data/TCGA-THCA-rnaexpr-reduced-T.tsv', sep='\t', )
# df1 = pd.read_csv('exp.csv')
# df2 = df1.iloc[1:,1:-1]
R = np.array(df1.iloc[1:,1:-1])

R_cov = np.cov(R, rowvar=False)

eig_values, eig_vectors = np.linalg.eig(R_cov)

featureVector = eig_vectors[:,:2] #Number of components

featureVector_t = np.transpose(featureVector)

R_t = np.transpose(R)
newDB_t = np.matmul(featureVector_t, R_t)
newDB = np.transpose(newDB_t)
# print(newDB)

import seaborn as sns
%matplotlib inline 

df = pd.DataFrame(data=newDB, columns=['PC1','PC2'])
y=df1.iloc[:,-1]

# make 3 classes with 3 colors
y=y.replace(0, 'Normal')
y=y.replace(1, 'Tumor')
# y=y.replace(2, 'COAD')
# y=y.replace(3, 'HNSC')
# y=y.replace(4, 'KIRC')
df['Type'] = y

sns.lmplot(x='PC1', y='PC2', data=df, hue='Type', fit_reg=False, legend=True)

# sns.lmplot(x='PC1', y='PC2', data=df1, fit_reg=False, legend=True)

In [None]:
# PCA using scikit-learn
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 

cancer_name = ['BLCA', 'BRCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC','PRAD', 'STAD', 'THCA'] 
# df1 = pd.read_csv('lncRNA/DeepCC/DeepCC_v2.0/data/TCGA-BLCA-rnaexpr-reduced-T.tsv', sep='\t', )
df1 = pd.read_csv('lncRNA/DeepCC/Data/BLCA_CESC_COAD_HNSC_KIRP_LGG_LIHC_LUAD-68.csv')

R = np.array(df1.iloc[1:,1:-1])

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(R)

print("Components:", pca.components_)
print("explained_variance_ratio", pca.explained_variance_ratio_)

df = pd.DataFrame(data=principalComponents, columns=['PC1','PC2'])
y=df1.iloc[:,-1]

# make 2 classes with 3 colors
# y=y.replace(0, 'Normal')
# y=y.replace(1, 'Tumor')

y=y.replace(0, 'BLCA')
y=y.replace(1, 'CESC')
y=y.replace(2, 'COAD')
y=y.replace(3, 'HNSC')
y=y.replace(4, 'KIRP')
y=y.replace(5, 'LGG')
y=y.replace(6, 'LIHC')
y=y.replace(7, 'LUAD')

df['Type'] = y

fig = plt.figure()
fig = sns.lmplot(x='PC1', y='PC2', data=df, hue='Type', fit_reg=False, legend=True)
fig.savefig("lncRNA/DeepCC/Figure/PCA-2d-68.pdf")

In [None]:
# PCA in 3D
# Three-component PCA of the "-4k" dataset shown as a 3D scatter plot coloured
# by the numeric class label, with red guide lines along each component axis.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
 
import seaborn as sns
sns.set_style("white")

df1 = pd.read_csv('lncRNA/DeepCC/Data/BLCA_CESC_COAD_HNSC_KIRP_LGG_LIHC_LUAD-4k.csv')
# Feature matrix (despite the name, `df` is a NumPy array here) and the
# matching labels; both skip the first row so they stay aligned by position.
# NOTE(review): read_csv already consumes the header line by default, so
# iloc[1:] skips the first data row - confirm this is intended.
df = np.array(df1.iloc[1:,1:-1])
my_color=df1.iloc[1:,-1]

# Run The PCA
pca = PCA(n_components=3)

# Store results of PCA in a data frame
result=pd.DataFrame(pca.fit_transform(df), columns=['PCA%i' % i for i in range(3)])
 
# Plot initialisation: create ONE figure with the requested pixel size and draw
# into it, so the figsize/dpi settings actually apply to the saved plot and no
# empty figure is left behind.
my_dpi=96
fig = plt.figure(figsize=(480/my_dpi, 480/my_dpi), dpi=my_dpi)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=my_color, cmap="Set2_r", s=60)
 
# make simple, bare axis lines through space: for each component draw a red
# segment from its minimum to its maximum while the other two coordinates stay 0
for axis_pos, column in enumerate(result.columns):
    segment = [(0, 0), (0, 0), (0, 0)]
    segment[axis_pos] = (result[column].min(), result[column].max())
    ax.plot(segment[0], segment[1], segment[2], 'r')
 
# label the axes
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
fig.savefig("lncRNA/DeepCC/Figure/PCA-3d-4k.pdf")
plt.show()

In [None]:
#tSNE
from __future__ import print_function
import time
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from pylab import savefig

cancer_name = ['BLCA', 'BRCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC','PRAD', 'STAD', 'THCA'] 
# df1 = pd.read_csv('lncRNA/DeepCC/DeepCC_v2.0/data/TCGA-KIRP-rnaexpr-reduced-T.tsv', sep='\t', )
df1 = pd.read_csv('lncRNA/DeepCC/Data/BLCA_CESC_COAD_HNSC_KIRP_LGG_LIHC_LUAD-4k.csv')

R = np.array(df1.iloc[1:,1:-1])
# R = np.array(df1.iloc[:,0:-1])

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=400)
tsne_results = tsne.fit_transform(R)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

df = pd.DataFrame(data=tsne_results, columns=['tSNE1','tSNE2'])
y=df1.iloc[1:,-1]

# y=y.replace(0, 'Normal')
# y=y.replace(1, 'Tumor')

# make 2 classes with 3 colors
y=y.replace(0, 'BLCA')
y=y.replace(1, 'CESC')
y=y.replace(2, 'COAD')
y=y.replace(3, 'HNSC')
y=y.replace(4, 'KIRP')
y=y.replace(5, 'LGG')
y=y.replace(6, 'LIHC')
y=y.replace(7, 'LUAD')

df['Type'] = y

fig = plt.figure()
fig = sns.lmplot(x='tSNE1', y='tSNE2', data=df, hue='Type', fit_reg=False, legend=True)
fig.savefig("lncRNA/DeepCC/Figure/tSNE-4k.pdf")

In [None]:
# Write whatever plot `fig` currently refers to to the "tSNE-68" PDF.
# NOTE(review): `fig` is not created in this cell - it is left over from a
# previously executed cell. The t-SNE cell visible in this notebook loads the
# "-4k" CSV, so running this right after it would store the 4k plot under the
# "68" file name; confirm the 68-feature data was loaded before saving.
fig.savefig("lncRNA/DeepCC/Figure/tSNE-68.pdf")