# PCA on Three Files

In [1]:
import os
import matplotlib.pyplot as plt
import plotly.express as px
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import json

for filename in os.listdir(os.getcwd()):
    if filename.endswith(".txt"):
        currentFile = os.getcwd()+'/'+filename
        print("------------------------------------------------------------------------------------")
        print("NOW WORKING ON FILE "+filename)
        print("------------------------------------------------------------------------------------")
        df = pd.read_csv(currentFile, sep='\t', header = None)
        featureArrayPCA = df.iloc[:,0:len(df.columns)-1].values
        diseaseArrayPCA = df.iloc[:,len(df.columns)-1].values
        centeredFeatureArrayPCA = featureArrayPCA-featureArrayPCA.mean()
        covarianceMatrix = np.cov(centeredFeatureArrayPCA.T)
        eigen_values, eigen_vectors = np.linalg.eig(covarianceMatrix)
        eig_pairs = [(np.abs(eigen_values[i]), eigen_vectors[:,i]) for i in range(len(eigen_values))]
        eig_pairs = sorted(eig_pairs, key=lambda x : x[0])
        eig_pairs.reverse()
        matrix = np.hstack((eig_pairs[0][1].reshape(len(eig_pairs[0][1]),1),eig_pairs[1][1].reshape(len(eig_pairs[1][1]),1)) )
        projectionSpaceMatrix = matrix.T.dot(centeredFeatureArrayPCA.T)        
        projectionSpaceMatrix = projectionSpaceMatrix.T
        projectionSpaceMatrix = projectionSpaceMatrix.real
        PCA_DF = pd.DataFrame(projectionSpaceMatrix,columns = ["PC1","PC2"])
        PCA_DF['disease'] = diseaseArrayPCA
        print("SCATTER PLOT FOR PCA OF FILE: "+filename)
        fig = px.scatter(PCA_DF, x="PC1", y="PC2", color="disease")
        fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey'),opacity = 0.9),selector=dict(mode='markers'))
        fig.show()
        print("FINISHED WORKING ON FILE "+filename+"\n")
    else:
        continue

ModuleNotFoundError: No module named 'plotly'

# SVD on Three Files

In [2]:
import os
import plotly.express as px
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

for filename in os.listdir(os.getcwd()):
    if filename.endswith(".txt"):
        currentFile = os.getcwd()+'/'+filename
        print("------------------------------------------------------------------------------------")
        print("NOW WORKING ON FILE "+filename)
        print("------------------------------------------------------------------------------------")
        df = pd.read_csv(currentFile, sep='\t', header = None)
        featureArraySVD = df.iloc[:,0:len(df.columns)-1].values
        diseaseArraySVD = df.iloc[:,len(df.columns)-1].values
        u,s,v = np.linalg.svd(featureArraySVD)
        SVD_DF = pd.DataFrame(u[:,0:2],columns = ["SV1","SV2"])
        SVD_DF['disease'] = diseaseArraySVD
        print("SCATTER PLOT FOR SVD OF FILE: "+filename)
        fig = px.scatter(SVD_DF, x="SV1", y="SV2", color="disease")
        fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey'),opacity = 0.9),selector=dict(mode='markers'))
        fig.show()
        print("FINISHED WORKING ON FILE "+filename+"\n")
    else:
        continue

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_b.txt
------------------------------------------------------------------------------------
SCATTER PLOT FOR SVD OF FILE: pca_b.txt


FINISHED WORKING ON FILE pca_b.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_demo.txt
------------------------------------------------------------------------------------
SCATTER PLOT FOR SVD OF FILE: pca_demo.txt


FINISHED WORKING ON FILE pca_demo.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_c.txt
------------------------------------------------------------------------------------
SCATTER PLOT FOR SVD OF FILE: pca_c.txt


FINISHED WORKING ON FILE pca_c.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_a.txt
------------------------------------------------------------------------------------
SCATTER PLOT FOR SVD OF FILE: pca_a.txt


FINISHED WORKING ON FILE pca_a.txt



# t-SNE on Three files

In [3]:
import os
import plotly.express as px
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

for filename in os.listdir(os.getcwd()):
    if filename.endswith(".txt"):
        currentFile = os.getcwd()+'/'+filename
        print("------------------------------------------------------------------------------------")
        print("NOW WORKING ON FILE "+filename)
        print("------------------------------------------------------------------------------------")
        df = pd.read_csv(currentFile, sep='\t', header = None)
        featureArraySNE = df.iloc[:,0:len(df.columns)-1].values
        diseaseArraySNE = df.iloc[:,len(df.columns)-1].values
        tsneParams = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=400)
        tsneOutput = tsneParams.fit_transform(featureArraySNE)
        SNE_DF = pd.DataFrame(tsneOutput,columns = ["t-SNE1","t-SNE2"])
        SNE_DF['disease'] = diseaseArraySNE
        print("SCATTER PLOT FOR t-SNE OF FILE: "+filename)
        fig = px.scatter(SNE_DF, x="t-SNE1", y="t-SNE2", color="disease")
        fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey'),opacity = 0.9),selector=dict(mode='markers'))
        fig.show()
        print("FINISHED WORKING ON FILE "+filename+"\n")
    else:
        continue

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_b.txt
------------------------------------------------------------------------------------
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 386 samples in 0.001s...
[t-SNE] Computed neighbors for 386 samples in 0.036s...
[t-SNE] Computed conditional probabilities for sample 386 / 386
[t-SNE] Mean sigma: 0.824576
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.691788
[t-SNE] Error after 400 iterations: 0.603519
SCATTER PLOT FOR t-SNE OF FILE: pca_b.txt


FINISHED WORKING ON FILE pca_b.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_demo.txt
------------------------------------------------------------------------------------
[t-SNE] Computing 99 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.013s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 0.689194
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.827942
[t-SNE] Error after 400 iterations: 0.080354
SCATTER PLOT FOR t-SNE OF FILE: pca_demo.txt


FINISHED WORKING ON FILE pca_demo.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_c.txt
------------------------------------------------------------------------------------
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 428 samples in 0.000s...
[t-SNE] Computed neighbors for 428 samples in 0.011s...
[t-SNE] Computed conditional probabilities for sample 428 / 428
[t-SNE] Mean sigma: 0.365075
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.174610
[t-SNE] Error after 400 iterations: 0.525936
SCATTER PLOT FOR t-SNE OF FILE: pca_c.txt


FINISHED WORKING ON FILE pca_c.txt

------------------------------------------------------------------------------------
NOW WORKING ON FILE pca_a.txt
------------------------------------------------------------------------------------
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 150 samples in 0.000s...
[t-SNE] Computed neighbors for 150 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 150 / 150
[t-SNE] Mean sigma: 0.669058
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.583202
[t-SNE] Error after 400 iterations: 0.084539
SCATTER PLOT FOR t-SNE OF FILE: pca_a.txt


FINISHED WORKING ON FILE pca_a.txt

