<a href="https://colab.research.google.com/github/sanjaynagi/AnoExpressIR/blob/main/workflow/notebooks/misc/pca-count-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install anoexpress -q 

In [None]:
import anoexpress as xpress
import pandas as pd
import numpy as np
import plotly.express as px

# Principal components analysis

In this notebook, we perform principal component analysis (PCA) to explore the overall structure of the count and fold-change data.

In [None]:
def pca(analysis, data_type='log2counts', x="PC1", y="PC2", microarray=False, n_components=6):
    """
    Run a PCA on AnoExpress data and plot two of the resulting components.

    The data matrix is loaded with ``xpress.data``, transposed so that each
    original column becomes one observation, and decomposed with
    scikit-learn's ``PCA``. A bar chart of the explained variance ratio per
    component is shown as a side effect, and a scatter plot of the requested
    components (coloured by the ``species`` metadata column) is returned.

    Parameters
    ----------
    analysis : str
        Analysis identifier, forwarded to the ``xpress`` loaders.
    data_type : {'log2counts', 'fcs'}
        Which data to decompose. ``'fcs'`` uses ``xpress.metadata`` and drops
        rows with missing values; ``'log2counts'`` uses
        ``xpress.sample_metadata``.
    x, y : str
        Columns of the plotted frame used for the two axes (e.g. ``'PC1'``).
    microarray : bool
        Forwarded to ``xpress.data`` and, for ``'fcs'``, ``xpress.metadata``.
    n_components : int
        Number of principal components to compute (default 6, as before).

    Returns
    -------
    plotly figure
        The scatter plot produced by ``plotly.express.scatter``.

    Raises
    ------
    AssertionError
        If ``data_type`` is not one of the supported values.
    """
    valid_data_types = ('fcs', 'log2counts')
    # Validate before any import or data loading so bad input fails fast.
    assert data_type in valid_data_types, (
        f"data_type must be one of {valid_data_types}, got {data_type!r}"
    )

    from sklearn.decomposition import PCA

    res_data = xpress.data(analysis=analysis, data_type=data_type, microarray=microarray)

    if data_type == 'fcs':
        metadata = xpress.metadata(analysis=analysis, microarray=microarray)
        hover_data = ['comparison', 'country', 'technology']
        # PCA cannot handle missing values, so rows containing any NaN are dropped.
        res_data = res_data.set_index(['GeneID', 'GeneName']).dropna()
    elif data_type == 'log2counts':
        metadata = xpress.sample_metadata(analysis=analysis)
        hover_data = ['condition', 'resistance']
        res_data = res_data.set_index('GeneID')

    # Derive the component labels from n_components so they cannot drift apart.
    pc_labels = [f"PC{i}" for i in range(1, n_components + 1)]

    model = PCA(n_components=n_components)
    # Transpose: each original column becomes one observation (row) for the PCA.
    components = model.fit_transform(res_data.T)
    pc = pd.DataFrame(data=components, columns=pc_labels)

    # pd.concat(axis=1) aligns on index labels; resetting the metadata index
    # makes the join positional against the RangeIndex of `pc`.
    # NOTE(review): assumes metadata rows are in the same order as the data
    # columns -- TODO confirm against the anoexpress loaders.
    pca_df = pd.concat([metadata.reset_index(drop=True), pc], axis=1)
    fig = px.scatter(
        data_frame=pca_df,
        x=x,
        y=y,
        color='species',
        hover_data=hover_data,
        template="ggplot2",
        width=800,
        height=600,
    )

    pc_df = pd.DataFrame({
        'PC': pc_labels,
        'explained_variance': model.explained_variance_ratio_,
    })

    # Side effect: display the explained-variance bar chart before returning the scatter.
    var_fig = px.bar(pc_df, x='PC', y='explained_variance', width=800, height=400)
    var_fig.show()
    return fig

### PCA on count data
#### PC1 vs PC2

In [None]:
# Count-data PCA: plot the first two principal components.
pca(
    analysis='gamb_colu_arab_fun',
    data_type='log2counts',
    x='PC1',
    y='PC2',
)

#### PC3 vs PC4

In [None]:
# Count-data PCA: plot the third and fourth principal components.
pca(
    analysis='gamb_colu_arab_fun',
    data_type='log2counts',
    x='PC3',
    y='PC4',
)

### PCA on fold-change data

In [None]:
# Fold-change PCA with the microarray flag enabled; axes default to PC1 and PC2.
pca(
    analysis='gamb_colu_arab_fun',
    data_type='fcs',
    microarray=True,
)

In [None]:
# Fold-change PCA with the microarray flag enabled: third and fourth components.
pca(
    analysis='gamb_colu_arab_fun',
    data_type='fcs',
    microarray=True,
    x='PC3',
    y='PC4',
)