In [186]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
import re
import os
#import mygene         #for alternative method of changing ensembl names to "normal" gene names
#mg = mygene.MyGeneInfo()

In [187]:
#Function for performing the PCA. Takes dataframe with expression values as input
def my_pca(df, n_pc=1, normalize=True):
    df = df.dropna(axis = 0, how = 'all') #Remove rows with only NA values 
    x = df.values.T #Set x as transpose of only the numerical values of the dataframe
    if normalize:
        x2 = preprocessing.scale(x) #Standardize the data (center to mean and scale to unit variance)
    else:
        x2 = x
    pca = PCA(n_components = n_pc) #Set PCA parameters
    pca.fit(x2) #Fit data to model
    print(pca.explained_variance_ratio_)
    #my_pca.pca = pca #Used for Patrik's dictionary method for components
    x3 = pca.fit_transform(x2) #Transform the data (apply dimensionality reduciton) and set x3 as principal components 
    out_df = pd.DataFrame(x3.transpose(), index=list(range(1,n_pc+1)), columns=df.columns) #Create dataframe with vlues from the PCA and set columnindex as the PC number 
    out_df = out_df.transpose()
    return out_df

#Function for replacing the ensenbl gene IDs with gene symbols/names in expression data
def ID_to_symbol(datatsv):    
    dataset = pd.read_csv(datatsv, sep='\t') 
    symbols = pd.read_csv('../data/Gene_names.txt', index_col='Gene stable ID')
    symbols = testing.drop_duplicates(keep=False)
    x = 0
    for gene in dataset.loc[:,'Gene']:
        try:
            dataset.loc[x,'Symbol'] = symbols.loc[gene,'Gene name']
            x = x + 1
        except KeyError:
            dataset.loc[x,'Symbol'] = float('NaN')
            x = x + 1
    dataset = dataset.dropna(how='any')
    dataset = dataset.set_index('Symbol')
    dataset = dataset.drop(columns='Gene')
    dataset.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
    return dataset

# The original chip_atlas_analysis_list.csv that was downloaded did not work, as it contained an extra '"' in the start and end of each row with '"' in it, and each '"' had an extra  '"'
def csv_fix(csv):
    name, ext = os.path.splitext(csv)
    newfilename = '{name}_{uid}{ext}'.format(name=name, uid='CORRECTED', ext='.txt')
    with open(csv,'r+') as csv_file:
        for line in csv_file:
            # removing starting and ending quotes of a line
            pattern1 = re.compile(r'^"|"$',re.MULTILINE)
            line = re.sub(r'^"|"$',"",line)
            # substituting escaped quote with a single quote
            pattern2 = re.compile(r'""')
            line = re.sub(r'""','|',line)
            corrected_csv = open(newfilename,'a')
            corrected_csv.write(line)
            corrected_csv.close()
    csv_df = pd.read_csv(newfilename, quotechar = '|')
    return csv_df

In [188]:
#Read in dataset with expression values and gene symbols. If not available, replace ensembl IDs and create the csv
try:
    dataset = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Symbol')
except:
    ID_to_symbol('../data/genes.raw.htseq2.tsv')

#Read in dataset with sample sepcifications
dataset2 = pd.read_csv('../data/E-MTAB-2328.sdrf.tsv', sep='\t') #Read csv with specifications of assays

In [189]:
#Create dataframe with multi-colunindex of organ and developmental stage

#Exctract important characteristincs of samples from dataset2
chars = pd.DataFrame()
chars['assay'] = dataset2.loc[:,'Assay Name'].str.slice(stop=6)
#chars['dev_stage'] = dataset2.loc[:,'Characteristics[developmental stage]']
chars['organ'] = dataset2.loc[:,'Characteristics[organism part]']
chars = chars.drop_duplicates()
chars['dev_stage'] = [-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29,-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29]

chars = chars.set_index('assay')

#Insert the important characteristics into dataset with expression values as column index
datasetT = dataset.T
datasetT['dev_stage'] = datasetT.index.to_series().map(chars['dev_stage'])
datasetT['organ'] = datasetT.index.to_series().map(chars['organ'])
datasetT.set_index(['organ','dev_stage'], inplace=True)
#expdata = datasetT.sort_index(level = 0).T
expdata = datasetT.T
expdata


organ,liver,liver,brain,brain,liver,liver,brain,brain,liver,liver,...,brain,brain,liver,liver,brain,brain,liver,liver,brain,brain
dev_stage,-5.5,-5.5,-5.5,-5.5,-2.5,-2.5,-2.5,-2.5,0.5,0.5,...,4.0,4.0,22.0,22.0,22.0,22.0,29.0,29.0,29.0,29.0
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Gnai3,4800,1646,2510,1424,4779,3478,1694,2512,2481,1138,...,2429,1502,2125,1974,1153,1409,1543,1470,1937,856
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cdc45,984,337,163,108,549,490,63,90,420,202,...,99,123,98,110,41,40,27,32,73,29
H19,77631,31553,4113,1862,182270,140922,1289,3352,79202,72111,...,403,996,2219,2385,34,73,151,71,47,29
Scml2,72,25,85,54,48,38,63,117,34,17,...,92,63,17,5,13,17,6,2,26,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vmn1r-ps47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vmn1r-ps147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
PCA_per_TF = pd.DataFrame(index=expdata.columns)

#Test PCA for a single TF
TF = 'Acaa2'
filename = '{path}/{TF}{ext}'.format(path='../data/Transfactors', TF=TF, ext='.10.tsv')
TF_gene_set = pd.read_csv(filename, sep='\t')
genesIndex = TF_gene_set.set_index('Target_genes')
genes = genesIndex.index.intersection(expdata.index)

#Test PCA for a single TF using data from liver samples
TFdataliver = expdata.loc[genes,'liver']
resliver = my_pca(TFdataliver)
PCA_per_TF.loc['liver',TF] = resliver

#Test PCA for a single TF using data from brain samples
TFdatabrain = expdata.loc[genes,'brain']
resbrain = my_pca(TFdatabrain)
PCA_per_TF.loc['brain',TF] = resbrain

PCA_per_TF.sort_index()
PCA_per_TF.to_csv('../exp/single_TF_test.csv')
PCA_per_TF

[0.67520751]
[0.6616275]


  raw_cell, store_history, silent, shell_futures)
  raw_cell, store_history, silent, shell_futures)


Unnamed: 0_level_0,Unnamed: 1_level_0,Acaa2
organ,dev_stage,Unnamed: 2_level_1
liver,-5.5,25.448702
liver,-5.5,-19.654379
brain,-5.5,-5.629221
brain,-5.5,-26.383718
liver,-2.5,17.343544
liver,-2.5,7.976795
brain,-2.5,-18.505471
brain,-2.5,7.070578
liver,0.5,3.516864
liver,0.5,-19.682575


In [194]:
PCA_per_TF2 = pd.DataFrame(index=expdata.columns)

#Test PCA for a single TF
TF = 'Acaa2'
filename = '{path}/{TF}{ext}'.format(path='../data/Transfactors', TF=TF, ext='.10.tsv')
TF_gene_set = pd.read_csv(filename, sep='\t')
genesIndex = TF_gene_set.set_index('Target_genes')
genes = genesIndex.index.intersection(expdata.index)

#Test PCA for a single TF using data from all samples
TFdata = expdata.loc[genes,:]
res = my_pca(TFdata)
PCA_per_TF2.loc[:,TF] = res


PCA_per_TF2.sort_index()
PCA_per_TF2
PCA_per_TF2.to_csv('../exp/single_TF_test2.csv')


[0.48646623]


  raw_cell, store_history, silent, shell_futures)


In [109]:
#Read csv with raw data with ensembl IDs replaced with gene symbols. If not available, create such a csv using mygene package
#try:
#    datasetsym = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Gene')
#except FileNotFoundError:
#    dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#    datasetsym = dataset[:]
#    genes = datasetsym.index
#    genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#    genesyms = genesyms.dropna(how='any')
#    genesyms = genesyms.drop_duplicates(subset='Gene')
#    datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#    datasetsym = datasetsym.set_index('Gene')
#    datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')

#Attempt at finding gene symbols for each gene and set it as index.
#dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#datasetsym = dataset[:]
#genes = datasetsym.index
#genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#genesyms = genesyms.dropna(subset=['symbol'], how='any')
#print(genesyms.shape)

#print(genesyms.duplicated())
#genesyms = genesyms.drop_duplicates(subset='Gene', keep=False)
#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')

#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')
#datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
