In [133]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
import re
import os
#import mygene         #for alternative method of changing ensembl names to "normal" gene names
#mg = mygene.MyGeneInfo()

In [134]:
#Function for performing the PCA. Takes dataframe with expression values as input
def my_pca(df, n_pc=1, normalize=True):
    df = df.dropna(axis = 0, how = 'all') #Remove rows with only NA values. Should be none
    x = df.values.T #Set x as transpose of only the numerical values of the dataframe
    if normalize:
        x2 = preprocessing.scale(x) #Standardize the data (center to mean and scale to unit variance)
    else:
        x2 = x
    pca = PCA(n_components = n_pc) #Set PCA parameters
    pca.fit(x2) #Fit data to model
    expl = pca.explained_variance_ratio_
    #my_pca.pca = pca #Used for Patrik's dictionary method for components
    x3 = pca.fit_transform(x2) #Transform the data (apply dimensionality reduciton) and set x3 as principal components 
    out_df = pd.DataFrame(x3.transpose(), index=list(range(1,n_pc+1)), columns=df.columns) #Create dataframe with vlues from the PCA and set columnindex as the PC number 
    out_df = out_df.transpose()
    return out_df, expl

#Function for creating csv with all TFs and their target genes. For each TF, identifies urls for target gene data and fetches the info and puts it into a list that is then converted to a dataframe and given as output
def TFs_targets(kbp, chip):   
    TFrange = ('Target Genes (TSS }'+kbp+'k)')
    TF_gene_list = []
    dex = 0
    for url in chip.loc[:,TFrange]: #OBS! This takes a long time.
        try:
            TF_gene_set = pd.read_csv(url, sep='\t') #For a specific TF, read csv from url as a dataframe
            genes = TF_gene_set['Target_genes'].tolist() #Take the contents of column 'Target_genes' and puts it into a list
            TF_gene_list.append(genes) #Append the list for a specific TF to list with all TFs
#            print('Genes for '+chip.index[dex]+' found')
        except HTTPError: #If the url can't be reached, insert 'Not found' in the list and continue (to get correct index)
            genes = ['Not found']
            TF_gene_list.append(genes) #Append message that genes were not found
#            print('Genes for '+chip.index[dex]+' NOT found')
        dex = dex+1
    TF_gene_sets = pd.DataFrame({'Genes':TF_gene_list}, index=chip.index) #Create a dataframe from the list of TFs and their target genes
    TF_gene_sets.to_csv('../data/Transfactors/TF_gene_sets.tsv', sep='\t')
    return TF_gene_sets

#Function for replacing the ensenbl gene IDs with gene symbols/names in expression data
def ID_to_symbol(datatsv):    
    dataset = pd.read_csv(datatsv, sep='\t') 
    symbols = pd.read_csv('../data/Gene_names.txt', index_col='Gene stable ID')
    symbols = testing.drop_duplicates(keep=False)
    x = 0
    for gene in dataset.loc[:,'Gene']:
        try:
            dataset.loc[x,'Symbol'] = symbols.loc[gene,'Gene name']
            x = x + 1
        except KeyError:
            dataset.loc[x,'Symbol'] = float('NaN')
            x = x + 1
    dataset = dataset.dropna(how='any')
    dataset = dataset.set_index('Symbol')
    dataset = dataset.drop(columns='Gene')
    dataset.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
    return dataset

# The original chip_atlas_analysis_list.csv that was downloaded did not work, as it contained an extra '"' in the start and end of each row with '"' in it, and each '"' had an extra  '"'
def csv_fix(csv):
    name, ext = os.path.splitext(csv)
    newfilename = '{name}_{uid}{ext}'.format(name=name, uid='CORRECTED', ext='.csv')
    with open(csv,'r+') as csv_file:
        for line in csv_file:
            # removing starting and ending quotes of a line
            pattern1 = re.compile(r'^"|"$',re.MULTILINE)
            line = re.sub(r'^"|"$',"",line)
            # substituting escaped quote with a single quote
            pattern2 = re.compile(r'""')
            line = re.sub(r'""','"',line)
            corrected_csv = open(newfilename,'a')
            corrected_csv.write(line)
            corrected_csv.close()
    csv_df = pd.read_csv(newfilename, quotechar = '"')
    return csv_df

In [135]:
# Read csv with all all TFs and urls to files with their target genes
try:
    chip = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv').loc[1255:1953,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
#    chip = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv').loc[1255:1265,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
except FileNotFoundError:
    chip = csv_fix('../data/Transfactors/chip_atlas_analysis_list.csv').loc[1255:1953,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
#    chip = csv_fix('../data/Transfactors/chip_atlas_analysis_list.csv').loc[1255:1265,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
chip = chip.set_index('Antigen')

#Read csv with TFs and their target genes. If not available, fetch the data and create a csv
try: 
    TF_gene_sets = pd.read_csv('../data/Transfactors/TF_gene_sets.tsv', sep='\t', index_col='Antigen', converters={'Genes': lambda x: x.strip('[]').replace("'","").split(', ')})
except FileNotFoundError:
    TF_gene_sets = TFs_targets('10', chip)

#Read in dataset with expression values and gene symbols. If not available, replace ensembl IDs and create the csv
try:
    dataset = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Symbol')
except:
    ID_to_symbol('../data/genes.raw.htseq2.tsv')

#Read in dataset with sample sepcifications
dataset2 = pd.read_csv('../data/E-MTAB-2328.sdrf.tsv', sep='\t') #Read csv with specifications of assays

NameError: name 'HTTPError' is not defined

In [107]:
#Create dataframe with multi-colunindex of organ and developmental stage

#Exctract important characteristincs of samples from dataset2
chars = pd.DataFrame()
chars['assay'] = dataset2.loc[:,'Assay Name'].str.slice(stop=6)
#chars['dev_stage'] = dataset2.loc[:,'Characteristics[developmental stage]']
chars['organ'] = dataset2.loc[:,'Characteristics[organism part]']
chars = chars.drop_duplicates()
chars['dev_stage'] = [-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29,-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29]

chars = chars.set_index('assay')

#Insert the important characteristics into dataset with expression values as column index
datasetT = dataset.T
datasetT['dev_stage'] = datasetT.index.to_series().map(chars['dev_stage'])
datasetT['organ'] = datasetT.index.to_series().map(chars['organ'])
datasetT.set_index(['organ','dev_stage'], inplace=True)
#expdata = datasetT.sort_index(level = 0).T
expdata = datasetT.T
expdata


organ,liver,liver,brain,brain,liver,liver,brain,brain,liver,liver,...,brain,brain,liver,liver,brain,brain,liver,liver,brain,brain
dev_stage,-5.5,-5.5,-5.5,-5.5,-2.5,-2.5,-2.5,-2.5,0.5,0.5,...,4.0,4.0,22.0,22.0,22.0,22.0,29.0,29.0,29.0,29.0
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Gnai3,4800,1646,2510,1424,4779,3478,1694,2512,2481,1138,...,2429,1502,2125,1974,1153,1409,1543,1470,1937,856
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cdc45,984,337,163,108,549,490,63,90,420,202,...,99,123,98,110,41,40,27,32,73,29
H19,77631,31553,4113,1862,182270,140922,1289,3352,79202,72111,...,403,996,2219,2385,34,73,151,71,47,29
Scml2,72,25,85,54,48,38,63,117,34,17,...,92,63,17,5,13,17,6,2,26,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vmn1r-ps47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vmn1r-ps147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
#Performing the PCA
expl_var = []
PCA_per_TF = pd.DataFrame(index=expdata.columns)
for TF in TF_gene_sets.index: 
    genesIndex = pd.DataFrame(TF_gene_sets.loc[TF,'Genes'], columns=['Genes'])
    genesIndex.set_index('Genes', inplace=True)  
    genes = genesIndex.index.intersection(expdata.index)
    TFdata = expdata.loc[genes,:]
    [res, expl] = my_pca(TFdata)
    PCA_per_TF[TF] = res
    expl_var.append([TF,expl[0]])

df_expl_var = pd.DataFrame(expl_var, columns=['TF','Explained variance'])
df_expl_var = df_expl_var.set_index('TF')
PCA_per_TF.sort_index(inplace=True)
display(PCA_per_TF)
display(df_expl_var)

Unnamed: 0_level_0,Unnamed: 1_level_0,Acaa2,Acss2,Actb,Adnp,Aebp2,Aff3,Aff4,Ahcy,Ahr,Aicda,Aire
organ,dev_stage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
liver,-5.5,15.100648,2.313086,-0.748672,-2.127639,-13.675117,0.106447,19.121834,-1.92708,-2.281798,-6.515634,3.786126
liver,-5.5,-21.549613,-24.508499,-4.593633,-52.13841,-84.165669,-76.702732,-39.272114,-9.995879,-30.269061,-14.245539,-86.873317
brain,-5.5,1.747293,2.444239,-0.389841,10.456071,20.74231,17.259539,4.404573,0.868003,7.866191,3.476843,16.006622
brain,-5.5,-16.368792,-14.695345,-3.079625,-28.001705,-40.139809,-39.943489,-27.362309,-5.275965,-14.826787,-5.14956,-48.613741
liver,-2.5,8.967775,1.069314,-1.627141,-8.09592,-19.34869,-7.46836,11.569513,-2.152834,-5.809596,-5.114745,-4.586812
liver,-2.5,0.737843,-5.673242,-1.843599,-18.150348,-35.360876,-24.701879,-1.714771,-4.107312,-12.066858,-7.666245,-24.979101
brain,-2.5,-10.016745,-7.6078,-1.178894,-9.961827,-12.593929,-15.928864,-16.179566,-2.451338,-4.04138,-1.128816,-20.661155
brain,-2.5,12.552613,15.927837,2.642237,40.16542,69.567399,60.463024,24.818623,8.083802,26.121284,10.920752,63.468088
liver,0.5,-2.672261,-7.489024,-1.513259,-22.610687,-41.413032,-31.210093,-7.643147,-4.042826,-14.557463,-7.064359,-32.632975
liver,0.5,-21.536325,-23.144168,-3.949341,-49.727491,-81.001449,-73.327609,-38.577362,-9.073917,-29.578726,-12.120662,-82.703472


Unnamed: 0_level_0,Explained variance
TF,Unnamed: 1_level_1
Acaa2,0.486466
Acss2,0.483593
Actb,0.466215
Adnp,0.40334
Aebp2,0.445511
Aff3,0.42928
Aff4,0.453694
Ahcy,0.409004
Ahr,0.366293
Aicda,0.366782


In [None]:
try:
    PCA_per_TF.to_csv('../exp/'+str(date.today())+'/PCA_results.csv')
    df_expl_var.to_csv('../exp/'+str(date.today())+'/Variance_explained.csv')
except FileNotFoundError:
    os.mkdir('../exp/'+str(date.today()))
    PCA_per_TF.to_csv('../exp/'+str(date.today())+'/PCA_results.csv')
    df_expl_var.to_csv('../exp/'+str(date.today())+'/Variance_explained.csv')

In [109]:
#Read csv with raw data with ensembl IDs replaced with gene symbols. If not available, create such a csv using mygene package
#try:
#    datasetsym = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Gene')
#except FileNotFoundError:
#    dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#    datasetsym = dataset[:]
#    genes = datasetsym.index
#    genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#    genesyms = genesyms.dropna(how='any')
#    genesyms = genesyms.drop_duplicates(subset='Gene')
#    datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#    datasetsym = datasetsym.set_index('Gene')
#    datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')

#Attempt at finding gene symbols for each gene and set it as index.
#dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#datasetsym = dataset[:]
#genes = datasetsym.index
#genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#genesyms = genesyms.dropna(subset=['symbol'], how='any')
#print(genesyms.shape)

#print(genesyms.duplicated())
#genesyms = genesyms.drop_duplicates(subset='Gene', keep=False)
#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')

#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')
#datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
