In [11]:
import os
import re
from datetime import date
from urllib.error import HTTPError

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
#import mygene         #for alternative method of changing ensembl names to "normal" gene names
#mg = mygene.MyGeneInfo()

In [2]:
def my_pca(df, n_pc=1, normalize=True):
    """Run a PCA on an expression table and return the component scores.

    The table is transposed before fitting, so the columns of ``df`` are the
    observations and the rows of ``df`` are the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of numeric values. Rows that contain only NA are dropped first.
    n_pc : int, default 1
        Number of principal components to compute.
    normalize : bool, default True
        If True, standardize the transposed values with
        ``sklearn.preprocessing.scale`` before fitting.

    Returns
    -------
    out_df : pandas.DataFrame
        One row per column of ``df`` and one column per component, labelled
        ``1 .. n_pc``.
    expl : numpy.ndarray
        ``explained_variance_ratio_`` of the fitted model.
    """
    df = df.dropna(axis=0, how='all')  # Remove rows with only NA values. Should be none
    x = df.values.T  # observations (columns of df) become rows
    x_in = preprocessing.scale(x) if normalize else x
    pca = PCA(n_components=n_pc)
    # Fit exactly once: the scores and the variance ratios below then come from
    # the same fitted model (the model used to be fitted twice).
    scores = pca.fit_transform(x_in)
    expl = pca.explained_variance_ratio_
    # Build the result directly in its final orientation instead of
    # constructing the transposed frame and flipping it back.
    out_df = pd.DataFrame(scores, index=df.columns, columns=list(range(1, n_pc + 1)))
    return out_df, expl

def TFs_targets(kbp, chip):
    """Fetch the target genes of every TF in ``chip`` and save them as a TSV.

    For each row of ``chip`` the URL in the column
    ``'Target Genes (TSS }<kbp>k)'`` is read as a tab-separated table and its
    ``'Target_genes'`` column is stored as a list. The ``}`` in the column name
    is intentional: it matches the column headers selected from the ChIP-Atlas
    table elsewhere in this notebook.

    OBS! One download is made per TF, so this takes a long time.

    Parameters
    ----------
    kbp : str or int
        Distance label used to build the column name, e.g. ``'10'``.
    chip : pandas.DataFrame
        Table indexed by TF name that contains the URL column described above.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``chip`` with a single column ``'Genes'``. Each cell is the
        list of target genes, or ``['Not found']`` when the download raised an
        ``HTTPError``. The same frame is also written to
        ``'../data/Transfactors/TF_gene_sets.tsv'``.
    """
    TFrange = 'Target Genes (TSS }' + str(kbp) + 'k)'
    TF_gene_list = []
    for dex, (tf_name, url) in enumerate(chip[TFrange].items()):
        try:
            TF_gene_set = pd.read_csv(url, sep='\t')  # one table per TF
            genes = TF_gene_set['Target_genes'].tolist()
            status = 'found'
        except HTTPError:
            # Keep a placeholder so the list stays aligned with chip.index.
            genes = ['Not found']
            status = 'NOT found'
        TF_gene_list.append(genes)
        print('Genes for ' + str(tf_name) + ' (' + str(dex) + ') ' + status)
    TF_gene_sets = pd.DataFrame({'Genes': TF_gene_list}, index=chip.index)
    TF_gene_sets.to_csv('../data/Transfactors/TF_gene_sets.tsv', sep='\t')
    return TF_gene_sets

def ID_to_symbol(datatsv):
    """Replace Ensembl gene IDs with gene symbols in an expression table.

    The ID-to-symbol mapping is read from ``'../data/Gene_names.txt'`` (columns
    ``'Gene stable ID'`` and ``'Gene name'``). Mapping rows that are duplicated
    are discarded entirely (``keep=False``), so an ambiguous symbol is never
    used. Genes without a symbol, and rows containing any other NA value, are
    dropped from the result.

    Parameters
    ----------
    datatsv : str
        Path to a tab-separated expression table with a ``'Gene'`` column
        holding the Ensembl IDs.

    Returns
    -------
    pandas.DataFrame
        The expression table indexed by ``'Symbol'`` with the ``'Gene'`` column
        removed. It is also written to
        ``'../data/genes.raw.htseq2.symbols.tsv'``.
    """
    dataset = pd.read_csv(datatsv, sep='\t')
    symbols = pd.read_csv('../data/Gene_names.txt', index_col='Gene stable ID')
    # This line used to call drop_duplicates on an undefined name, `testing`.
    symbols = symbols.drop_duplicates(keep=False)
    name_by_id = symbols['Gene name']
    # An ID that is still listed more than once is ambiguous; drop it so the
    # lookup below is one-to-one.
    name_by_id = name_by_id[~name_by_id.index.duplicated(keep=False)]
    # Vectorized lookup; IDs without a symbol become NaN and are dropped next.
    dataset['Symbol'] = dataset['Gene'].map(name_by_id)
    dataset = dataset.dropna(how='any')
    dataset = dataset.set_index('Symbol')
    dataset = dataset.drop(columns='Gene')
    dataset.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
    return dataset

def csv_fix(csv):
    """Repair the quoting of a CSV file and load the repaired copy.

    The original chip_atlas_analysis_list.csv that was downloaded did not work:
    every row containing '"' had an extra '"' at its start and end, and each
    '"' inside the row was doubled. This strips the outer quotes and collapses
    every '""' to '"', line by line.

    Parameters
    ----------
    csv : str
        Path of the broken CSV file. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Contents of the repaired file, which is written next to the input as
        ``<name>_CORRECTED.csv`` and overwritten if it already exists.
    """
    name, _ext = os.path.splitext(csv)
    newfilename = '{name}_{uid}{ext}'.format(name=name, uid='CORRECTED', ext='.csv')
    outer_quote = re.compile(r'^"|"$')   # quote at the start or end of a line
    doubled_quote = re.compile(r'""')    # escaped quote
    # Open the output once in write mode. Reopening it in append mode for every
    # line duplicated the content whenever the function was run a second time.
    with open(csv, 'r') as csv_file, open(newfilename, 'w') as corrected_csv:
        for line in csv_file:
            line = outer_quote.sub('', line)
            line = doubled_quote.sub('"', line)
            corrected_csv.write(line)
    csv_df = pd.read_csv(newfilename, quotechar='"')
    return csv_df

In [3]:
# Read the CSV listing all TFs and the URLs of the files with their target genes.
# If the corrected copy does not exist yet, create it from the raw download.
chip_columns = ['Antigen', 'Target Genes (TSS }1k)', 'Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']
try:
    chip_all = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv')
except FileNotFoundError:
    chip_all = csv_fix('../data/Transfactors/chip_atlas_analysis_list.csv')
# Only row labels 1255-1953 are used (presumably the TF entries of interest; confirm
# against the ChIP-Atlas list). For a quick trial run use .loc[1255:1265, ...] instead.
chip = chip_all.loc[1255:1953, chip_columns]
chip = chip.set_index('Antigen')

# Read the TSV with TFs and their target genes. If not available, fetch the data and create it.
# The converter turns the stored text "['a', 'b']" back into a list of gene names.
try:
    TF_gene_sets = pd.read_csv('../data/Transfactors/TF_gene_sets.tsv', sep='\t', index_col='Antigen', converters={'Genes': lambda x: x.strip('[]').replace("'","").split(', ')})
except FileNotFoundError:
    TF_gene_sets = TFs_targets('10', chip)

# Read the expression values indexed by gene symbol. If not available, replace the
# Ensembl IDs and create the file. The result has to be kept: without the assignment
# `dataset` is undefined on a fresh kernel when the symbols file is missing.
try:
    dataset = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Symbol')
except FileNotFoundError:
    dataset = ID_to_symbol('../data/genes.raw.htseq2.tsv')

# Read the table with the sample specifications of the assays.
dataset2 = pd.read_csv('../data/E-MTAB-2328.sdrf.tsv', sep='\t')

Genes for Acaa2 (0) found
Genes for Acss2 (1) found
Genes for Actb (2) found
Genes for Adnp (3) found
Genes for Aebp2 (4) found
Genes for Aff3 (5) found
Genes for Aff4 (6) found
Genes for Ahcy (7) found
Genes for Ahr (8) found
Genes for Aicda (9) found
Genes for Aire (10) found
Genes for Anp32e (11) found
Genes for Ar (12) found
Genes for Arid1a (13) found
Genes for Arid3a (14) found
Genes for Arid4b (15) found
Genes for Arntl (16) found
Genes for Art1 (17) found
Genes for Ascl1 (18) found
Genes for Ascl2 (19) found
Genes for Ash1l (20) found
Genes for Ash2l (21) found
Genes for Asxl1 (22) found
Genes for Atf2 (23) found
Genes for Atf3 (24) found
Genes for Atf4 (25) found
Genes for Atf7 (26) found
Genes for Atf7ip (27) found
Genes for Atm (28) found
Genes for Atoh1 (29) found
Genes for Atrx (30) found
Genes for Auts2 (31) found
Genes for Bach1 (32) found
Genes for Bach2 (33) found
Genes for Bap1 (34) found
Genes for Batf (35) found
Genes for Batf3 (36) found
Genes for Baz1a (37) found


Genes for Lmnb1 (300) found
Genes for Lmo2 (301) found
Genes for Lmx1b (302) found
Genes for Lrrfip1 (303) found
Genes for Lyl1 (304) found
Genes for Maf (305) found
Genes for Mafa (306) found
Genes for Mafb (307) found
Genes for Maff (308) found
Genes for Mafg (309) found
Genes for Mafk (310) found
Genes for Mapk8 (311) found
Genes for Max (312) found
Genes for Maz (313) found
Genes for Mbd1 (314) found
Genes for Mbd2 (315) found
Genes for Mbd3 (316) found
Genes for Mcrs1 (317) found
Genes for Mecp2 (318) found
Genes for Med1 (319) found
Genes for Med12 (320) found
Genes for Med23 (321) found
Genes for Med24 (322) found
Genes for Med26 (323) found
Genes for Mef2a (324) found
Genes for Mef2c (325) found
Genes for Mef2d (326) found
Genes for Meis1 (327) found
Genes for Men1 (328) found
Genes for Mettl3 (329) found
Genes for Mitf (330) found
Genes for Mllt3 (331) found
Genes for Mnx1 (332) found
Genes for Morc3 (333) found
Genes for Mphosph8 (334) found
Genes for Mpnd (335) found
Genes f

Genes for Taf2 (595) found
Genes for Taf3 (596) found
Genes for Taf7l (597) found
Genes for Taf9b (598) found
Genes for Tal1 (599) found
Genes for Tbl1x (600) found
Genes for Tbp (601) found
Genes for Tbpl1 (602) found
Genes for Tbr1 (603) found
Genes for Tbx19 (604) found
Genes for Tbx20 (605) found
Genes for Tbx21 (606) found
Genes for Tbx3 (607) found
Genes for Tbx4 (608) found
Genes for Tbx5 (609) found
Genes for Tcea1 (610) found
Genes for Tcf12 (611) found
Genes for Tcf3 (612) found
Genes for Tcf4 (613) found
Genes for Tcf7 (614) found
Genes for Tcf7l1 (615) found
Genes for Tcf7l2 (616) found
Genes for Tdg (617) found
Genes for Tead1 (618) found
Genes for Tead2 (619) found
Genes for Tead4 (620) found
Genes for Terf1 (621) found
Genes for Terf2ip (622) found
Genes for Tet1 (623) found
Genes for Tet2 (624) found
Genes for Tet3 (625) found
Genes for Tfam (626) found
Genes for Tfap2a (627) found
Genes for Tfap2c (628) found
Genes for Tfap4 (629) found
Genes for Tfcp2l1 (630) found
Ge

In [4]:
# Build the expression table with a two-level column index: organ and developmental stage.

# Extract the relevant sample characteristics from dataset2, one row per assay.
chars = pd.DataFrame({
    'assay': dataset2['Assay Name'].str.slice(stop=6),  # first six characters of the assay name
    'organ': dataset2['Characteristics[organism part]'],
}).drop_duplicates()
#chars['dev_stage'] = dataset2.loc[:,'Characteristics[developmental stage]']
# The stage values are hard-coded and assigned by position, so they rely on the
# row order left after drop_duplicates().
chars['dev_stage'] = [-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29,-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29]
chars = chars.set_index('assay')

# Attach the characteristics to the samples (rows of the transposed table), looked
# up by sample name, and use them as the row index.
datasetT = (
    dataset.T
    .assign(
        dev_stage=lambda t: t.index.to_series().map(chars['dev_stage']),
        organ=lambda t: t.index.to_series().map(chars['organ']),
    )
    .set_index(['organ', 'dev_stage'])
)
#expdata = datasetT.sort_index(level = 0).T
# Transpose back: genes as rows, (organ, dev_stage) as columns.
expdata = datasetT.T
expdata


organ,liver,liver,brain,brain,liver,liver,brain,brain,liver,liver,...,brain,brain,liver,liver,brain,brain,liver,liver,brain,brain
dev_stage,-5.5,-5.5,-5.5,-5.5,-2.5,-2.5,-2.5,-2.5,0.5,0.5,...,4.0,4.0,22.0,22.0,22.0,22.0,29.0,29.0,29.0,29.0
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Gnai3,4800,1646,2510,1424,4779,3478,1694,2512,2481,1138,...,2429,1502,2125,1974,1153,1409,1543,1470,1937,856
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cdc45,984,337,163,108,549,490,63,90,420,202,...,99,123,98,110,41,40,27,32,73,29
H19,77631,31553,4113,1862,182270,140922,1289,3352,79202,72111,...,403,996,2219,2385,34,73,151,71,47,29
Scml2,72,25,85,54,48,38,63,117,34,17,...,92,63,17,5,13,17,6,2,26,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vmn1r-ps47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vmn1r-ps147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Perform one PCA per TF on the expression of that TF's target genes.
# PCA_per_TF collects the first-component score of every sample (one column per TF);
# var_expl collects [TF, variance explained by the first component, number of genes used].
var_expl = []
skipped_TFs = []
PCA_per_TF = pd.DataFrame(index=expdata.columns)
for TF in TF_gene_sets.index:
    # Keep only target genes that are present in the expression data.
    genes = pd.Index(TF_gene_sets.loc[TF, 'Genes']).intersection(expdata.index)
    if len(genes) == 0:
        # E.g. the ['Not found'] placeholder: nothing to decompose, a PCA would fail.
        skipped_TFs.append(TF)
        continue
    TFdata = expdata.loc[genes, :]
    res, expl = my_pca(TFdata)
    PCA_per_TF[TF] = res
    var_expl.append([TF, expl[0], len(genes)])

if skipped_TFs:
    print('Skipped TFs without any target gene in the expression data: ' + ', '.join(map(str, skipped_TFs)))

# Build the summary from var_expl, the list filled in the loop above. The name
# used here before, expl_var, is not defined anywhere in this notebook.
df_var_expl = pd.DataFrame(var_expl, columns=['TF','Variance explained', 'Nr of genes'])
df_var_expl = df_var_expl.set_index('TF')
PCA_per_TF = PCA_per_TF.sort_index()
display(PCA_per_TF)
display(df_var_expl)

Unnamed: 0_level_0,Unnamed: 1_level_0,Acaa2,Acss2,Actb,Adnp,Aebp2,Aff3,Aff4,Ahcy,Ahr,Aicda,...,Zfp64,Zfp708,Zfpm1,Zic1,Zic2,Zic3,Zkscan1,Zmiz1,Zmynd8,Zscan5b
organ,dev_stage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
brain,-5.5,1.747293,2.444239,-0.389841,10.456071,20.74231,17.259539,4.404573,0.868003,7.866191,3.476843,...,0.891667,3.024333,9.826311,9.512205,26.134589,14.06058,11.415897,13.35509,7.192901,1.3939
brain,-5.5,-16.368792,-14.695345,-3.079625,-28.001705,-40.139809,-39.943489,-27.362309,-5.275965,-14.826787,-5.14956,...,-11.676557,-2.531955,-45.110102,-9.632183,-38.312881,-20.266803,-33.727867,-54.029346,-31.700744,-1.997014
brain,-2.5,-10.016745,-7.6078,-1.178894,-9.961827,-12.593929,-15.928864,-16.179566,-2.451338,-4.04138,-1.128816,...,-5.794706,-0.523828,-22.398309,0.117635,-6.950821,-4.51692,-13.369795,-26.13239,-15.431444,-0.378993
brain,-2.5,12.552613,15.927837,2.642237,40.16542,69.567399,60.463024,24.818623,8.083802,26.121284,10.920752,...,9.090336,5.686613,46.321882,28.783649,81.942773,43.502839,44.289497,61.100415,32.750855,3.367006
brain,0.5,-6.033572,-2.34254,-0.152437,1.484007,6.427715,-0.19232,-8.946137,-0.338615,2.784527,1.992556,...,-2.460196,0.769505,-8.576348,7.201797,14.239627,6.970055,-0.864806,-8.633786,-5.45351,0.681648
brain,0.5,1.859274,5.662469,0.864787,17.335766,32.924194,25.606613,5.728136,3.547043,12.592226,5.611799,...,2.241064,2.981267,13.712794,16.741473,42.102828,22.564314,17.473618,20.235745,9.370672,2.022423
brain,4.0,9.201828,14.634429,3.14467,38.001785,66.316622,51.686346,18.700779,6.578579,24.481821,9.7054,...,9.374409,4.249094,39.475251,28.3649,79.642484,41.412403,40.589018,52.272321,28.12341,3.343877
brain,4.0,-5.374515,-1.173573,0.395917,3.721429,12.249583,2.768042,-7.160156,1.874227,3.553837,1.710126,...,-1.618491,1.84468,-7.694277,10.770661,20.846285,10.951036,0.532411,-6.074351,-4.990081,0.899811
brain,22.0,6.069004,14.315439,2.701008,28.940155,53.871243,39.569089,12.851784,6.314423,19.262161,7.731381,...,8.029251,3.270514,30.50853,24.170055,66.388645,34.276539,31.143595,40.867234,20.493,2.493858
brain,22.0,20.127346,28.776729,6.244283,62.873229,106.729003,88.416258,37.707608,12.568846,39.267023,15.04347,...,19.353874,7.413928,75.503758,44.327298,124.905256,65.484842,70.601057,97.398267,52.179527,5.243999


Unnamed: 0_level_0,Variance explained,Nr of genes
TF,Unnamed: 1_level_1,Unnamed: 2_level_1
Acaa2,0.486466,613
Acss2,0.483593,781
Actb,0.466215,32
Adnp,0.403340,4361
Aebp2,0.445511,10761
...,...,...
Zic3,0.419930,4240
Zkscan1,0.409957,5528
Zmiz1,0.422100,11167
Zmynd8,0.451450,3245


In [9]:
# Show the ten TFs with the highest 'Variance explained' value.
df_var_expl.nlargest(n=10, columns='Variance explained')

Unnamed: 0_level_0,Variance explained,Nr of genes
TF,Unnamed: 1_level_1,Unnamed: 2_level_1
Thoc5,0.974005,3
Gcm1,0.676908,8
Sp110,0.673078,10
Art1,0.627226,8
Nlrp3,0.60848,5
Snai1,0.571641,604
Phf20,0.526773,12
Nsl1,0.513757,2451
Elk1,0.503574,4674
Glyr1,0.502517,19


In [12]:
# Save the results in a folder named after today's date. The date is evaluated once
# so both files land in the same folder. makedirs also creates '../exp' itself when
# it is missing and does nothing if the folder already exists.
out_dir = '../exp/' + str(date.today())
os.makedirs(out_dir, exist_ok=True)
PCA_per_TF.to_csv(out_dir + '/PCA_results.csv')
df_var_expl.to_csv(out_dir + '/Variance_explained.csv')

In [109]:
#Read csv with raw data with ensembl IDs replaced with gene symbols. If not available, create such a csv using mygene package
#try:
#    datasetsym = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Gene')
#except FileNotFoundError:
#    dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#    datasetsym = dataset[:]
#    genes = datasetsym.index
#    genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#    genesyms = genesyms.dropna(how='any')
#    genesyms = genesyms.drop_duplicates(subset='Gene')
#    datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#    datasetsym = datasetsym.set_index('Gene')
#    datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')

#Attempt at finding gene symbols for each gene and set it as index.
#dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#datasetsym = dataset[:]
#genes = datasetsym.index
#genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#genesyms = genesyms.dropna(subset=['symbol'], how='any')
#print(genesyms.shape)

#print(genesyms.duplicated())
#genesyms = genesyms.drop_duplicates(subset='Gene', keep=False)
#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')

#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')
#datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
