In [5]:
import ast
import os
import re

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
#import mygene         #for alternative method of changing ensembl names to "normal" gene names
#mg = mygene.MyGeneInfo()

In [6]:
#Function for performing the PCA. Takes dataframe with expression values as input
def my_pca(df, n_pc=1, normalize=True):
    df = df.dropna(axis = 0, how = 'all') #Remove rows with only NA values 
    x = df.values.T #Set x as transpose of only the numerical values of the dataframe
    if normalize:
        x2 = preprocessing.scale(x) #Standardize the data (center to mean and scale to unit variance)
    else:
        x2 = x
    pca = PCA(n_components = n_pc) #Set PCA parameters
    pca.fit(x2) #Fit data to model
    #my_pca.pca = pca #Used for Patrik's dictionary method for components
    x3 = pca.fit_transform(x2) #Transform the data (apply dimensionality reduciton) and set x3 as principal components 
    out_df = pd.DataFrame(x3.transpose(), index=list(range(1,n_pc+1)), columns=df.columns) #Create dataframe with vlues from the PCA and set columnindex as the PC number 
    out_df = out_df.transpose()
    return out_df

#Function for creating csv with all TFs and their target genes. For each TF, identifies urls for target gene data and fetches the info and puts it into a list that is then converted to a dataframe and given as output
def TFs_targets(TFrange):   
    #chip = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv').loc[1255:1953,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
    chip = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv').loc[1255:1265,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
    chip = chip.set_index('Antigen')
    TF_gene_list = []
    dex = 0
    for url in chip.loc[:,TFrange]: #OBS! This takes a long time.
        try:
            TF_gene_set = pd.read_csv(url, sep='\t') #For a specific TF, read csv from url as a dataframe
            genes = TF_gene_set['Target_genes'].tolist() #Take the contents of column 'Target_genes' and puts it into a list
            TF_gene_list.append(genes) #Append the list for a specific TF to list with all TFs
            print('Genes for '+chip.index[dex]+' found')
        except HTTPError: #If the url can't be reached, insert 'Not found' in the list and continue (to get correct index)
            genes = ['Not found']
            TF_gene_list.append(genes) #Append message that genes were not found
            print('Genes for '+chip.index[dex]+' NOT found')
        dex = dex+1
    TF_gene_sets = pd.DataFrame({'Genes':TF_gene_list}, index=chip.index) #Create a dataframe from the list of TFs and their target genes
    TF_gene_sets.to_csv('../data/Transfactors/TF_gene_sets_test.txt')
    return TF_gene_sets

#Function for replacing the ensenbl gene IDs with gene symbols/names in expression data
def ID_to_symbol(datatsv):    
    dataset = pd.read_csv(datatsv, sep='\t') 
    symbols = pd.read_csv('../data/Gene_names.txt', index_col='Gene stable ID')
    symbols = testing.drop_duplicates(keep=False)
    x = 0
    for gene in dataset.loc[:,'Gene']:
        try:
            dataset.loc[x,'Symbol'] = symbols.loc[gene,'Gene name']
            x = x + 1
        except KeyError:
            dataset.loc[x,'Symbol'] = float('NaN')
            x = x + 1
    dataset = dataset.dropna(how='any')
    dataset = dataset.set_index('Symbol')
    dataset = dataset.drop(columns='Gene')
    dataset.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')
    return dataset

# The original chip_atlas_analysis_list.csv that was downloaded did not work, as it contained an extra '"' in the start and end of each row with '"' in it, and each '"' had an extra  '"'
def csv_fix(csv):
    name, ext = os.path.splitext(csv)
    newfilename = '{name}_{uid}{ext}'.format(name=name, uid='CORRECTED', ext='.txt')
    with open(csv,'r+') as csv_file:
        for line in csv_file:
            # removing starting and ending quotes of a line
            pattern1 = re.compile(r'^"|"$',re.MULTILINE)
            line = re.sub(r'^"|"$',"",line)
            # substituting escaped quote with a single quote
            pattern2 = re.compile(r'""')
            line = re.sub(r'""','|',line)
            corrected_csv = open(newfilename,'a')
            corrected_csv.write(line)
            corrected_csv.close()
    csv_df = pd.read_csv(newfilename, quotechar = '|')
    return csv_df

In [7]:
#Set if want to find genes 1, 5 or 10 kb from TF binding site
targetrange = 'Target Genes (TSS }10k)' 

# Read csv with all all TFs and urls to files with their target genes
try:
    chip = pd.read_csv('../data/Transfactors/chip_atlas_analysis_list_CORRECTED.csv').loc[1255:1265,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]
except FileNotFoundError:
    chip = csv_fix('../data/Transfactors/chip_atlas_analysis_list.csv').loc[1255:1265,['Antigen','Target Genes (TSS }1k)','Target Genes (TSS }5k)', 'Target Genes (TSS }10k)']]

#Read csv with TFs and their target genes. If not available, fetch the data and create a csv
try: 
    TF_gene_sets = pd.read_csv('../data/Transfactors/TF_gene_sets_test.txt', index_col = 'Antigen')
except FileNotFoundError:
    TF_gene_sets = TF_targets(targetrange, chip)

#Read in dataset with expression values and gene symbols. If not available, replace ensembl IDs and create the csv
try:
    dataset = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Symbol')
except:
    ID_to_symbol('../data/genes.raw.htseq2.tsv')

#Read in dataset with sample sepcifications
dataset2 = pd.read_csv('../data/E-MTAB-2328.sdrf.tsv', sep='\t') #Read csv with specifications of assays

In [8]:
#Create dataframe with multi-colunindex of organ and developmental stage

#Exctract important characteristincs of samples from dataset2
chars = pd.DataFrame()
chars['assay'] = dataset2.loc[:,'Assay Name'].str.slice(stop=6)
#chars['dev_stage'] = dataset2.loc[:,'Characteristics[developmental stage]']
chars['organ'] = dataset2.loc[:,'Characteristics[organism part]']
chars = chars.drop_duplicates()
chars['dev_stage'] = [-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29,-5.5,-5.5,-2.5,-2.5,0.5,0.5,4,4,22,22,29,29]

chars = chars.set_index('assay')

#Insert the important characteristics into dataset with expression values as column index
datasetT = dataset.T
datasetT['dev_stage'] = datasetT.index.to_series().map(chars['dev_stage'])
datasetT['organ'] = datasetT.index.to_series().map(chars['organ'])
datasetT.set_index(['organ','dev_stage'], inplace=True)
#expdata = datasetT.sort_index(level = 0).T
expdata = datasetT.T
expdata

organ,liver,liver,brain,brain,liver,liver,brain,brain,liver,liver,...,brain,brain,liver,liver,brain,brain,liver,liver,brain,brain
dev_stage,-5.5,-5.5,-5.5,-5.5,-2.5,-2.5,-2.5,-2.5,0.5,0.5,...,4.0,4.0,22.0,22.0,22.0,22.0,29.0,29.0,29.0,29.0
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Gnai3,4800,1646,2510,1424,4779,3478,1694,2512,2481,1138,...,2429,1502,2125,1974,1153,1409,1543,1470,1937,856
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cdc45,984,337,163,108,549,490,63,90,420,202,...,99,123,98,110,41,40,27,32,73,29
H19,77631,31553,4113,1862,182270,140922,1289,3352,79202,72111,...,403,996,2219,2385,34,73,151,71,47,29
Scml2,72,25,85,54,48,38,63,117,34,17,...,92,63,17,5,13,17,6,2,26,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vmn1r-ps47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vmn1r-ps147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:

PCA_per_TF = pd.DataFrame(index=expdata.columns)

for TF in TF_gene_sets.index: #Performing the PCA
    genes = list(TF_gene_sets.loc[TF, 'Genes'])
    TFdata = dataset.loc[genes]
    res = my_pca(TFdata)
    PCA_per_TF.loc[:,TF] = res

PCA_per_TF
#PCA_per_TF = pd.DataFrame(index=dataset.columns)
#PCA_per_TF[TF] = 100
#print(PCA_per_TF)
#try:
#    TF_gene_sets_test = pd.read_csv('../data/Transfactors/TF_gene_sets_test_CORRECTED.csv', quotechar='|')
#except:
#    TF_gene_sets_test = csv_fix('../data/Transfactors/TF_gene_sets_test.csv')
    #TF_gene_sets_test = TF_gene_sets_test.set_index('Antigen')
#TF_gene_sets = pd.read_csv('../data/Transfactors/TF_gene_sets.csv', index_col = 'Antigen', quotechar='"')

#TF = TF_gene_sets_test.index  #Pick out a TF to test
#TF
#TFgenes = list(TF_gene_sets.loc[TF,'Genes']) #Find genes associated with the TF

#Testing = ['Kpna1','Brox']
#print(TF)
#genes = TF_gene_sets.loc[TF, 'Genes']
#print(type(genes))
#type(Testing)
#type(TFgenes)

#Values = dataset.loc[TFgenes,:]
#Values





#res = my_pca(TF_data)
#genes1
#print(genes)
#TF = TF_gene_sets.index[1]


#print(test)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """
  raw_cell, store_history, silent, shell_futures)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """
  raw_cell, store_history, silent, shell_futures)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """
  raw_cell, store_history, silent, shell_futures)
Passing list-likes to .loc or [] with any m

Unnamed: 0_level_0,Unnamed: 1_level_0,Acaa2,Acss2,Actb,Adnp,Aebp2,Aff3,Aff4,Ahcy,Ahr,Aicda,Aire
organ,dev_stage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
liver,-5.5,,,,,,,,,,,
liver,-5.5,,,,,,,,,,,
brain,-5.5,,,,,,,,,,,
brain,-5.5,,,,,,,,,,,
liver,-2.5,,,,,,,,,,,
liver,-2.5,,,,,,,,,,,
brain,-2.5,,,,,,,,,,,
brain,-2.5,,,,,,,,,,,
liver,0.5,,,,,,,,,,,
liver,0.5,,,,,,,,,,,


In [25]:
#Test PCA for a single TF using data from liver samples
TF_gene_set = pd.read_csv('../data/Transfactors/Acaa2.10.tsv', sep='\t')
genesIndex = TF_gene_set.set_index('Target_genes')
genes = genesIndex.index.intersection(expdata.index)
TFdata = expdata.loc[genes,'liver']
res = my_pca(TFdata,2)
res


Unnamed: 0_level_0,1,2
dev_stage,Unnamed: 1_level_1,Unnamed: 2_level_1
embryonic day 15.5,25.448702,15.709511
embryonic day 15.5,-19.654379,14.333832
embryonic day 18.5,17.343544,6.495601
embryonic day 18.5,7.976795,6.943702
postnatal day 0.5,3.516864,4.05128
postnatal day 0.5,-19.682575,7.426887
postnatal day 4,46.258553,-14.940903
postnatal day 4,-2.45724,1.226796
postnatal day 22,-8.101159,-14.760299
postnatal day 22,-12.191108,-11.442672


In [24]:
#Test PCA for a single TF using data from brain samples
TF_gene_set = pd.read_csv('../data/Transfactors/Acaa2.10.tsv', sep='\t')
genesIndex = TF_gene_set.set_index('Target_genes')
genes = genesIndex.index.intersection(expdata.index)
TFdata = expdata.loc[genes,'brain']
res = my_pca(TFdata)
res

Unnamed: 0_level_0,1
dev_stage,Unnamed: 1_level_1
embryonic day 15.5,-5.629221
embryonic day 15.5,-26.383718
embryonic day 18.5,-18.505471
embryonic day 18.5,7.070578
postnatal day 0.5,-13.636643
postnatal day 0.5,-5.345545
postnatal day 4,4.088137
postnatal day 4,-13.266394
postnatal day 22,2.345806
postnatal day 22,19.251184


In [109]:
#Read csv with raw data with ensembl IDs replaced with gene symbols. If not available, create such a csv using mygene package
#try:
#    datasetsym = pd.read_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t', index_col='Gene')
#except FileNotFoundError:
#    dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#    datasetsym = dataset[:]
#    genes = datasetsym.index
#    genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#    genesyms = genesyms.dropna(how='any')
#    genesyms = genesyms.drop_duplicates(subset='Gene')
#    datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#    datasetsym = datasetsym.set_index('Gene')
#    datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')

#Attempt at finding gene symbols for each gene and set it as index.
#dataset = pd.read_csv('../data/genes.raw.htseq2.tsv', sep='\t', index_col='Gene') 
#datasetsym = dataset[:]
#genes = datasetsym.index
#genesyms = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', as_dataframe=True)
#genesyms = genesyms.dropna(subset=['symbol'], how='any')
#print(genesyms.shape)

#print(genesyms.duplicated())
#genesyms = genesyms.drop_duplicates(subset='Gene', keep=False)
#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')

#datasetsym.loc[:,'Gene'] = genesyms.loc[:,'symbol']
#datasetsym = datasetsym.dropna(how='any')
#datasetsym = datasetsym.set_index('Gene')
#datasetsym.to_csv('../data/genes.raw.htseq2.symbols.tsv', sep='\t')


In [116]:
#Testing things to solve problem with the gene list being a string
TF_gene_list = []
TF_gene_test = pd.read_csv('../data/Transfactors/Acaa2.10.tsv', sep='\t') #For a specific TF, read csv from url as a dataframe
genes = TF_gene_test['Target_genes'].tolist() #Take the contents of column 'Target_genes' and puts it into a list
TF_gene_list.append(genes) #Append the list for a specific TF to list with all TFs

type(genes)
genes
TF_gene_list
type(TF_gene_list[0])
TF_gene_sets = pd.DataFrame({'Genes':TF_gene_list})#Create a dataframe from the list of TFs and their target genes
X = TF_gene_sets.loc[:,'Genes']
print(X)
why = pd.read_csv('../data/Transfactors/TF_gene_sets.csv', index_col = 'Antigen')
why2 = list(why.iloc[0,0])
type(why2)
len(why2)

0    [Cldn34d, Brox, Aida, Tmem125, Greb1, Txndc11,...
Name: Genes, dtype: object


6205

In [131]:
#Display the expression table (pandas abbreviates the rendering of large frames)
dataset

Unnamed: 0_level_0,do1474,do1647,do2174,do2183,do1473,do1653,do2175,do2184,do1650,do2191,...,do2177,do2186,do2170,do2180,do2187,do2189,do2171,do2173,do2188,do2190
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gnai3,4800,1646,2510,1424,4779,3478,1694,2512,2481,1138,...,2429,1502,2125,1974,1153,1409,1543,1470,1937,856
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cdc45,984,337,163,108,549,490,63,90,420,202,...,99,123,98,110,41,40,27,32,73,29
H19,77631,31553,4113,1862,182270,140922,1289,3352,79202,72111,...,403,996,2219,2385,34,73,151,71,47,29
Scml2,72,25,85,54,48,38,63,117,34,17,...,92,63,17,5,13,17,6,2,26,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vmn1r-ps47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm22394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vmn1r-ps147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
