In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery as bq

In [2]:
def _read_gene_set(path):
    """Read ``path`` and return its lines as a set.

    Only the trailing newline is stripped from each line, so any other
    whitespace is preserved. The file is opened in a ``with`` block so the
    handle is closed as soon as the lines are consumed, rather than being
    left open for the garbage collector.

    Parameters
    ----------
    path : str
        Path of the text file to read, one entry per line.

    Returns
    -------
    set of str
        The distinct lines of the file.
    """
    with open(path) as handle:
        # NOTE: a blank line in the file ends up as an empty-string entry.
        return {line.rstrip('\n') for line in handle}


# One set per input list; the set names mirror the file-name suffixes.
cancer_gene = _read_gene_set("./preliminary/sig-DMD.cancer.genes")
trip_gene = _read_gene_set("./preliminary/sig-DMD.trip.genes")
nontrip_gene = _read_gene_set("./preliminary/sig-DMD.nontrip.genes")
err_gene = _read_gene_set("./preliminary/sig-DMD.err.genes")

In [3]:
# Pool the four gene sets into one; duplicates collapse because these are sets.
union_gene = cancer_gene | trip_gene | nontrip_gene | err_gene
len(union_gene)

3518

In [4]:
# One row per pooled symbol, in sorted order, held in a single column.
gene_df = pd.DataFrame({'gene_symbol': sorted(union_gene)})
print(gene_df.shape)
gene_df.head()

(3518, 1)


Unnamed: 0,gene_symbol
0,A1CF
1,AACS
2,AARS2
3,AASS
4,AATF


In [5]:
# Add one "Yes"/"No" column per gene set, flagging whether each symbol is a
# member of that set. Dict order fixes the order in which columns are added.
signature_sets = {
    'BRCA': cancer_gene,
    'TN-BRCA': trip_gene,
    'nonTN-BRCA': nontrip_gene,
    'ERR-BRCA': err_gene,
}
yes_no = {True: "Yes", False: "No"}
for column, members in signature_sets.items():
    gene_df[column] = gene_df['gene_symbol'].isin(members).map(yes_no)

In [6]:
# Preview the first five rows, now including the membership flag columns.
gene_df.head(n=5)

Unnamed: 0,gene_symbol,BRCA,TN-BRCA,nonTN-BRCA,ERR-BRCA
0,A1CF,Yes,No,No,Yes
1,AACS,Yes,No,No,No
2,AARS2,Yes,No,No,No
3,AASS,Yes,Yes,Yes,No
4,AATF,Yes,No,No,No


In [7]:
# Boolean view of the flag columns: True wherever the cell is "Yes".
is_flagged = gene_df[['BRCA', 'TN-BRCA', 'nonTN-BRCA', 'ERR-BRCA']].eq("Yes")

# A gene counts as subtype-specific when at least one of the three subtype
# columns is flagged; the endocrine-resistance count uses its own column only.
subtype_count = int(is_flagged[['BRCA', 'TN-BRCA', 'nonTN-BRCA']].any(axis=1).sum())
err_count = int(is_flagged['ERR-BRCA'].sum())

print("Number of genes associated with subtype-specific signatures:")
print(subtype_count)
print("Number of genes associated with endocrine resistance signatures:")
print(err_count)

Number of genes associated with subtype-specific signatures:
3470
Number of genes associated with endocrine resistance signatures:
229


In [8]:
# Write the annotated table as tab-separated text, without the row index.
output_path = "./metadata/associated_genes_list.tsv"
gene_df.to_csv(output_path, sep="\t", index=False)