## Obtain exon coordinates corresponding to the 65 CH genes

### CH gene list
IntOGen-DriverGenes-CH.tsv - Obtained from www.intogen.org/ch/ (2021-02-24)

### Gene annotations (Canonical transcripts)

**1. Data from Biomart obtained from file from Mónica: ensembl_canonical_transcripts.tsv (2021-02-24)**

ensembl_canonical_transcripts_genes.txt - Obtained from Biomart (Ensembl Genes 103 & GRCh38.p13) using previous file as Filter (2021-02-25)

**2. Data from cluster (Claudia)**
/workspace/projects/genomic_regions/hg38/release_2/cds_with_stop_codon/hg38_cds_with_stop_codon.canonical.overlap.gz

Ensembl v101 & Gencode v35 coordenates


In [1]:
import numpy as np
import pandas as pd

### READ DATA

In [2]:
# Obtain CH genes data

ch_genes_df = pd.read_csv("/home/sdemajo/CH_gnomad_2021_02/data/IntOGen-DriverGenes-CH.tsv", sep="\t")
ch_genes_df

Unnamed: 0,Symbol,Mutations,Samples,Samples (%),Cohorts
0,DNMT3A,3343,3005,15.64,3
1,TET2,1011,925,4.81,3
2,PPM1D,776,683,3.56,3
3,ASXL1,438,433,2.25,3
4,ATM,352,337,1.75,3
...,...,...,...,...,...
60,KDM5C,109,7,0.04,1
61,MYO5A,96,7,0.04,1
62,ERF,27,6,0.03,1
63,CUX1,76,5,0.03,1


In [3]:
## Obtain exons hg38 1 (Biomart)

exons1 = pd.read_csv("/home/sdemajo/CH_gnomad_2021_02/data/ensembl_canonical_transcripts_exons.txt", sep="\t")
exons1

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Gene name,Exon region start (bp),Exon region end (bp),Exon rank in transcript,Exon stable ID
0,ENSG00000198888,ENSG00000198888.2,ENST00000361390,ENST00000361390.2,MT,3307,4262,MT-ND1,3307,4262,1,ENSE00001435714
1,ENSG00000198763,ENSG00000198763.3,ENST00000361453,ENST00000361453.3,MT,4470,5511,MT-ND2,4470,5511,1,ENSE00001435686
2,ENSG00000198804,ENSG00000198804.2,ENST00000361624,ENST00000361624.2,MT,5904,7445,MT-CO1,5904,7445,1,ENSE00001435647
3,ENSG00000198712,ENSG00000198712.1,ENST00000361739,ENST00000361739.1,MT,7586,8269,MT-CO2,7586,8269,1,ENSE00001435613
4,ENSG00000228253,ENSG00000228253.1,ENST00000361851,ENST00000361851.1,MT,8366,8572,MT-ATP8,8366,8572,1,ENSE00001435286
...,...,...,...,...,...,...,...,...,...,...,...,...
220303,ENSG00000171163,ENSG00000171163.16,ENST00000451251,ENST00000451251.5,1,248850006,248859144,ZNF692,248858131,248858321,2,ENSE00003615063
220304,ENSG00000171163,ENSG00000171163.16,ENST00000451251,ENST00000451251.5,1,248850006,248859144,ZNF692,248850066,248850516,12,ENSE00001648925
220305,ENSG00000185220,ENSG00000185220.12,ENST00000329291,ENST00000329291.6,1,248906196,248919946,PGBD2,248906235,248906342,1,ENSE00001637224
220306,ENSG00000185220,ENSG00000185220.12,ENST00000329291,ENST00000329291.6,1,248906196,248919946,PGBD2,248913816,248913879,2,ENSE00003610993


In [4]:
## Obtain exons hg38 2 (Cluster data)

exons2 = pd.read_csv("/workspace/projects/genomic_regions/hg38/release_2/cds_with_stop_codon/hg38_cds_with_stop_codon.canonical.overlap.gz", sep="\t")
exons2

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,CHROMOSOME,START,END,STRAND,GENE_ID,TRANSCRIPT_ID,SYMBOL
0,1,69091,70008,+,ENSG00000186092,ENST00000335137,OR4F5
1,1,450740,451678,-,ENSG00000284733,ENST00000426406,OR4F29
2,1,685716,686654,-,ENSG00000284662,ENST00000332831,OR4F16
3,1,925942,926013,+,ENSG00000187634,ENST00000342066,SAMD11
4,1,930155,930336,+,ENSG00000187634,ENST00000342066,SAMD11
...,...,...,...,...,...,...,...
193025,Y,25038809,25038914,-,ENSG00000185894,ENST00000382287,BPY2C
193026,Y,25041769,25041886,-,ENSG00000185894,ENST00000382287,BPY2C
193027,Y,25043946,25044023,-,ENSG00000185894,ENST00000382287,BPY2C
193028,Y,25622443,25624034,+,ENSG00000172288,ENST00000306609,CDY1


### OBTAIN EXONS FROM CH GENES

**1. Prepare data**

In [5]:
# Obtain list of CH genes

ch_genes = ch_genes_df["Symbol"]
ch_genes

0     DNMT3A
1       TET2
2      PPM1D
3      ASXL1
4        ATM
       ...  
60     KDM5C
61     MYO5A
62       ERF
63      CUX1
64    CDKN1B
Name: Symbol, Length: 65, dtype: object

In [6]:
# Obtain reduced exons 1 data frame
exons1_s = exons1.iloc[:, [0,2,4,7,8,9]]
exons1_s

Unnamed: 0,Gene stable ID,Transcript stable ID,Chromosome/scaffold name,Gene name,Exon region start (bp),Exon region end (bp)
0,ENSG00000198888,ENST00000361390,MT,MT-ND1,3307,4262
1,ENSG00000198763,ENST00000361453,MT,MT-ND2,4470,5511
2,ENSG00000198804,ENST00000361624,MT,MT-CO1,5904,7445
3,ENSG00000198712,ENST00000361739,MT,MT-CO2,7586,8269
4,ENSG00000228253,ENST00000361851,MT,MT-ATP8,8366,8572
...,...,...,...,...,...,...
220303,ENSG00000171163,ENST00000451251,1,ZNF692,248858131,248858321
220304,ENSG00000171163,ENST00000451251,1,ZNF692,248850066,248850516
220305,ENSG00000185220,ENST00000329291,1,PGBD2,248906235,248906342
220306,ENSG00000185220,ENST00000329291,1,PGBD2,248913816,248913879


**2. Get exons CH genes Biomart**

In [7]:
# Obtain exons from 65 genes

ch_genes_exons1 = exons1_s[exons1_s["Gene name"].isin(ch_genes)]

ch_genes_exons1

Unnamed: 0,Gene stable ID,Transcript stable ID,Chromosome/scaffold name,Gene name,Exon region start (bp),Exon region end (bp)
15155,ENSG00000282932,ENST00000634556,CHR_HSCHR9_1_CTG6,PTPRD,10033718,10033790
15156,ENSG00000282932,ENST00000634556,CHR_HSCHR9_1_CTG6,PTPRD,9938507,9938610
15157,ENSG00000282932,ENST00000634556,CHR_HSCHR9_1_CTG6,PTPRD,9766810,9766851
15158,ENSG00000282932,ENST00000634556,CHR_HSCHR9_1_CTG6,PTPRD,9734533,9734571
15159,ENSG00000282932,ENST00000634556,CHR_HSCHR9_1_CTG6,PTPRD,9574732,9574781
...,...,...,...,...,...,...
217167,ENSG00000198625,ENST00000367182,1,MDM4,204538209,204538308
217168,ENSG00000198625,ENST00000367182,1,MDM4,204542784,204542944
217169,ENSG00000198625,ENST00000367182,1,MDM4,204544535,204544684
217170,ENSG00000198625,ENST00000367182,1,MDM4,204546797,204546877


In [8]:
# Check number of genes obtained
len(ch_genes_exons1["Gene name"].unique())

62

In [9]:
# Check missing genes
ch_genes[-ch_genes.isin(ch_genes_exons1["Gene name"])]


15     LZTR1
43    NOTCH1
48      MKL1
Name: Symbol, dtype: object

In [20]:
# Save

ch_genes_exons1.to_csv("/home/sdemajo/CH_gnomad_2021_02/results/CH_genes_exons_BMart.tsv", sep="\t")

**3. Get exons CH genes Cluster**

In [11]:
# Obtain exons from 65 genes

ch_genes_exons2 = exons2[exons2["SYMBOL"].isin(ch_genes)]

ch_genes_exons2

Unnamed: 0,CHROMOSOME,START,END,STRAND,GENE_ID,TRANSCRIPT_ID,SYMBOL
435,1,1787331,1787437,-,ENSG00000078369,ENST00000378609,GNB1
436,1,1789053,1789269,-,ENSG00000078369,ENST00000378609,GNB1
437,1,1790395,1790596,-,ENSG00000078369,ENST00000378609,GNB1
438,1,1793245,1793311,-,ENSG00000078369,ENST00000378609,GNB1
439,1,1804419,1804581,-,ENSG00000078369,ENST00000378609,GNB1
...,...,...,...,...,...,...,...
191001,X,124090575,124090764,+,ENSG00000101972,ENST00000371145,STAG2
191002,X,124090854,124090964,+,ENSG00000101972,ENST00000371145,STAG2
191003,X,124094018,124094144,+,ENSG00000101972,ENST00000371145,STAG2
191004,X,124095372,124095449,+,ENSG00000101972,ENST00000371145,STAG2


In [12]:
# Check number of genes obtained
len(ch_genes_exons2["SYMBOL"].unique())

64

In [13]:
# Check missing genes
ch_genes[-ch_genes.isin(ch_genes_exons2["SYMBOL"])]


48    MKL1
Name: Symbol, dtype: object

In [14]:
# Obtain genes again with MKL1 (=MRTFA)
ch_genes = ch_genes.append(pd.Series("MRTFA"))
ch_genes_exons2 = exons2[exons2["SYMBOL"].isin(ch_genes)]

ch_genes_exons2

Unnamed: 0,CHROMOSOME,START,END,STRAND,GENE_ID,TRANSCRIPT_ID,SYMBOL
435,1,1787331,1787437,-,ENSG00000078369,ENST00000378609,GNB1
436,1,1789053,1789269,-,ENSG00000078369,ENST00000378609,GNB1
437,1,1790395,1790596,-,ENSG00000078369,ENST00000378609,GNB1
438,1,1793245,1793311,-,ENSG00000078369,ENST00000378609,GNB1
439,1,1804419,1804581,-,ENSG00000078369,ENST00000378609,GNB1
...,...,...,...,...,...,...,...
191001,X,124090575,124090764,+,ENSG00000101972,ENST00000371145,STAG2
191002,X,124090854,124090964,+,ENSG00000101972,ENST00000371145,STAG2
191003,X,124094018,124094144,+,ENSG00000101972,ENST00000371145,STAG2
191004,X,124095372,124095449,+,ENSG00000101972,ENST00000371145,STAG2


In [21]:
# Check number of genes obtained
len(ch_genes_exons2["SYMBOL"].unique())

65

In [22]:
# Save

ch_genes_exons2.to_csv("/home/sdemajo/CH_gnomad_2021_02/results/CH_genes_exons_Clust.tsv", sep="\t")

### Compare results from the two sources

In [16]:
## Check exons for each gene

# BioMart data
#exonresults1 = ch_genes_exons1.groupby("Gene name").count().iloc[:,1]
exonresults1 = ch_genes_exons1['Gene name'].value_counts().to_frame()
exonresults1.columns = ["BioMart"] # Change column name
exonresults1


Unnamed: 0,BioMart
PTPRD,86
ATM,63
KMT2C,59
NF1,58
KMT2D,55
...,...
TMEM127,4
SDHAF2,4
CDKN1B,3
MYCN,3


In [17]:
# Cluster data

#exonresults2 = ch_genes_exons2.groupby("SYMBOL").count().iloc[:,0]

exonresults2 = ch_genes_exons2['SYMBOL'].value_counts().to_frame()
exonresults2.columns = ["Cluster"] # Change column name
exonresults2

Unnamed: 0,Cluster
ATM,62
KMT2C,59
NF1,58
KMT2D,54
MYO5A,41
...,...
NRAS,4
TMEM127,3
SRSF2,2
CDKN1B,2


In [18]:
# Compare both results

# Join data
exonresults12 = exonresults2.join(exonresults1)

# show all results
with pd.option_context("display.max_rows", 100):
    display(exonresults12)

Unnamed: 0,Cluster,BioMart
ATM,62,63.0
KMT2C,59,59.0
NF1,58,58.0
KMT2D,54,55.0
MYO5A,41,41.0
PTPRD,35,86.0
NOTCH1,34,
STAG2,33,35.0
KDM6A,29,29.0
KDM5C,26,26.0


In [19]:
# Get total exon number

exonresults12.sum()

Cluster    1146.0
BioMart    1171.0
dtype: float64