# TODO

- check sorting order of chromosomes throughout analysis
- there is a bug in gtfanno which makes it fail to read the appris principal score, does this influence the results?
- there is a "transcript = 0" count in the gtfanno basic stat output. is this indicative of a problem?
- double check the changes made to allow DCRD intervals enveloping the promoter interval
- check annos in IGV
- are feature coordinates really 0-based, right open?

annos to add
- motif: CG, CHH, CHG
- strand, illumina strands are not cytosine strands
- hematopoietic regions
    - cis reg atlas
    - vision
    - amit enhancers
- general regulatory regions
    - ensembl reg regions
    - chrom hmm - ask maxi again what he had in mind here - forgot which resource he mentioned
- tfbs

# Setup

## Resource parameters

In [22]:
n_cores = 12

## Imports

In [23]:
# isort: off
import os

# Cap the thread pools of the numerical backends at the configured core count.
# These variables need to be set prior to the numpy import further down,
# which is why this code lives inside the `isort: off` region.
num_threads = str(n_cores)

os.environ.update(
    {
        "OMP_NUM_THREADS": num_threads,
        "OPENBLAS_NUM_THREADS": num_threads,
        "MKL_NUM_THREADS": num_threads,
        "VECLIB_MAXIMUM_THREADS": num_threads,
        "NUMEXPR_NUM_THREADS": num_threads,
    }
)

import numpy as np

# isort: on

import subprocess
import tempfile

import gtfanno as ga
import matplotlib.pyplot as plt
import pandas as pd
import pyranges as pr
from IPython.display import display

import mouse_hema_meth.utils as ut

In [24]:
%matplotlib inline

In [25]:
import mouse_hema_meth.methylome.annotation.epic_array_probe_annotation_lib as lib

## Rerun flags

In [26]:
recompute = True

## Dtypes

In [63]:
# Ordered chromosome dtype with 'chr' prefix: autosomes 1-19 in numeric order,
# followed by X, Y and MT. Sorting a column of this dtype follows this order
# instead of the alphabetic order of the labels.
chrom_dtype_prefixed = pd.api.types.CategoricalDtype(
    categories=[f"chr{label}" for label in [*range(1, 20), "X", "Y", "MT"]],
    ordered=True,
)

# Paths

## Project paths

In [28]:
project_dir = (
    "/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis"
)

In [29]:
# Scratch directory inside the project directory; its path is handed to
# gtfanno as `tmpdir` in the annotation step.
# NOTE: tempfile.TemporaryDirectory deletes the directory when the object is
# cleaned up or garbage collected, so temp_dir_obj must stay referenced for
# as long as temp_dir_name is in use.
temp_dir_obj = tempfile.TemporaryDirectory(dir=project_dir)
temp_dir_name = temp_dir_obj.name
temp_dir_name

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/tmpija_b2vj'

## Gencode

In [131]:
gencode_download_url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz"

unmodified gencode gtf (note: with chr prefix)

In [132]:
gencode_gtf = project_dir + "/gencode.vM25.annotation.gtf.gz"
gencode_gtf

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/gencode.vM25.annotation.gtf.gz'

gencode filtered for principal transcripts of protein coding genes; note that the chromosome prefix ('chr') is removed in this file

In [133]:
gencode_coding_canonical_gtf = (
    project_dir + "/gencode.vM25.annotation_coding_canonical.gtf.gz"
)
gencode_coding_canonical_gtf

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/gencode.vM25.annotation_coding_canonical.gtf.gz'

## Probes

path to original probe file

In [33]:
original_probes_bed = project_dir + "/2021-03-22_mmbc_probes.bed"

In [34]:
!head {original_probes_bed}

chr1	3102470	3102471	cg36603287_TC21
chr1	3199334	3199335	cg36603791_TC21
chr1	3216335	3216336	cg36603848_TC21
chr1	3253527	3253528	cg36604001_TC21
chr1	3353526	3353527	cg36604489_TC21
chr1	3367941	3367942	cg36604536_TC21
chr1	3469621	3469622	cg36604958_TC21
chr1	3482624	3482625	cg36605010_TC21
chr1	3531911	3531912	cg36605223_TC21
chr1	3650975	3650976	cg36605802_TC21


path to reformatted probe file

In [35]:
reformatted_probes_bed = project_dir + "/2021-03-22_mmbc_probes_reformatted.bed"

full Illumina probe file

In [36]:
illumina_probes_url = "https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/mouse-methylation/Infinium%20Mouse%20Methylation%20v1.0%20A1%20GS%20Manifest%20File.csv"
illumina_probes_csv = (
    project_dir
    + "/Infinium_20Mouse_20Methylation_20v1.0_20A1_20GS_20Manifest_20File.csv"
)
illumina_probes_csv

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/Infinium_20Mouse_20Methylation_20v1.0_20A1_20GS_20Manifest_20File.csv'

full Illumina probe file BED coordinates

In [78]:
illumina_coordinate_bed = project_dir + "/illumina-all-probes.bed"
illumina_coordinate_bed

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/illumina-all-probes.bed'

## Gene annotation results

### gtfanno results

In [37]:
custom_intervals_results_dir = project_dir + "/custom-intervals_1500-500"
os.makedirs(custom_intervals_results_dir, exist_ok=True)

In [38]:
custom_intervals_trunk_path = (
    custom_intervals_results_dir + "/custom-intervals_1500-500"
)
custom_intervals_trunk_path

'/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/custom-intervals_1500-500/custom-intervals_1500-500'

In [39]:
# Result files derived from the gtfanno trunk path: primary and all
# annotations, each as a .bed and as a .p file (the .p file is read back
# with pd.read_pickle in the inspection step).
custom_intervals_results_paths_d = {
    result_key: custom_intervals_trunk_path + file_suffix
    for result_key, file_suffix in (
        ("primary_annos_bed", "_primary-annotations.bed"),
        ("primary_annos_p", "_primary-annotations.p"),
        ("all_annos_bed", "_all-annotations.bed"),
        ("all_annos_p", "_all-annotations.p"),
    )
}
custom_intervals_results_paths_d

{'primary_annos_bed': '/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/custom-intervals_1500-500/custom-intervals_1500-500_primary-annotations.bed',
 'primary_annos_p': '/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/custom-intervals_1500-500/custom-intervals_1500-500_primary-annotations.p',
 'all_annos_bed': '/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/custom-intervals_1500-500/custom-intervals_1500-500_all-annotations.bed',
 'all_annos_p': '/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/custom-intervals_1500-500/custom-intervals_1500-500_all-annotations.p'}

### Final tables

In [185]:
gene_annos_primary_one_row = project_dir + "/gene-annos_primary_one-row.bed"
print(gene_annos_primary_one_row)
gene_annos_primary_multi_row = project_dir + "/gene-annos_primary_multi-row.bed"
print(gene_annos_primary_multi_row)

/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/gene-annos_primary_one-row.bed
/icgc/dkfzlsdf/analysis/B080/kraemers/projects/epic-arrays-for-hematopoiesis/gene-annos_primary_multi-row.bed


# Analysis

## Prepare input data

### CpG island annos

In [40]:
import mouse_hema_meth.genome_annotations.get_genome_annos_paths as get_genome_annos_paths

cpg_islands_pickle_d = get_genome_annos_paths.cpg_islands_shores_shelves_pickle_paths_d

### Prepare gene annotation

#### download gencode

In [135]:
# (Re-)download the GENCODE GTF only when recomputation is requested.
# check=True turns a failing wget into a CalledProcessError.
if recompute:
    _wget_gencode_cmd = ["wget", "-O", gencode_gtf, gencode_download_url]
    subprocess.run(_wget_gencode_cmd, check=True)

In [137]:
!zcat {gencode_gtf} | head -n 6

##description: evidence-based annotation of the mouse genome (GRCm38), version M25 (Ensembl 100)
##provider: GENCODE
##contact: gencode-help@ebi.ac.uk
##format: gtf
##date: 2020-03-24
chr1	HAVANA	gene	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; mgi_id "MGI:1918292"; havana_gene "OTTMUSG00000049935.1";

gzip: stdout: Broken pipe


#### Filter and reformat gencode GTF

- restrict to canonical transcripts
- restrict to coding transcripts
- remove chr prefix
- change M to MT

In [138]:
# Load the complete GENCODE GTF; the result is used as a plain DataFrame below
# (boolean masks, .loc) and only converted to PyRanges before writing.
# NOTE(review): duplicate_attr=True presumably keeps attributes that occur
# several times per record (e.g. `tag`) as one comma-joined string, cf.
# "basic,appris_principal_1,CCDS" in the preview further down - confirm
# against the pyranges documentation.
gencode_df = pr.read_gtf(gencode_gtf, as_df=True, duplicate_attr=True)

In [139]:
# Pull the APPRIS principal level (the digit in "appris_principal_<n>") out of
# the tag string. Records without such a tag yield NaN, hence the float dtype.
_appris_level_str = gencode_df["tag"].str.extract(
    r"appris_principal_(\d)", expand=False
)
appris_principal_score = _appris_level_str.astype(float)

In [140]:
appris_principal_score.value_counts()

1.00    484929
2.00     88170
3.00     69849
4.00     15559
5.00      8279
Name: tag, dtype: int64

In [141]:
appris_principal_score.isnull().sum()

1205266

In [142]:
appris_principal_score.notnull().sum()

666786

In [143]:
# A record counts as "principal" as soon as it carries ANY appris_principal_<n>
# tag (levels 1-5 in the value counts above); the level itself is not used to
# rank or filter transcripts here.
is_principal_transcript = appris_principal_score.notnull()

In [144]:
is_protein_coding = gencode_df["gene_type"].eq("protein_coding")

In [145]:
gencode_df_coding_canonical = gencode_df.loc[
    is_principal_transcript & is_protein_coding
].copy()

In [146]:
gencode_df_coding_canonical.head(3)

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont
13,chr1,HAVANA,transcript,3214481,3671498,.,-,.,ENSMUSG00000051951.5,protein_coding,...,protein_coding,Xkr4-201,1,"basic,appris_principal_1,CCDS",OTTMUST00000065166.1,,,ENSMUSP00000070648.4,CCDS14803.1,
14,chr1,HAVANA,exon,3670551,3671498,.,-,.,ENSMUSG00000051951.5,protein_coding,...,protein_coding,Xkr4-201,1,"basic,appris_principal_1,CCDS",OTTMUST00000065166.1,1.0,ENSMUSE00000485541.3,ENSMUSP00000070648.4,CCDS14803.1,
15,chr1,HAVANA,CDS,3670551,3671348,.,-,0,ENSMUSG00000051951.5,protein_coding,...,protein_coding,Xkr4-201,1,"basic,appris_principal_1,CCDS",OTTMUST00000065166.1,1.0,ENSMUSE00000485541.3,ENSMUSP00000070648.4,CCDS14803.1,


In [147]:
gencode_df_coding_canonical.shape

(663082, 25)

In [150]:
# Chromosome naming without prefix: strip "chr", then rename the
# mitochondrial chromosome from "M" to "MT" (exact-value replacement).
_chrom_without_prefix = gencode_df_coding_canonical["Chromosome"].str.replace(
    "chr", ""
)
gencode_df_coding_canonical["Chromosome"] = _chrom_without_prefix.replace("M", "MT")

In [151]:
gencode_pr = pr.PyRanges(gencode_df_coding_canonical)
gencode_pr.df.Chromosome.unique()

['1', '2', '3', '4', '5', ..., '18', '19', 'MT', 'X', 'Y']
Length: 22
Categories (22, object): ['1', '2', '3', '4', ..., '19', 'MT', 'X', 'Y']

In [152]:
gencode_pr.to_gtf(gencode_coding_canonical_gtf)

In [153]:
!zcat {gencode_coding_canonical_gtf} | head

1	HAVANA	transcript	4807823	4846739	.	+	.	gene_id "ENSMUSG00000025903.14"; gene_type "protein_coding"; gene_name "Lypla1"; level "2"; mgi_id "MGI:1344588"; havana_gene "OTTMUSG00000021562.4"; transcript_id "ENSMUST00000027036.10"; transcript_type "protein_coding"; transcript_name "Lypla1-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTMUST00000051162.1"; protein_id "ENSMUSP00000027036.4"; ccdsid "CCDS14806.1";
1	HAVANA	exon	4807823	4807982	.	+	.	gene_id "ENSMUSG00000025903.14"; gene_type "protein_coding"; gene_name "Lypla1"; level "2"; mgi_id "MGI:1344588"; havana_gene "OTTMUSG00000021562.4"; transcript_id "ENSMUST00000027036.10"; transcript_type "protein_coding"; transcript_name "Lypla1-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTMUST00000051162.1"; exon_number "1"; exon_id "ENSMUSE00000792454.1"; protein_id "ENSMUSP00000027036.4"; ccdsid "CCDS14806.1";
1	HAVANA	CDS	4807914	4807982	.	+	0	

verify gtf

In [154]:
!zcat {gencode_coding_canonical_gtf} | grep ^protein_coding





































































































































In [155]:
!zcat {gencode_coding_canonical_gtf} | grep ^appris

















































































































































































































































### Prepare and inspect probes files

#### Probe file from Maxi

##### Inspect original probes file

- file has duplicates
- file is not fully sorted

###### General overview

In [43]:
!head -n 3 {original_probes_bed}

chr1	3102470	3102471	cg36603287_TC21
chr1	3199334	3199335	cg36603791_TC21
chr1	3216335	3216336	cg36603848_TC21


In [44]:
!cut -f 1 < {original_probes_bed} | uniq

chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chrX
chrY


In [46]:
# Read the headerless 4-column probe BED file.
original_probes_df = pd.read_csv(
    original_probes_bed,
    sep="\t",
    header=None,
    names=["Chromosome", "Start", "End", "name"],
)
# Ordered categorical whose category order is the order of first appearance
# of each chromosome in the file.
_chrom_order_in_file = original_probes_df["Chromosome"].unique()
original_probes_df["Chromosome"] = pd.Categorical(
    original_probes_df["Chromosome"],
    categories=_chrom_order_in_file,
    ordered=True,
)
original_probes_df

Unnamed: 0,Chromosome,Start,End,name
0,chr1,3102470,3102471,cg36603287_TC21
1,chr1,3199334,3199335,cg36603791_TC21
2,chr1,3216335,3216336,cg36603848_TC21
3,chr1,3253527,3253528,cg36604001_TC21
4,chr1,3353526,3353527,cg36604489_TC21
...,...,...,...,...
262159,chrY,87877338,87877339,cg48338431_BC11
262160,chrY,88188140,88188141,cg48338916_BC11
262161,chrY,90739047,90739048,cg48343391_BC11
262162,chrY,90805372,90805373,cg48344082_BC11


###### File is not fully sorted

**Note that the original probes df is not completely sorted on Start/End**

In [66]:
# Sort by chromosome (ordered categorical) and position; ignore_index=True
# gives the result a fresh 0..n-1 index.
original_probes_df_sorted = original_probes_df.sort_values(
    by=["Chromosome", "Start", "End"], ignore_index=True
)
original_probes_df_sorted

Unnamed: 0,Chromosome,Start,End,name
0,chr1,3035832,3035833,cg36602902_BC11
1,chr1,3102470,3102471,cg36603287_TC21
2,chr1,3121638,3121639,cg36603393_BC21
3,chr1,3199334,3199335,cg36603791_TC21
4,chr1,3216335,3216336,cg36603848_TC21
...,...,...,...,...
262159,chrY,90805372,90805373,cg48344082_BC11
262160,chrY,90806785,90806786,cg48344102_TC21
262161,chrY,90808023,90808024,cg48344138_TC21
262162,chrY,90808869,90808870,cg48344145_TC21


In [186]:
original_probes_df_sorted.Chromosome.dtype

CategoricalDtype(categories=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                  'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14',
                  'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chrX', 'chrY'],
, ordered=True)

###### Several probes are present with the same coordinates, but different names

In [48]:
# All rows whose coordinates occur more than once (keep=False marks every
# member of a duplicate group, not only the repeats).
_shares_coordinates = original_probes_df.duplicated(
    ["Chromosome", "Start", "End"], keep=False
)
original_probes_df.loc[_shares_coordinates]

Unnamed: 0,Chromosome,Start,End,name
528,chr1,24417083,24417084,cg36730985_TC11
529,chr1,24417083,24417084,cg36730985_TC12
806,chr1,36104057,36104058,cg36801891_TC11
807,chr1,36104057,36104058,cg36801891_TO11
2960,chr1,82925910,82925911,cg37153974_TC11
...,...,...,...,...
259982,chrX,132046592,132046593,cg48007966_BC12
260591,chrX,140664692,140664693,cg48058116_BC11
260592,chrX,140664692,140664693,cg48058116_BC12
261408,chrX,159641362,159641363,cg48151117_BC11


In [49]:
# Rows that are duplicated in coordinates AND name.
_is_exact_duplicate = original_probes_df.duplicated(
    ["Chromosome", "Start", "End", "name"], keep=False
)
original_probes_df.loc[_is_exact_duplicate]

Unnamed: 0,Chromosome,Start,End,name


##### Reformat probes file

- need to resort
- need to remove chr prefix
- drop duplicates

In [18]:
# Coordinates only, 'chr' prefix removed, duplicates dropped. The chromosome
# column holds plain strings after the prefix removal, so the sort is
# alphabetic on chromosome.
_coordinate_cols = ["Chromosome", "Start", "End"]
probes_df_no_prefix_sorted = (
    original_probes_df.assign(
        Chromosome=lambda probes: probes["Chromosome"].str.replace("chr", ""),
    )
    .loc[:, _coordinate_cols]
    .drop_duplicates()
    .sort_values(_coordinate_cols)
    .reset_index(drop=True)
)

In [105]:
probes_df_no_prefix_sorted.to_csv(
    reformatted_probes_bed, sep="\t", header=False, index=False
)

In [106]:
!head {reformatted_probes_bed}

1	3035832	3035833
1	3102470	3102471
1	3121638	3121639
1	3199334	3199335
1	3216335	3216336
1	3253527	3253528
1	3353526	3353527
1	3367941	3367942
1	3469621	3469622
1	3482624	3482625


#### Illumina probe file

##### Schema

- MFG_CHANGE probes have a problem
- there may be one row separating assay probes from controls somewhere in the dataframe? (info from Maxi)

##### Download

In [148]:
# (Re-)download the Illumina manifest only when recomputation is requested.
# check=True turns a failing wget into a CalledProcessError.
if recompute:
    _wget_manifest_cmd = ["wget", "-O", illumina_probes_csv, illumina_probes_url]
    subprocess.run(_wget_manifest_cmd, check=True)

In [149]:
!head {illumina_probes_csv}

Illumina, Inc.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
[Heading],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Descriptor File Name,MouseMethylation-12v1-0_A1.csv,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Assay Format,Infinium 2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Date Manufactured,10/15/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Loci Count,287050,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
[Assay],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
IlmnID,Name,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,Infinium_Design_Type,Next_Base,Color_Channel,Forward_Sequence,Top_Sequence,Genome_Build,Genome_Build_NCBI,CHR,MAPINFO,SourceSeq,Strand,Strand_TB,Strand_CO,N_Shelf,N_Shore,CpG_Island,CpG_Island_chrom,CpG_Island_chromStart,CpG_Island_chromEnd,CpG_Island_length,CpG_Island_cpgNum,CpG_Island_gcNum,CpG_Island_perCpg,CpG_Island_perGc,CpG_Island_obsExp,S_Shore,S_Shelf,MFG_Change_Flagged
cg00101675_BC21,cg00101675,91684905,TTCCTAAAAAAACTTCAACAATACTATATCATTTTTAACTTCATAAACAC,,,II,,,AGCCTGAACTGTTCCTAAAGAAACTTCAGCAGTGCTGTGTCATTTTTGACTTCATGGACA[CG]CTATCTGA

##### Get curated BED intervals for probes

In [70]:
# Read the Illumina manifest. The first 7 lines are preamble
# ("Illumina, Inc." ... "[Assay]", see the `head` output above), so the
# column header is the 8th line of the file.
illumina_probes = pd.read_csv(
    illumina_probes_csv,
    skiprows=7,
    dtype={
        "AddressA_ID": str,
        # chromosome labels mix numbers with X, Y and MT -> keep as strings
        "CHR": str,
        "MFG_Change_Flagged": "boolean",
        # nullable integer dtype
        # NOTE(review): presumably required because rows without a chromosome
        # also lack a position - confirm
        "MAPINFO": "Int64",
    },
)

Fields, drop fields with longish sequence strings for display

In [71]:
illumina_probes.drop(["Forward_Sequence", "Top_Sequence"], axis=1).iloc[0].to_frame()

Unnamed: 0,0
IlmnID,cg00101675_BC21
Name,cg00101675
AddressA_ID,91684905
AlleleA_ProbeSeq,TTCCTAAAAAAACTTCAACAATACTATATCATTTTTAACTTCATAAACAC
AddressB_ID,
AlleleB_ProbeSeq,
Infinium_Design_Type,II
Next_Base,
Color_Channel,
Genome_Build,mm10


There are NaN chromosome entries, and also some entries for chromosome 0 (just 410), so I assume these can be discarded as controls or something similar

In [72]:
illumina_probes.CHR.value_counts()

2     21675
11    18941
7     18875
4     18151
5     17173
1     17077
X     15147
9     14907
10    14824
6     14427
8     13998
3     13511
17    13055
13    11861
15    11458
12    11130
14    10532
16     9093
18     8596
19     8508
Y      3665
0       410
MT       36
Name: CHR, dtype: int64

checked manually in my index files: 1-based Start info is in MAPINFO

- for comparability with Maxis probes, also add 'chr' prefix and make Categorical
- provide BED interval for cytosine

In [94]:
# Build one BED interval per probe, covering the single cytosine.
# MAPINFO holds the 1-based position, so the interval is
# Start = MAPINFO - 1, End = MAPINFO. Both coordinates are derived directly
# from MAPINFO so that the result does not depend on the order in which
# `assign` evaluates its keyword arguments.
illumina_probes_curated_chrom_defined = (
    illumina_probes[["CHR", "MAPINFO", "IlmnID"]]
    # discard probes without a chromosome and the 'chromosome 0' entries
    .loc[lambda df: df["CHR"].notnull() & df["CHR"].ne("0")]
    .assign(
        # 'chr' prefix + ordered categorical for comparability with Maxi's probes
        Chromosome=lambda df: ("chr" + df["CHR"]).astype(chrom_dtype_prefixed),
        Start=lambda df: df["MAPINFO"] - 1,
        End=lambda df: df["MAPINFO"],
    )
    .rename(columns={"IlmnID": "name"})
    .sort_values(["Chromosome", "Start", "End"])
    .reset_index(drop=True)[["Chromosome", "Start", "End", "name"]]
)
# Labels that are not part of chrom_dtype_prefixed silently become NaN in the
# categorical cast above; fail loudly instead of carrying NaN chromosomes
# into the BED export and the annotation.
assert illumina_probes_curated_chrom_defined["Chromosome"].notnull().all(), (
    "manifest contains chromosome labels missing from chrom_dtype_prefixed"
)
illumina_probes_curated_chrom_defined

Unnamed: 0,Chromosome,Start,End,name
0,chr1,3005997,3005998,cg36602742_TC11
1,chr1,3006186,3006187,cg36602743_TC21
2,chr1,3035832,3035833,cg36602902_BC11
3,chr1,3062738,3062739,cg36603113_TC21
4,chr1,3102469,3102470,cg36603287_TC21
...,...,...,...,...
286635,chrMT,12086,12087,cg47407346_TC21
286636,chrMT,12865,12866,cg47407363_BC21
286637,chrMT,13343,13344,cg47407375_TC11
286638,chrMT,14354,14355,cg47407385_BC21


drop duplicate rows, remove prefix, change to alphabetic sorting order

In [95]:
# Coordinate-only BED for gtfanno: 'chr' prefix removed, chromosomes sorted
# alphabetically (plain strings), duplicate intervals dropped, no header.
_illumina_coords_no_prefix = (
    illumina_probes_curated_chrom_defined.assign(
        Chromosome=lambda probes: probes.Chromosome.astype(str).str.replace("chr", "")
    )
    .iloc[:, 0:3]
    .sort_values(["Chromosome", "Start", "End"])
    .drop_duplicates()
)
_illumina_coords_no_prefix.to_csv(
    illumina_coordinate_bed, sep="\t", header=False, index=False
)

In [96]:
!head {illumina_coordinate_bed}

1	3005997	3005998
1	3006186	3006187
1	3035832	3035833
1	3062738	3062739
1	3102469	3102470
1	3121638	3121639
1	3199333	3199334
1	3216334	3216335
1	3253526	3253527
1	3275903	3275904


##### Check against Maxis probes to see whether I have correct manifest file

This is the correct manifest file - Maxi's coordinates are shifted when the probe is on the minus strand

In [75]:
# Probes for which name AND coordinates agree between Maxi's file and the
# curated Illumina manifest.
original_probes_df_sorted.merge(
    illumina_probes_curated_chrom_defined,
    how="inner",
    on=["Chromosome", "Start", "End", "name"],
)

Unnamed: 0,Chromosome,Start,End,name
0,chr1,3035832,3035833,cg36602902_BC11
1,chr1,3121638,3121639,cg36603393_BC21
2,chr1,3514033,3514034,cg36605130_BC21
3,chr1,3526405,3526406,cg36605172_BC21
4,chr1,3548143,3548144,cg36605321_BC21
...,...,...,...,...
131877,chrY,87877338,87877339,cg48338431_BC11
131878,chrY,88188140,88188141,cg48338916_BC11
131879,chrY,90739047,90739048,cg48343391_BC11
131880,chrY,90805372,90805373,cg48344082_BC11


In [76]:
# Join on the probe name only: the joined table must have as many rows as
# Maxi's probe table; the _x/_y columns show where coordinates differ.
df = original_probes_df_sorted.merge(
    illumina_probes_curated_chrom_defined,
    how="inner",
    on=["name"],
)
display(df)
assert len(df) == len(original_probes_df_sorted)

Unnamed: 0,Chromosome_x,Start_x,End_x,name,Chromosome_y,Start_y,End_y
0,chr1,3035832,3035833,cg36602902_BC11,chr1,3035832,3035833
1,chr1,3102470,3102471,cg36603287_TC21,chr1,3102469,3102470
2,chr1,3121638,3121639,cg36603393_BC21,chr1,3121638,3121639
3,chr1,3199334,3199335,cg36603791_TC21,chr1,3199333,3199334
4,chr1,3216335,3216336,cg36603848_TC21,chr1,3216334,3216335
...,...,...,...,...,...,...,...
262159,chrY,90805372,90805373,cg48344082_BC11,chrY,90805372,90805373
262160,chrY,90806785,90806786,cg48344102_TC21,chrY,90806784,90806785
262161,chrY,90808023,90808024,cg48344138_TC21,chrY,90808022,90808023
262162,chrY,90808869,90808870,cg48344145_TC21,chrY,90808868,90808869


##### Add motif and strand

## Annotation

### Gene annotation

#### Perform annotation

In [156]:
%%time
# Annotate the de-duplicated Illumina probe coordinates against the filtered
# GENCODE GTF. Both inputs use chromosome names without the 'chr' prefix.
# NOTE(review): promoter and distant_cis_regulatory_domain are presumably
# offsets in bp relative to the TSS (negative = upstream) - confirm against
# the gtfanno documentation.
ga.annotate(
    query_bed=illumina_coordinate_bed,
    gtf_fp=gencode_coding_canonical_gtf,
    trunk_path=custom_intervals_trunk_path,
    tmpdir=temp_dir_name,
    promoter=(-1500, 500),
    distant_cis_regulatory_domain=(-100_000, 100_000),
)

Loading data
Annotating promoter regions
FIXED DCRD ANNO
Annotating transcript parts
UTR classification
Merge results
Classify annotations
Add intergenic regions
Save results
Basic stats for primary annotations
    #Primary annotations  Frequency
0                      1     235952
1                      2      20045
2                      3       8610
3                      4       5253
4                      5       3642
5                      6       2377
6                      7       1579
7                      8       1201
8                      9        913
9                     10        644
10                    11        538
11                    12        359
12                    13        312
13                    14        164
14                    15        113
15                    17         93
16                    16         60
17                    18         55
18                    21         26
19                    19         22
20                    20         

#### Inspect annotations

In [157]:
primary_annos = pd.read_pickle(custom_intervals_results_paths_d["primary_annos_p"])

In [158]:
primary_annos.shape

(414506, 20)

##### General checks

In [159]:
primary_annos.query('feat_class == "Promoter"').head(3)

Unnamed: 0,Chromosome,Start,End,gtfanno_uid,center,feat_class,perc_feature,perc_region,distance,has_center,gene_name,gene_id,transcript_id,appris_principal_score,feat_chrom,feat_start,feat_end,feat_center,feat_strand,feature_rank
32,1,3671231,3671232,24,3671231.5,Promoter,,,266.5,True,Xkr4,ENSMUSG00000051951.5,ENSMUST00000070533.4,0.0,1,3571498.0,3771498.0,,-,primary
66,1,4408881,4408882,43,4408881.5,Promoter,,,359.5,True,Rp1,ENSMUSG00000025900.13,ENSMUST00000208660.1,0.0,1,4309241.0,4509241.0,,-,primary
70,1,4409378,4409379,44,4409378.5,Promoter,,,-137.5,True,Rp1,ENSMUSG00000025900.13,ENSMUST00000208660.1,0.0,1,4309241.0,4509241.0,,-,primary


In [160]:
primary_annos.query('feat_class == "exon"').head(3)

Unnamed: 0,Chromosome,Start,End,gtfanno_uid,center,feat_class,perc_feature,perc_region,distance,has_center,gene_name,gene_id,transcript_id,appris_principal_score,feat_chrom,feat_start,feat_end,feat_center,feat_strand,feature_rank
7,1,3216334,3216335,7,3216334.0,exon,0.0,1.0,609.5,True,Xkr4,ENSMUSG00000051951.5,ENSMUST00000070533.4,0.0,1,3214482.0,3216968.0,3215724.5,-,primary
23,1,3670721,3670722,21,3670721.0,exon,0.0,1.0,-303.5,True,Xkr4,ENSMUSG00000051951.5,ENSMUST00000070533.4,0.0,1,3670552.0,3671498.0,3671024.5,-,primary
26,1,3670834,3670835,22,3670834.0,exon,0.0,1.0,-190.5,True,Xkr4,ENSMUSG00000051951.5,ENSMUST00000070533.4,0.0,1,3670552.0,3671498.0,3671024.5,-,primary


##### Multiple assignments per region

###### How is this distributed across feature classes?

In [161]:
# Per feature class: how many query regions (gtfanno_uid) received 1, 2, 3, ...
# primary annotations of that class?
#   1. size() per (feat_class, gtfanno_uid) -> annotations per region and class
#   2. value_counts() per feat_class        -> regions per annotation count
#   3. unstack()                            -> rows: feat_class, columns: count
# NaN in the result means that no region has that many annotations.
multi_annos_crosstab = (
    primary_annos.groupby(["feat_class", "gtfanno_uid"], observed=True)
    .size()
    .groupby("feat_class")
    .value_counts()
    .unstack()
)
multi_annos_crosstab

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
feat_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Promoter,56553.0,4704.0,184.0,3.0,,,,,,,...,,,,,,,,,,
5'-UTR,1003.0,4.0,,,,,,,,,...,,,,,,,,,,
3'-UTR,5284.0,89.0,,,,,,,,,...,,,,,,2.0,,,,
exon,26634.0,186.0,3.0,,,,,,1.0,,...,,,,,,,1.0,,,
intron,90028.0,1929.0,84.0,4.0,10.0,6.0,5.0,1.0,3.0,9.0,...,9.0,,1.0,,5.0,9.0,4.0,,8.0,
DCRD,21944.0,13133.0,8339.0,5246.0,3632.0,2371.0,1574.0,1200.0,909.0,635.0,...,51.0,93.0,54.0,22.0,12.0,15.0,7.0,3.0,1.0,7.0
intergenic,34506.0,,,,,,,,,,...,,,,,,,,,,


###### Example for Promoter multiple annotations - random samples indicate that these are indeed ambiguous sites

In [162]:
# Flag every row whose query interval occurs more than once in primary_annos
# (keep=False marks all members of a duplicate group).
# Note: this adds the column to primary_annos in place.
_query_interval_cols = ["Chromosome", "Start", "End"]
primary_annos["is_duplicated"] = primary_annos.duplicated(
    subset=_query_interval_cols, keep=False
)

In [163]:
df = primary_annos.query('feat_class == "Promoter" & is_duplicated')[
    ["Chromosome", "Start", "End", "gtfanno_uid", "gene_name"]
]
display(df.head(20))
display(df.tail(20))

Unnamed: 0,Chromosome,Start,End,gtfanno_uid,gene_name
238,1,4808196,4808197,96,Gm37988
239,1,4808196,4808197,96,Lypla1
244,1,4808291,4808292,97,Gm37988
245,1,4808291,4808292,97,Lypla1
1252,1,10037877,10037878,355,Cspp1
1253,1,10037877,10037878,355,Cops5
1259,1,10038004,10038005,356,Cspp1
1260,1,10038004,10038005,356,Cops5
1266,1,10038459,10038460,357,Cops5
1267,1,10038459,10038460,357,Cspp1


Unnamed: 0,Chromosome,Start,End,gtfanno_uid,gene_name
1638159,X,155624572,155624573,277148,Ptchd1
1638160,X,155624572,155624573,277148,Gm15155
1638161,X,155624668,155624669,277149,Ptchd1
1638162,X,155624668,155624669,277149,Gm15155
1641301,X,164980364,164980365,278241,Fancb
1641302,X,164980364,164980365,278241,Mospd2
1641304,X,164980482,164980483,278242,Mospd2
1641305,X,164980482,164980483,278242,Fancb
1641307,X,164980683,164980684,278243,Mospd2
1641308,X,164980683,164980684,278243,Fancb


Nsdhl
http://www.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000031349;r=X:71962163-72002120

Rpl7
http://www.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000043716;r=1:16171519-16174886

#### merge annotations

Merging strategy: keep all
- for Promoters, the window is relatively small. Ranking on TSS distance in such a small window seems arbitrary.
- for enhancer candidates, a typical strategy would be to identify all TSS in +-100 kb window and try to find the target through correlation with gene expression, eg PMID: 30686579. So it also makes sense to indicate all genes in the window to give an impression of the number of possible target genes.

In [164]:
%%time
merged_annos = lib.merge_annos(primary_annos=primary_annos)

merge unique value fields
CPU times: user 8min 9s, sys: 10.1 s, total: 8min 19s
Wall time: 7min 58s


In [174]:
merged_annos

Unnamed: 0,Chromosome,Start,End,gtfanno_uid,center,feat_class,perc_feature,perc_region,distance,has_center,...,gene_id,transcript_id,appris_principal_score,feat_chrom,feat_start,feat_end,feat_center,feat_strand,feature_rank,is_duplicated
0,1,3005997,3005998,0,,intergenic,,,,,...,,,,,,,,,primary,False
1,1,3006186,3006187,1,,intergenic,,,,,...,,,,,,,,,primary,False
2,1,3035832,3035833,2,,intergenic,,,,,...,,,,,,,,,primary,False
3,1,3062738,3062739,3,,intergenic,,,,,...,,,,,,,,,primary,False
4,1,3102469,3102470,4,,intergenic,,,,,...,,,,,,,,,primary,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282001,Y,90808022,90808023,282001,90808022.50,DCRD,,,31154.5,True,...,ENSMUSG00000096850.1,ENSMUST00000179623.1,0.0,Y,90739177.0,90939177.0,,-,primary,False
282002,Y,90808868,90808869,282002,90808868.50,DCRD,,,30308.5,True,...,ENSMUSG00000096850.1,ENSMUST00000179623.1,0.0,Y,90739177.0,90939177.0,,-,primary,False
282003,Y,90809535,90809536,282003,90809535.50,DCRD,,,29641.5,True,...,ENSMUSG00000096850.1,ENSMUST00000179623.1,0.0,Y,90739177.0,90939177.0,,-,primary,False
282004,Y,90813404,90813405,282004,90813404.50,DCRD,,,25772.5,True,...,ENSMUSG00000096850.1,ENSMUST00000179623.1,0.0,Y,90739177.0,90939177.0,,-,primary,False


In [175]:
# Switch to 'chr'-prefixed, ordered categorical chromosomes, re-sort in that
# order and drop the gtfanno-internal region id. merged_annos itself is left
# untouched (assign returns a new frame).
merged_annos_new_chrom_dtype = (
    merged_annos.assign(
        Chromosome=lambda annos: ("chr" + annos["Chromosome"].astype(str)).astype(
            chrom_dtype_prefixed
        )
    )
    .sort_values(["Chromosome", "Start", "End"])
    .drop("gtfanno_uid", axis=1)
    .reset_index(drop=True)
)

In [176]:
# Attach the probe names to the merged annotations via the interval
# coordinates. Several probes can share one coordinate, so the left join
# can yield more rows than merged_annos_new_chrom_dtype has.
_interval_cols = ["Chromosome", "Start", "End"]
merged_annos_final = merged_annos_new_chrom_dtype.merge(
    illumina_probes_curated_chrom_defined,
    how="left",
    on=_interval_cols,
)

In [177]:
merged_annos_final.head(3)

Unnamed: 0,Chromosome,Start,End,center,feat_class,perc_feature,perc_region,distance,has_center,gene_name,...,transcript_id,appris_principal_score,feat_chrom,feat_start,feat_end,feat_center,feat_strand,feature_rank,is_duplicated,name
0,chr1,3005997,3005998,,intergenic,,,,,,...,,,,,,,,primary,False,cg36602742_TC11
1,chr1,3006186,3006187,,intergenic,,,,,,...,,,,,,,,primary,False,cg36602743_TC21
2,chr1,3035832,3035833,,intergenic,,,,,,...,,,,,,,,primary,False,cg36602902_BC11


In [178]:
merged_annos_final.shape

(286640, 21)

In [179]:
illumina_probes_curated_chrom_defined

Unnamed: 0,Chromosome,Start,End,name
0,chr1,3005997,3005998,cg36602742_TC11
1,chr1,3006186,3006187,cg36602743_TC21
2,chr1,3035832,3035833,cg36602902_BC11
3,chr1,3062738,3062739,cg36603113_TC21
4,chr1,3102469,3102470,cg36603287_TC21
...,...,...,...,...
286635,chrMT,12086,12087,cg47407346_TC21
286636,chrMT,12865,12866,cg47407363_BC21
286637,chrMT,13343,13344,cg47407375_TC11
286638,chrMT,14354,14355,cg47407385_BC21


In [180]:
assert merged_annos_final["name"].notnull().all()

In [181]:
# The coordinates of the final table must match the curated Illumina probe
# table row by row (same order, same index, same dtypes).
# Start/End of the Illumina table derive from the nullable "Int64" MAPINFO
# column, so they are cast to plain int64 ("i8") for the comparison.
# NOTE(review): this presumes merged_annos_final holds plain int64 - confirm.
pd.testing.assert_frame_equal(
    merged_annos_final[["Chromosome", "Start", "End"]],
    illumina_probes_curated_chrom_defined[["Chromosome", "Start", "End"]].astype(
        {"Start": "i8", "End": "i8"}
    ),
)

In [182]:
merged_annos_final.iloc[0]

Chromosome                           chr1
Start                             3005997
End                               3005998
center                                NaN
feat_class                     intergenic
perc_feature                          nan
perc_region                           nan
distance                              nan
has_center                            nan
gene_name                             nan
gene_id                               nan
transcript_id                         nan
appris_principal_score                nan
feat_chrom                            NaN
feat_start                            nan
feat_end                              nan
feat_center                           nan
feat_strand                           nan
feature_rank                      primary
is_duplicated                       False
name                      cg36602742_TC11
Name: 0, dtype: object

#### Finalize annotation tables

In [186]:
# Write the merged gene annotations (one row per probe) as a tab-separated
# table. The first column is renamed to "#Chromosome" so the header line
# starts with '#'.
# NOTE(review): presumably so that BED-aware tools treat the header as a
# comment line - confirm.
merged_annos_final.rename(columns={"Chromosome": "#Chromosome"}).to_csv(
    gene_annos_primary_one_row, sep="\t", header=True, index=False
)

In [184]:
# Unmerged primary annotations (a probe interval can span several rows):
# drop the gtfanno-internal region id, switch to 'chr'-prefixed ordered
# categorical chromosomes and sort in that order.
primary_annos_final = (
    primary_annos.drop(columns="gtfanno_uid")
    .assign(
        Chromosome=lambda annos: ("chr" + annos["Chromosome"].astype(str)).astype(
            chrom_dtype_prefixed
        )
    )
    .sort_values(by=["Chromosome", "Start", "End"])
    .reset_index(drop=True)
)

In [187]:
# Write the unmerged primary annotations (possibly several rows per probe
# interval) as a tab-separated table; the header line starts with '#' because
# the first column is renamed to "#Chromosome".
primary_annos_final.rename(columns={"Chromosome": "#Chromosome"}).to_csv(
    gene_annos_primary_multi_row, sep="\t", header=True, index=False
)

In [188]:
!head {gene_annos_primary_multi_row}

#Chromosome	Start	End	center	feat_class	perc_feature	perc_region	distance	has_center	gene_name	gene_id	transcript_id	appris_principal_score	feat_chrom	feat_start	feat_end	feat_center	feat_strand	feature_rank	is_duplicated
chr1	3005997	3005998		intergenic														primary	False
chr1	3006186	3006187		intergenic														primary	False
chr1	3035832	3035833		intergenic														primary	False
chr1	3062738	3062739		intergenic														primary	False
chr1	3102469	3102470		intergenic														primary	False
chr1	3121638	3121639		intergenic														primary	False
chr1	3199333	3199334		intergenic														primary	False
chr1	3216334	3216335	3216334.0	exon	0.00040225261464199515	1.0	609.5	True	Xkr4	ENSMUSG00000051951.5	ENSMUST00000070533.4	0.0	1	3214482.0	3216968.0	3215724.5	-	primary	False
chr1	3253526	3253527	3253526.0	intron	2.18810719974793e-06	1.0	-189463.5	True	Xkr4	ENSMUSG00000051951.5	ENSMUST00000070533.4	0.0	1	3214482.0	3671498.0	3442989.5	-	primary	False


### CpG island annotations

In [141]:
# Classify the probes relative to CpG islands (region name, signed distance).
# NOTE(review): this runs on original_probes_df_sorted (Maxi's probe file),
# whereas the gene annotation was computed for
# illumina_probes_curated_chrom_defined. The two tables differ in row count
# (262,163 vs 286,640) and, for part of the probes, in coordinates (see the
# name-based merge in the manifest check), so the result cannot be joined to
# merged_annos_final on Chromosome/Start/End as-is - confirm which probe
# table is intended here.
cpg_island_classif_df = lib.classify_cpg_island_overlap(
    granges_df=original_probes_df_sorted,
    cpg_islands_pickle_d=cpg_islands_pickle_d,
)
cpg_island_classif_df.head(3)

Unnamed: 0,Chromosome,Start,End,name,region_name,distance_signed,north_south_of_island
0,chr1,3035832,3035833,cg36602902_BC11,open sea,-495792,
1,chr1,3102470,3102471,cg36603287_TC21,open sea,-429154,
2,chr1,3121638,3121639,cg36603393_BC21,open sea,-409986,


### Merge all annotations

In [None]:
# TODO: the merge of the CpG island table with the gene annotation table is
# not implemented yet. Only the last expression of a cell is displayed, so
# the first line below has no visible effect.
cpg_island_classif_df
merged_annos_final

# End