Taken from Parada 2021 Additional file 2: Table S1.

https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-020-02246-2/MediaObjects/13059_2020_2246_MOESM2_ESM.tsv


High confidence list of detected microexons. 
Here we report all of the high confidence microexons detected by MicroExonator from mouse bulk and scRNA-seq. 
The table also includes information about the downstream analyses. 

Columns PC1-3 summarize the PPCA results; In.10_percent_of_bulk indicates if the microexon was supported by >5 reads in >10% of bulk RNA-seq samples; MHN/F.diff columns indicate if they were found to be alternatively included in these sample groups at any of the time points that were compared with the control group. 


For the microexons that remained differentially included during brain development MHN/F.change_dir and MHN/F.diff_age indicate the direction of inclusion and the embryonic stage since they started to be detected as differentially included.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pbt
import seaborn as sns
sys.path.append("/wynton/home/ahituv/fongsl/tools/py_")
import config_readwrite as crw

In [1]:
config_tag = "config.ini"
config, configfile_name = crw.read_config(os.path.join(os.getcwd(), config_tag))


ME_MOUSE = config["mouse"]["microexons"]
MOUSE_PATH ="/wynton/home/ahituv/fongsl/microexons/data/mouse/"
ME_MOUSE_FILTERED = os.path.join(MOUSE_PATH, "mouse_dev_microexons_In.10_percent_of_bulk.bed")

# write filtered output file to config
config["mouse"]["path"] = MOUSE_PATH
config["mouse"]["microexons_filtered"] = ME_MOUSE_FILTERED
crw.write_config(config, configfile_name)

In [2]:
df = pd.read_csv(ME_MOUSE, sep = '\t')

print(df.shape)
df.head()

(2987, 33)


Unnamed: 0,ME,transcript,sum_total_coverage,total_SJs,total_coverages,len_micro_exon_seq_found,micro_exon_seq_found,total_number_of_micro_exons_matches,U2_scores,mean_conservations_vertebrates,...,In.10_percent_of_bulk,MHN.diff,F.diff,HNM.diff_age,HNM.change_dir,F.diff_age,F.change_dir,SKM.diff,Heart.diff,AG.diff
0,chr10_+_101864522_101864547,ENSMUST00000146230.7,18,chr10:101864030+102011732,18,25,ATATCCATGTGATGGGATTACACAG,1,84.618868,-0.0601724101038,...,True,False,False,,,,,False,False,False
1,chr10_+_101889302_101889318,ENSMUST00000146230.7,30,chr10:101864030+102011732,30,16,AAGAACAACAGAATGG,1,84.675354,,...,True,False,False,,,,,False,False,False
2,chr10_+_102380225_102380232,ENSMUST00000179929.7,160,chr10:102378304+102385005,160,7,TCCTGTG,1,76.279252,0.405090905049,...,True,False,False,,,,,False,False,False
3,chr10_+_106859776_106859806,ENSMUST00000029404.16,0,chr10:106858398+106863253,0,30,CCAAGTGATCTAAGGAAACATCGGAGAAAG,1,76.427509,4.7085882548,...,True,True,False,,,,,False,False,False
4,chr10_+_106882735_106882744,ENSMUST00000029404.16,799,chr10:106869972+106893431,799,9,CACATCGGG,1,73.777136,3.38323080998,...,True,True,False,13.5,-1.0,,,False,False,False


# filter for microexons supported by >5 reads and 10% of bulk RNA-seq 

In [3]:
fdf = df.loc[df["In.10_percent_of_bulk"]==True].copy()

# make bed fields
fdf["#chr"] = fdf["ME"].apply(lambda x: x.split("_")[0])
fdf["strand"] = fdf["ME"].apply(lambda x: x.split("_")[1])
fdf["start"] = fdf["ME"].apply(lambda x: x.split("_")[2])
fdf["end"] = fdf["ME"].apply(lambda x: x.split("_")[3])

fdf.head()

Unnamed: 0,ME,transcript,sum_total_coverage,total_SJs,total_coverages,len_micro_exon_seq_found,micro_exon_seq_found,total_number_of_micro_exons_matches,U2_scores,mean_conservations_vertebrates,...,HNM.change_dir,F.diff_age,F.change_dir,SKM.diff,Heart.diff,AG.diff,#chr,strand,start,end
0,chr10_+_101864522_101864547,ENSMUST00000146230.7,18,chr10:101864030+102011732,18,25,ATATCCATGTGATGGGATTACACAG,1,84.618868,-0.0601724101038,...,,,,False,False,False,chr10,+,101864522,101864547
1,chr10_+_101889302_101889318,ENSMUST00000146230.7,30,chr10:101864030+102011732,30,16,AAGAACAACAGAATGG,1,84.675354,,...,,,,False,False,False,chr10,+,101889302,101889318
2,chr10_+_102380225_102380232,ENSMUST00000179929.7,160,chr10:102378304+102385005,160,7,TCCTGTG,1,76.279252,0.405090905049,...,,,,False,False,False,chr10,+,102380225,102380232
3,chr10_+_106859776_106859806,ENSMUST00000029404.16,0,chr10:106858398+106863253,0,30,CCAAGTGATCTAAGGAAACATCGGAGAAAG,1,76.427509,4.7085882548,...,,,,False,False,False,chr10,+,106859776,106859806
4,chr10_+_106882735_106882744,ENSMUST00000029404.16,799,chr10:106869972+106893431,799,9,CACATCGGG,1,73.777136,3.38323080998,...,-1.0,,,False,False,False,chr10,+,106882735,106882744


## write filtered .bed

In [4]:
filtered_df = fdf[["#chr", "start", "end",  "ME", "strand", "transcript" ]].drop_duplicates()
filtered_df.sort_values(by=["#chr", "start", "end"]).to_csv(ME_MOUSE_FILTERED, sep='\t', index=False)

In [9]:
filtered_df.shape

(2599, 6)

# liftover mouse microexons from MM10 -> hg38 

In [17]:
# add liftOver script to config
liftOver = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"
crw.check_section(config, "src")
config["src"]["liftover"] = liftOver
LIFTED = "/wynton/home/ahituv/fongsl/microexons/data/mouse/mouse_dev_microexons_In.liftOver.to.Hg38.bed"
config["mouse"]["lifted_over"]=LIFTED
crw.write_config(config, configfile_name)

# do the liftover
FROM = "mm10"
TO = "Hg38"
cmd = f"python {liftOver} -b {ME_MOUSE_FILTERED} -f {FROM} -t {TO}"
os.chdir("../data/")
os.system(cmd)

lifting over /wynton/home/ahituv/fongsl/microexons/data/mouse/mouse_dev_microexons_In.10_percent_of_bulk.bed from, to mm10 Hg38 in /wynton/home/ahituv/fongsl/microexons/data/mouse
Sorting .bed /wynton/home/ahituv/fongsl/microexons/data/mouse/temp_mouse_dev_microexons_In.bed
lifted this already?
cleaned up temp file


0

## Liftover results
- 2230 regions liftOver
- 369 regions do not

# human - mouse microexons = human-specific gains/ mouse-specific losses

In [13]:
ME = config["microexon"]["threedb"]


In [14]:
import pybedtools as pbt

In [18]:
hu_me=pbt.BedTool(ME)
lifted_mouse = pbt.BedTool(LIFTED)

In [26]:
conserved = hu_me.intersect(lifted_mouse, wa=True)
subtract = hu_me.intersect(lifted_mouse, v=True, wa=True)
for i, c in enumerate(subtract):
    print(i, c[4])
    # 2245 microexons are conserved

0 microexon
1 microexon
2 microexon
3 microexon
4 microexon
5 microexon
6 microexon
7 microexon
8 microexon
9 microexon
10 microexon
11 microexon
12 microexon
13 microexon
14 microexon
15 microexon
16 microexon
17 microexon
18 microexon
19 microexon
20 microexon
21 microexon
22 microexon
23 microexon
24 microexon
25 microexon
26 microexon
27 microexon
28 microexon
29 microexon
30 microexon
31 microexon
32 microexon
33 microexon
34 microexon
35 microexon
36 microexon
37 microexon
38 microexon
39 microexon
40 microexon
41 microexon
42 microexon
43 microexon
44 microexon
45 microexon
46 microexon
47 microexon
48 microexon
49 microexon
50 microexon
51 microexon
52 microexon
53 microexon
54 microexon
55 microexon
56 microexon
57 microexon
58 microexon
59 microexon
60 microexon
61 microexon
62 microexon
63 microexon
64 microexon
65 microexon
66 microexon
67 microexon
68 microexon
69 microexon
70 microexon
71 microexon
72 microexon
73 microexon
74 microexon
75 microexon
76 microexon
77 microe

15351 dsINT
15352 dsINT
15353 dsINT
15354 dsINT
15355 dsINT
15356 dsINT
15357 dsINT
15358 dsINT
15359 dsINT
15360 dsINT
15361 dsINT
15362 dsINT
15363 dsINT
15364 dsINT
15365 dsINT
15366 microexon
15367 microexon
15368 microexon
15369 microexon
15370 microexon
15371 microexon
15372 microexon
15373 microexon
15374 microexon
15375 microexon
15376 microexon
15377 microexon
15378 microexon
15379 microexon
15380 microexon
15381 microexon
15382 microexon
15383 microexon
15384 microexon
15385 microexon
15386 microexon
15387 microexon
15388 microexon
15389 microexon
15390 microexon
15391 microexon
15392 microexon
15393 microexon
15394 microexon
15395 microexon
15396 microexon
15397 microexon
15398 microexon
15399 microexon
15400 microexon
15401 microexon
15402 microexon
15403 microexon
15404 microexon
15405 microexon
15406 microexon
15407 microexon
15408 microexon
15409 microexon
15410 microexon
15411 microexon
15412 microexon
15413 microexon
15414 microexon
15415 microexon
15416 microexon
1541

In [25]:
c[4]

'dsINT'

# mouse - human microexons =  human-specific losses/ mouse-specific gains

# human/mouse overlap = microexon conservation