### Download the following utilities
- http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bigWigToBedGraph
- http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/hgWiggle
- bedtools
- bedops
- pandas (Python package)

### Download the following annotations
- https://public.hoffman2.idre.ucla.edu/ernst/R0RG6/LECIF/hg19.LECIFv1.1.bw
- https://public.hoffman2.idre.ucla.edu/ernst/ZHYRB/CNEP/cnep.bw
- https://public.hoffman2.idre.ucla.edu/ernst/ZHYRB/CNEP/css_cnep.bw 
- http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons.bw 
- http://hgdownload.soe.ucsc.edu/gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
- http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw
- http://hgdownload.soe.ucsc.edu/gbdb/hg19/multiz46way/phyloP46wayPlacental.wib
- https://public.hoffman2.idre.ucla.edu/ernst/1G6UT/hg19_genome_100_segments.bed.gz
- https://github.com/shorvath/MammalianMethylationConsortium/blob/main/Annotations%2C%20Amin%20Haghani/Manifest%2C%20HorvathMammalMethylChip40.csv.zip (unzip it to obtain `Manifest, HorvathMammalMethylChip40.csv`, which is the file read below)

### Download Roadmap epigenome ID annotation files

In [None]:
%%bash
# Fetch the two Roadmap epigenome ID lists that the ChromHMM mapping cells
# iterate over.
#   -f  fail on an HTTP error instead of saving the server's error page as
#       the ID list
#   -S  still print the error message even though -s silences progress output
set -euo pipefail
curl -fsS https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/imputed12marks/jointModel/final/EIDlegend.txt > EIDlegend_RoadmapChromHMM.txt
curl -fsS https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/core_K27ac/jointModel/analyses/src/pairs/k27ac.list > EID_RoadmapExpandedChromHMM.txt

### Write a bed file with all hg19 coordinates

In [None]:
import pandas as pd

# Build a 4-column, tab-separated BED file (chromosome, start, end, probe ID)
# from the array manifest. Each probe becomes a 1-bp interval [start, start+1).
input_file_path = "Manifest, HorvathMammalMethylChip40.csv"
manifest_columns = ['IlmnID', "Human.Hg19_CGstart", "Human.Hg19_CGend", "Human.Hg19_seqnames"]
bed_columns = ['Human.Hg19_seqnames', 'start', 'end', 'IlmnID']

df = pd.read_csv(input_file_path, usecols=manifest_columns)
# Drop probes missing any of the four manifest fields, then order the rest by
# chromosome name and start position.
df = df.dropna()
df = df.sort_values(by=['Human.Hg19_seqnames', 'Human.Hg19_CGstart'])

df = df.assign(start=df['Human.Hg19_CGstart'].astype(int))
df = df.assign(end=df['start'] + 1)

df[bed_columns].to_csv('hg19_coord.bed', sep='\t', header=False, index=False)

### Convert bigWig files to gzipped bed files

In [None]:
%%bash

# Convert each downloaded bigWig annotation into a gzipped bedGraph file that
# the `bedtools map` cells read.
set -euo pipefail

# convert_bigwig BIGWIG BED
# Writes BED with bigWigToBedGraph, then compresses it in place to BED.gz.
convert_bigwig() {
  local bigwig=$1
  local bed=$2
  ./bigWigToBedGraph "$bigwig" "$bed"
  # -f replaces a BED.gz left behind by an earlier run; without it gzip
  # refuses to overwrite and the stale file would be kept.
  gzip -f "$bed"
}

# The LECIF bigWig is downloaded as hg19.LECIFv1.1.bw; the output keeps the
# hg19.mm10.LECIF name that the LECIF mapping cell expects.
convert_bigwig hg19.LECIFv1.1.bw hg19.mm10.LECIF.bed

convert_bigwig cnep.bw hg19.CNEP.bed

convert_bigwig css_cnep.bw hg19.CSS-CNEP.bed

convert_bigwig hg19.100way.phastCons.bw hg19.100way.phastCons.bed

convert_bigwig hg19.100way.phyloP100way.bw hg19.phyloP100way.bed

### Map LECIF score

In [None]:
# For each probe interval, report the mean of column 4 over the overlapping LECIF intervals.
! bedtools map -c 4 -o mean -a hg19_coord.bed -b hg19.mm10.LECIF.bed.gz > hg19_coord.LECIF.bed

### Map CNEP score

In [None]:
# For each probe interval, report the mean of column 4 over the overlapping CNEP intervals.
! bedtools map -c 4 -o mean -a hg19_coord.bed -b hg19.CNEP.bed.gz > hg19_coord.CNEP.bed

### Map CSS-CNEP score

In [None]:
# For each probe interval, report the mean of column 4 over the overlapping CSS-CNEP intervals.
! bedtools map -c 4 -o mean -a hg19_coord.bed -b hg19.CSS-CNEP.bed.gz > hg19_coord.CSS-CNEP.bed

### Map PhastCons score

In [None]:
# For each probe interval, report the mean of column 4 over the overlapping 100-way phastCons intervals.
! bedtools map -c 4 -o mean -a hg19_coord.bed -b hg19.100way.phastCons.bed.gz > hg19_coord.PhastCons.bed

### Map placental-mammal PhastCons score (46-way)

In [None]:
%%bash
# Pull the phastCons46wayPlacental track at the probe positions with hgWiggle,
# convert the wiggle output to BED with wig2bed, then attach the mean of
# column 5 of that BED file to every probe interval.
track=phastCons46wayPlacental
stem=hg19.PhastConsPlacental

./hgWiggle -db=hg19 -bedFile=hg19_coord.bed -lift=1 "$track" > "$stem.wig"
wig2bed < "$stem.wig" > "$stem.bed"
bedtools map -c 5 -o mean -a hg19_coord.bed -b "$stem.bed" > hg19_coord.PhastConsPlacental.bed

### Map PhyloP score

In [None]:
# For each probe interval, report the mean of column 4 over the overlapping 100-way phyloP intervals.
# The input is hg19.phyloP100way.bed.gz, the name written by the bigWig conversion cell.
! bedtools map -a hg19_coord.bed -b hg19.phyloP100way.bed.gz -c 4 -o mean > hg19_coord.PhyloP.bed

### Map placental-mammal PhyloP score (46-way)

In [None]:
%%bash
# Pull the phyloP46wayPlacental track at the probe positions with hgWiggle,
# convert the wiggle output to BED with wig2bed, then attach the mean of
# column 5 of that BED file to every probe interval.
track=phyloP46wayPlacental
stem=hg19.PhyloPPlacental

./hgWiggle -db=hg19 -bedFile=hg19_coord.bed -lift=1 "$track" > "$stem.wig"
wig2bed < "$stem.wig" > "$stem.bed"
bedtools map -c 5 -o mean -a hg19_coord.bed -b "$stem.bed" > hg19_coord.PhyloPPlacental.bed

### Map ConsHMM states

In [None]:
%%bash
# Append one ConsHMM state column per chromosome (chr1..chr22, chrX, chrY) to
# hg19_coord.ConsHMM.bed. Each pass streams that chromosome's segmentation
# and maps the first overlapping value of its column 4 onto every probe.
#
# Strict mode plus `curl -f` abort the cell on a failed download or pipeline
# stage. Otherwise the broken tmp.bed would still be moved over the
# accumulated file and every later pass would build on it.
set -euo pipefail
path_prefix=https://ernst.cass.idre.ucla.edu/public/ConsHMM/Segmentations/hg19_multiz100way/

cp hg19_coord.bed hg19_coord.ConsHMM.bed
for chrom in {1..22} X Y; do
  curl -fsS "$path_prefix/chr${chrom}/chr${chrom}_segmentation.bed.gz" \
    | gzip -cd \
    | bedtools map -a hg19_coord.ConsHMM.bed -b - -c 4 -o first > tmp.bed
  mv tmp.bed hg19_coord.ConsHMM.bed
done

In [None]:
%%bash
# Collapse columns 5..NF of hg19_coord.ConsHMM.bed into a single column that
# holds the largest value in the row under awk's `>` comparison. The first
# four fields are kept as they are.
# NOTE(review): presumably only the probe's own chromosome yields a real state
# and the other columns are bedtools' "." placeholder - confirm.
awk -v OFS="\t" '
{
  best = $5
  for (col = 5; col <= NF; col++) {
    if ($col > best) best = $col
  }
  print $1, $2, $3, $4, best
}' hg19_coord.ConsHMM.bed > tmp.bed
mv tmp.bed hg19_coord.ConsHMM.bed

### Map 100-state universal ChromHMM model states

In [None]:
%%bash
# Decompress the 100-state segmentation, sort it by chromosome and then by
# numeric start, and attach the first overlapping value of column 4 to every
# probe interval.
gzip -dc hg19_genome_100_segments.bed.gz \
  | sort -k1,1 -k2,2n \
  | bedtools map -c 4 -o first -a hg19_coord.bed -b - > hg19_coord.universalChromHMM.bed

### Map core 15-state ChromHMM model states

In [None]:
%%bash
# Append one 15-state (core marks) ChromHMM column per epigenome to
# hg19_coord.coreChromHMM.bed. Epigenome IDs come from the first
# tab-separated column of EIDlegend_RoadmapChromHMM.txt, in file order.
#
# The final combine step names these columns by position, so every pass must
# add exactly one column. Strict mode plus `curl -f` abort on a failed
# download or pipeline stage instead of moving a broken tmp.bed into place.
set -euo pipefail
path_prefix=https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/
output_filename=hg19_coord.coreChromHMM.bed

cp hg19_coord.bed "$output_filename"
# Word splitting of the command substitution is intentional: one ID per word.
for eid in $(cut -f 1 EIDlegend_RoadmapChromHMM.txt); do
  curl -fsS "$path_prefix/${eid}_15_coreMarks_mnemonics.bed.gz" \
    | gzip -cd \
    | sort -k1,1 -k2,2n \
    | bedtools map -a "$output_filename" -b - -c 4 -o first > tmp.bed
  mv tmp.bed "$output_filename"
done

### Map expanded 18-state ChromHMM model states

In [None]:
%%bash
# Append one 18-state (core marks + K27ac) ChromHMM column per epigenome to
# hg19_coord.expandedChromHMM.bed. Epigenome IDs come from the first
# tab-separated column of EID_RoadmapExpandedChromHMM.txt, in file order.
#
# The final combine step names these columns by position, so every pass must
# add exactly one column. Strict mode plus `curl -f` abort on a failed
# download or pipeline stage instead of moving a broken tmp.bed into place.
set -euo pipefail
path_prefix=https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/core_K27ac/jointModel/final/
output_filename=hg19_coord.expandedChromHMM.bed

cp hg19_coord.bed "$output_filename"
# Word splitting of the command substitution is intentional: one ID per word.
for eid in $(cut -f 1 EID_RoadmapExpandedChromHMM.txt); do
  curl -fsS "$path_prefix/${eid}_18_core_K27ac_mnemonics.bed.gz" \
    | gzip -cd \
    | sort -k1,1 -k2,2n \
    | bedtools map -a "$output_filename" -b - -c 4 -o first > tmp.bed
  mv tmp.bed "$output_filename"
done

### Map imputed 25-state ChromHMM model states

In [None]:
%%bash
# Append one 25-state (imputed 12 marks) ChromHMM column per epigenome to
# hg19_coord.imputedChromHMM.bed. Epigenome IDs come from the first
# tab-separated column of EIDlegend_RoadmapChromHMM.txt, in file order.
#
# The final combine step names these columns by position, so every pass must
# add exactly one column. Strict mode plus `curl -f` abort on a failed
# download or pipeline stage instead of moving a broken tmp.bed into place.
set -euo pipefail
path_prefix=https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/imputed12marks/jointModel/final/
output_filename=hg19_coord.imputedChromHMM.bed

cp hg19_coord.bed "$output_filename"
# Word splitting of the command substitution is intentional: one ID per word.
for eid in $(cut -f 1 EIDlegend_RoadmapChromHMM.txt); do
  curl -fsS "$path_prefix/${eid}_25_imputed12marks_mnemonics.bed.gz" \
    | gzip -cd \
    | sort -k1,1 -k2,2n \
    | bedtools map -a "$output_filename" -b - -c 4 -o first > tmp.bed
  mv tmp.bed "$output_filename"
done

### Combine them into a formatted csv file

In [None]:
import pandas as pd

# Left-join every per-annotation BED file onto the probe coordinates and write
# the combined table to hg19_annotErnstLab.csv.
coord_filename = 'hg19_coord.bed'
bed_filenames = ['hg19_coord.LECIF.bed',
                 'hg19_coord.CNEP.bed',
                 'hg19_coord.CSS-CNEP.bed',
                 'hg19_coord.PhastCons.bed',
                 'hg19_coord.PhastConsPlacental.bed',
                 'hg19_coord.PhyloP.bed',
                 'hg19_coord.PhyloPPlacental.bed',
                 'hg19_coord.ConsHMM.bed',
                 'hg19_coord.universalChromHMM.bed',
                 'hg19_coord.coreChromHMM.bed',
                 'hg19_coord.expandedChromHMM.bed',
                 'hg19_coord.imputedChromHMM.bed']

position_col_names = ['chrom','start','end','probeID']

# Rows of (epigenome ID, epigenome name) from the legend file, plus an
# ID -> name lookup built from the same rows.
chromhmm_eid_all = pd.read_table('EIDlegend_RoadmapChromHMM.txt',header=None).values
chromhmm_eid_all_dict = dict(chromhmm_eid_all)
# The `squeeze=True` reader argument was removed in pandas 2.0. Select the
# first column explicitly instead; this also matches the `cut -f 1` used when
# the per-epigenome columns were generated.
chromhmm_eid_subset = pd.read_table('EID_RoadmapExpandedChromHMM.txt',header=None).iloc[:,0].tolist()

core_chromhmm_col_name_prefix = 'ChromHMMState_Roadmap5Mark127Epigenome15State_ErnstKellis2012NatureMethods_RoadmapEpigenomicsConsortium2015Nature_'
expanded_chromhmm_col_name_prefix = 'ChromHMMState_Roadmap6Mark98Epigenome18State_ErnstKellis2012NatureMethods_RoadmapEpigenomicsConsortium2015Nature_'
imputed_chromhmm_col_name_prefix = 'ChromHMMState_Roadmap12Mark127Epigenome25State_ErnstKellis2012NatureMethods_RoadmapEpigenomicsConsortium2015Nature_'

# One column name per epigenome: <prefix><EID>_<epigenome name without spaces>.
# The order follows the ID files, i.e. the order in which the columns were
# appended to the ChromHMM BED files.
core_chromhmm_col_names = [core_chromhmm_col_name_prefix+eid+'_'+label.replace(' ','')
                           for eid, label in chromhmm_eid_all]
expanded_chromhmm_col_names = [expanded_chromhmm_col_name_prefix+eid+'_'+chromhmm_eid_all_dict[eid].replace(' ','')
                               for eid in chromhmm_eid_subset]
imputed_chromhmm_col_names = [imputed_chromhmm_col_name_prefix+eid+'_'+label.replace(' ','')
                              for eid, label in chromhmm_eid_all]

# names[k] is the full header for bed_filenames[k]: the four position columns
# followed by that file's annotation column(s).
names = [position_col_names + n for n in [['LECIFScore_HumanMouse_KwonErnst2021NatureCommunications'],
                                          ['CNEPScore_GrujicEtAl2020NatureCommunications'],
                                          ['CSS-CNEPScore_GrujicEtAl2020NatureCommunications'],
                                          ['PhastConsScore_SiepelEtAl2005GenomeResearch'],
                                          ['PhastConsPlacentalScore_SiepelEtAl2005GenomeResearch'],
                                          ['PhyloPScore_PollardEtAl2009GenomeResearch'],
                                          ['PhyloPPlacentalScore_PollardEtAl2009GenomeResearch'],
                                          ['ConsHMMState_Multiz100way_ArnesonErnst2019CommunicationsBiology'],
                                          ['ChromHMMState_VuErnst2022GenomeBiology'],
                                          core_chromhmm_col_names,
                                          expanded_chromhmm_col_names,
                                          imputed_chromhmm_col_names]]

df = pd.read_table(coord_filename,header=None,names=position_col_names)
for bed_filename, col_names in zip(bed_filenames, names):
    input_df = pd.read_table(bed_filename,header=None,names=col_names)
    # A left join keeps every probe from the coordinate file even when an
    # annotation file has no matching row.
    df = df.merge(input_df,on=position_col_names,how='left')

df.to_csv('hg19_annotErnstLab.csv',index=False)